From b22f3f647517b6a2fd5bef57a3532025af0b8984 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Tue, 7 Jan 2025 21:02:35 -0800 Subject: [PATCH 001/248] Fix nightly accuracy tests (#2780) --- python/sglang/test/test_utils.py | 2 +- test/srt/run_suite.py | 3 +- test/srt/test_nightly_gsm8k_eval.py | 49 +++++++++++++++------------- test/srt/test_nightly_human_eval.py | 2 +- test/srt/test_skip_tokenizer_init.py | 6 ++-- 5 files changed, 33 insertions(+), 29 deletions(-) diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py index cd21c896a044..4121deb17cc7 100644 --- a/python/sglang/test/test_utils.py +++ b/python/sglang/test/test_utils.py @@ -36,7 +36,7 @@ DEFAULT_MLA_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8" DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 600 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it" -DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct" +DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct" DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8,neuralmagic/Mistral-7B-Instruct-v0.3-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8,neuralmagic/gemma-2-2b-it-FP8" DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8,neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8,neuralmagic/Qwen2-72B-Instruct-FP8,neuralmagic/Qwen2-57B-A14B-Instruct-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8" DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4,hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4" diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index 83d2e90a43a9..2c1750d363ce 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -49,8 +49,7 @@ ], "nightly": [ "test_nightly_gsm8k_eval.py", - "test_nightly_human_eval.py", - # Disable temporarly + # Disable temporarily # "test_nightly_math_eval.py", ], "sampling/penaltylib": glob.glob( diff --git a/test/srt/test_nightly_gsm8k_eval.py b/test/srt/test_nightly_gsm8k_eval.py index 7e23b721e433..7820f6825a9c 100644 --- a/test/srt/test_nightly_gsm8k_eval.py +++ b/test/srt/test_nightly_gsm8k_eval.py @@ -1,6 +1,5 @@ import json import os -import subprocess import unittest import warnings from datetime import datetime @@ -16,24 +15,26 @@ DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2, DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, DEFAULT_URL_FOR_TEST, + is_in_ci, popen_launch_server, + write_github_step_summary, ) MODEL_SCORE_THRESHOLDS = { - "meta-llama/Llama-3.1-8B-Instruct": 0.83, + "meta-llama/Llama-3.1-8B-Instruct": 0.82, "mistralai/Mistral-7B-Instruct-v0.3": 0.58, - "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": 0.84, + "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": 0.85, "google/gemma-2-27b-it": 0.92, - "meta-llama/Llama-3.1-70B-Instruct": 0.96, - "mistralai/Mixtral-8x7B-Instruct-v0.1": 0.63, - "Qwen/Qwen2-57B-A14B-Instruct": 0.87, - "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8": 0.84, + "meta-llama/Llama-3.1-70B-Instruct": 0.95, + "mistralai/Mixtral-8x7B-Instruct-v0.1": 0.64, + "Qwen/Qwen2-57B-A14B-Instruct": 0.88, + "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8": 0.83, "neuralmagic/Mistral-7B-Instruct-v0.3-FP8": 0.54, - 
"neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8": 0.83, + "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8": 0.84, "neuralmagic/gemma-2-2b-it-FP8": 0.60, - "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8": 0.95, - "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8": 0.61, - "neuralmagic/Qwen2-72B-Instruct-FP8": 0.95, + "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8": 0.94, + "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8": 0.62, + "neuralmagic/Qwen2-72B-Instruct-FP8": 0.94, "neuralmagic/Qwen2-57B-A14B-Instruct-FP8": 0.82, "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4": 0.84, "hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4": 0.83, @@ -67,7 +68,6 @@ def launch_server(base_url, model, is_fp8, is_tp2): base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=other_args, - return_stdout_stderr=(subprocess.DEVNULL, subprocess.DEVNULL), ) return process @@ -99,6 +99,9 @@ def write_results_to_json(model, metrics, mode="a"): def check_model_scores(results): failed_models = [] + summary = " | model | score | threshold |\n" + summary += "| ----- | ----- | --------- |\n" + for model, score in results: threshold = MODEL_SCORE_THRESHOLDS.get(model) if threshold is None: @@ -111,11 +114,19 @@ def check_model_scores(results): f"Model {model} score ({score:.4f}) is below threshold ({threshold:.4f})" ) + line = f"| {model} | {score} | {threshold} |\n" + summary += line + + print(summary) + + if is_in_ci(): + write_github_step_summary(f"### TestNightlyGsm8KEval\n{summary}") + if failed_models: raise AssertionError("\n".join(failed_models)) -class TestEvalAccuracyLarge(unittest.TestCase): +class TestNightlyGsm8KEval(unittest.TestCase): @classmethod def setUpClass(cls): cls.model_groups = [ @@ -127,13 +138,6 @@ def setUpClass(cls): ] cls.base_url = DEFAULT_URL_FOR_TEST - def setUp(self): - self.process = None - - def tearDown(self): - if self.process: - kill_process_tree(self.process.pid) - def test_mgsm_en_all_models(self): warnings.filterwarnings( "ignore", category=ResourceWarning, message="unclosed.*socket" @@ -144,7 +148,7 @@ def test_mgsm_en_all_models(self): for model_group, is_fp8, is_tp2 in self.model_groups: for model in model_group: with self.subTest(model=model): - self.process = launch_server(self.base_url, model, is_fp8, is_tp2) + process = launch_server(self.base_url, model, is_fp8, is_tp2) args = SimpleNamespace( base_url=self.base_url, @@ -163,8 +167,7 @@ def test_mgsm_en_all_models(self): is_first = False all_results.append((model, metrics["score"])) - - self.tearDown() + kill_process_tree(process.pid) try: with open("results.json", "r") as f: diff --git a/test/srt/test_nightly_human_eval.py b/test/srt/test_nightly_human_eval.py index bffe214b5deb..0b682937a825 100644 --- a/test/srt/test_nightly_human_eval.py +++ b/test/srt/test_nightly_human_eval.py @@ -18,7 +18,7 @@ ) -class TestEvalAccuracyLarge(unittest.TestCase): +class TestNightlyHumanEval(unittest.TestCase): @classmethod def setUpClass(cls): if is_in_ci(): diff --git a/test/srt/test_skip_tokenizer_init.py b/test/srt/test_skip_tokenizer_init.py index bc99b23ad581..eef033ea98cb 100644 --- a/test/srt/test_skip_tokenizer_init.py +++ b/test/srt/test_skip_tokenizer_init.py @@ -55,8 +55,10 @@ def run_decode(self, return_logprob=False, top_logprobs_num=0, n=1): print(json.dumps(ret)) def assert_one_item(item): - assert len(item["token_ids"]) == item["meta_info"]["completion_tokens"] - assert len(item["token_ids"]) == max_new_tokens + self.assertEqual( + len(item["token_ids"]), item["meta_info"]["completion_tokens"] + ) + 
self.assertEqual(len(item["token_ids"]), max_new_tokens) assert item["meta_info"]["prompt_tokens"] == len(input_ids) if return_logprob: From 694e41925e6698829b5f24381ec3957429eb4701 Mon Sep 17 00:00:00 2001 From: JJJJOHNSON Date: Wed, 8 Jan 2025 13:46:02 +0800 Subject: [PATCH 002/248] [eagle2] fix end check when target model verify (#2723) --- python/sglang/srt/speculative/eagle_utils.py | 50 ++++++++++++-------- test/srt/test_eagle_infer.py | 29 ++++++++++++ 2 files changed, 60 insertions(+), 19 deletions(-) diff --git a/python/sglang/srt/speculative/eagle_utils.py b/python/sglang/srt/speculative/eagle_utils.py index a6fcf2e570df..88c88c0724f4 100644 --- a/python/sglang/srt/speculative/eagle_utils.py +++ b/python/sglang/srt/speculative/eagle_utils.py @@ -550,8 +550,37 @@ def verify(self, batch: ScheduleBatch, logits_output: torch.Tensor) -> torch.Ten triton.next_power_of_2(max_draft_len), ) - accept_index = accept_index[accept_index != -1] + draft_input = EAGLEDraftInput() + new_accept_index = [] + unfinished_index = [] + finished_extend_len = {} # {rid:accept_length + 1} + accept_index_cpu = accept_index.tolist() + predict_cpu = predict.tolist() + # iterate every accepted token and check if req has finished after append the token + # should be checked BEFORE free kv cache slots + for i, (req, accept_index_row) in enumerate(zip(batch.reqs, accept_index_cpu)): + new_accept_index_ = [] + for j, idx in enumerate(accept_index_row): + if idx == -1: + break + id = predict_cpu[idx] + # if not found_finished: + req.output_ids.append(id) + finished_extend_len[req.rid] = j + 1 + req.check_finished() + if req.finished(): + draft_input.has_finished = True + # set all tokens after finished token to -1 and break + accept_index[i, j + 1 :] = -1 + break + else: + new_accept_index_.append(idx) + if not req.finished(): + new_accept_index.extend(new_accept_index_) + unfinished_index.append(i) + accept_length = (accept_index != -1).sum(dim=1) - 1 + accept_index = accept_index[accept_index != -1] accept_length_cpu = accept_length.tolist() verified_id = predict[accept_index] verified_id_cpu = verified_id.tolist() @@ -570,26 +599,9 @@ def verify(self, batch: ScheduleBatch, logits_output: torch.Tensor) -> torch.Ten triton.next_power_of_2(bs), ) batch.seq_lens.add_(accept_length + 1) - new_accept_index = [] - unfinished_index = [] - finished_extend_len = {} # {rid:accept_length + 1} - # retracted_reqs, new_token_ratio = batch.retract_decode() - - low = 0 - draft_input = EAGLEDraftInput() - for i, (req, verified_len) in enumerate(zip(batch.reqs, accept_length_cpu)): - req.output_ids.extend(verified_id_cpu[low : low + verified_len + 1]) - req.check_finished() - if req.finished(): - draft_input.has_finished = True - else: - new_accept_index.append(accept_index[low : low + verified_len + 1]) - unfinished_index.append(i) - low += verified_len + 1 - finished_extend_len[req.rid] = verified_len + 1 if len(new_accept_index) > 0: - new_accept_index = torch.cat(new_accept_index, dim=0) + new_accept_index = torch.tensor(new_accept_index, device="cuda") draft_input.verified_id = predict[new_accept_index] draft_input.hidden_states = batch.spec_info.hidden_states[new_accept_index] draft_input.accept_length = accept_length[unfinished_index] diff --git a/test/srt/test_eagle_infer.py b/test/srt/test_eagle_infer.py index 609d4411d77d..94ebc79ca743 100644 --- a/test/srt/test_eagle_infer.py +++ b/test/srt/test_eagle_infer.py @@ -1,5 +1,7 @@ import unittest +from transformers import AutoConfig, AutoTokenizer + import sglang as sgl @@ 
-34,6 +36,33 @@ def test_eagle_accuracy(self): print(out2) self.assertEqual(out1, out2) + def test_eagle_end_check(self): + prompt = "[INST] <>\\nYou are a helpful assistant.\\n<>\\nToday is a sunny day and I like [/INST]" + target_model_path = "meta-llama/Llama-2-7b-chat-hf" + tokenizer = AutoTokenizer.from_pretrained(target_model_path) + speculative_draft_model_path = "lmzheng/sglang-EAGLE-llama2-chat-7B" + + sampling_params = { + "temperature": 0, + "max_new_tokens": 1024, + "skip_special_tokens": False, + } + + engine = sgl.Engine( + model_path=target_model_path, + speculative_draft_model_path=speculative_draft_model_path, + speculative_algorithm="EAGLE", + speculative_num_steps=3, + speculative_eagle_topk=4, + speculative_num_draft_tokens=16, + ) + out1 = engine.generate(prompt, sampling_params)["text"] + engine.shutdown() + print("==== Answer 1 ====") + print(repr(out1)) + tokens = tokenizer.encode(out1, truncation=False) + assert tokenizer.eos_token_id not in tokens + if __name__ == "__main__": unittest.main() From 8a6906127a81421e06c904273f8e06dff85039a7 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Tue, 7 Jan 2025 23:29:10 -0800 Subject: [PATCH 003/248] Improve linear.py to load sharded weights & remove the dependency of Parameters from vllm (#2784) Co-authored-by: SangBin Cho rkooo567@gmail.com --- 3rdparty/amd/tuning/benchmark_moe_rocm.py | 5 +- .../sglang/srt/layers/attention/__init__.py | 9 +- .../layers/attention/flashinfer_backend.py | 6 +- python/sglang/srt/layers/linear.py | 222 ++++++--- .../srt/layers/moe/fused_moe_triton/layer.py | 5 +- python/sglang/srt/layers/parameter.py | 431 ++++++++++++++++++ python/sglang/srt/layers/quantization/fp8.py | 2 +- .../srt/layers/vocab_parallel_embedding.py | 2 +- .../sglang/srt/managers/session_controller.py | 2 +- .../srt/model_executor/forward_batch_info.py | 3 + .../sglang/srt/model_executor/model_runner.py | 3 +- python/sglang/srt/models/grok.py | 41 +- python/sglang/srt/server.py | 9 +- python/sglang/srt/speculative/eagle_utils.py | 2 +- scripts/killall_sglang.sh | 1 + 15 files changed, 655 insertions(+), 88 deletions(-) create mode 100644 python/sglang/srt/layers/parameter.py diff --git a/3rdparty/amd/tuning/benchmark_moe_rocm.py b/3rdparty/amd/tuning/benchmark_moe_rocm.py index a3f26e8e5028..5aff8c0d664e 100644 --- a/3rdparty/amd/tuning/benchmark_moe_rocm.py +++ b/3rdparty/amd/tuning/benchmark_moe_rocm.py @@ -10,7 +10,10 @@ from tqdm import tqdm from transformers import AutoConfig -from sglang.srt.layers.fused_moe_triton.fused_moe import fused_moe, get_config_file_name +from sglang.srt.layers.moe.fused_moe_triton.fused_moe import ( + fused_moe, + get_config_file_name, +) padding_size = 128 if bool(int(os.getenv("MOE_PADDING", "0"))) else 0 diff --git a/python/sglang/srt/layers/attention/__init__.py b/python/sglang/srt/layers/attention/__init__.py index 140755ff5e67..745598643028 100644 --- a/python/sglang/srt/layers/attention/__init__.py +++ b/python/sglang/srt/layers/attention/__init__.py @@ -66,7 +66,14 @@ def forward( if forward_batch.forward_mode.is_decode(): return self.forward_decode(q, k, v, layer, forward_batch, save_kv_cache) else: - return self.forward_extend(q, k, v, layer, forward_batch, save_kv_cache) + return self.forward_extend( + q, + k, + v, + layer, + forward_batch, + save_kv_cache, + ) def forward_decode( self, diff --git a/python/sglang/srt/layers/attention/flashinfer_backend.py b/python/sglang/srt/layers/attention/flashinfer_backend.py index 8b823cc5a5dd..fc3455b60774 100644 --- 
a/python/sglang/srt/layers/attention/flashinfer_backend.py +++ b/python/sglang/srt/layers/attention/flashinfer_backend.py @@ -347,6 +347,8 @@ def forward_extend( else forward_batch.encoder_out_cache_loc ) + logits_soft_cap = layer.logit_cap + if not self.forward_metadata.use_ragged: if k is not None: assert v is not None @@ -359,7 +361,7 @@ def forward_extend( causal=not layer.is_cross_attention, sm_scale=layer.scaling, window_left=layer.sliding_window_size, - logits_soft_cap=layer.logit_cap, + logits_soft_cap=logits_soft_cap, ) else: o1, s1 = self.prefill_wrapper_ragged.forward_return_lse( @@ -368,7 +370,7 @@ def forward_extend( v.contiguous().view(-1, layer.tp_v_head_num, layer.head_dim), causal=True, sm_scale=layer.scaling, - logits_soft_cap=layer.logit_cap, + logits_soft_cap=logits_soft_cap, ) if self.forward_metadata.extend_no_prefix: diff --git a/python/sglang/srt/layers/linear.py b/python/sglang/srt/layers/linear.py index b828c03911e8..9edfa739458b 100644 --- a/python/sglang/srt/layers/linear.py +++ b/python/sglang/srt/layers/linear.py @@ -18,14 +18,15 @@ # workaround from vllm.model_executor.layers.linear import LinearBase -from vllm.model_executor.parameter import ( + +from sglang.srt.layers.parameter import ( BasevLLMParameter, PackedColumnParameter, PackedvLLMParameter, PerTensorScaleParameter, RowvLLMParameter, + _ColumnvLLMParameter, ) - from sglang.srt.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase, @@ -94,6 +95,62 @@ def adjust_scalar_to_fused_array(param, loaded_weight, shard_id): return param[shard_id], loaded_weight +def load_column_qkv_weight( + self, loaded_weight, num_heads, shard_id, shard_offset, shard_size, tp_rank +): + if ( + isinstance(self, (PackedColumnParameter, PackedvLLMParameter)) + and self.output_dim == self.packed_dim + ): + shard_size, shard_offset = self.adjust_shard_indexes_for_packing( + shard_offset=shard_offset, shard_size=shard_size + ) + + param_data = self.data + shard_id = tp_rank if shard_id == "q" else tp_rank // num_heads + param_data = param_data.narrow(self.output_dim, shard_offset, shard_size) + loaded_weight = loaded_weight.narrow( + self.output_dim, shard_id * shard_size, shard_size + ) + + assert param_data.shape == loaded_weight.shape + param_data.copy_(loaded_weight) + + +def load_column_parallel_weight( + self, loaded_weight: torch.Tensor, tp_rank, use_presharded_weights: bool = False +): + if isinstance(self, _ColumnvLLMParameter): + if not use_presharded_weights: + shard_size = self.data.shape[self.output_dim] + loaded_weight = loaded_weight.narrow( + self.output_dim, tp_rank * shard_size, shard_size + ) + assert self.data.shape == loaded_weight.shape + self.data.copy_(loaded_weight) + else: + self.data.copy_(loaded_weight) + + +def load_row_parallel_weight( + self, loaded_weight: torch.Tensor, tp_rank, use_presharded_weights: bool = False +): + if isinstance(self, RowvLLMParameter): + if not use_presharded_weights: + shard_size = self.data.shape[self.input_dim] + loaded_weight = loaded_weight.narrow( + self.input_dim, tp_rank * shard_size, shard_size + ) + + if len(loaded_weight.shape) == 0: + loaded_weight = loaded_weight.reshape(1) + + assert self.data.shape == loaded_weight.shape + self.data.copy_(loaded_weight) + else: + self.data.copy_(loaded_weight) + + class LinearMethodBase(QuantizeMethodBase): """Base class for different (maybe quantized) linear methods.""" @@ -287,6 +344,8 @@ def __init__( quant_config: Optional[QuantizationConfig] = None, output_sizes: Optional[List[int]] = None, 
prefix: str = "", + tp_rank: Optional[int] = None, + tp_size: Optional[int] = None, ): super().__init__( input_size, output_size, skip_bias_add, params_dtype, quant_config, prefix @@ -295,7 +354,11 @@ def __init__( self.gather_output = gather_output # Divide the weight matrix along the last dimension. - tp_size = get_tensor_model_parallel_world_size() + if tp_rank is None: + tp_rank = get_tensor_model_parallel_rank() + if tp_size is None: + tp_size = get_tensor_model_parallel_world_size() + self.tp_rank, self.tp_size = tp_rank, tp_size assert self.quant_method is not None self.output_size_per_partition = divide(self.output_size, tp_size) self.output_partition_sizes = [self.output_size_per_partition] @@ -336,7 +399,6 @@ def __init__( self.register_parameter("bias", None) def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): - tp_rank = get_tensor_model_parallel_rank() output_dim = getattr(param, "output_dim", None) # Special case for GGUF @@ -356,7 +418,7 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): # no need to narrow here if output_dim is not None and not use_bitsandbytes_4bit: shard_size = param_data.shape[output_dim] - start_idx = tp_rank * shard_size + start_idx = self.tp_rank * shard_size loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) # Special case for loading scales off disk, which often do not @@ -364,7 +426,9 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): if len(loaded_weight.shape) == 0: loaded_weight = loaded_weight.reshape(1) - assert param_data.shape == loaded_weight.shape + assert ( + param_data.shape == loaded_weight.shape + ), f"{param_data.shape=}, {loaded_weight.shape=}" param_data.copy_(loaded_weight) def weight_loader_v2(self, param: Parameter, loaded_weight: torch.Tensor): @@ -373,7 +437,7 @@ def weight_loader_v2(self, param: Parameter, loaded_weight: torch.Tensor): if len(loaded_weight.shape) == 0: assert loaded_weight.numel() == 1 loaded_weight = loaded_weight.reshape(1) - param.load_column_parallel_weight(loaded_weight=loaded_weight) + load_column_parallel_weight(param, loaded_weight, self.tp_rank) def forward(self, input_): bias = self.bias if not self.skip_bias_add else None @@ -393,7 +457,7 @@ def extra_repr(self) -> str: s = f"in_features={self.input_size}" s += f", output_features={self.output_size_per_partition}" s += f", bias={self.bias is not None}" - s += f", tp_size={get_tensor_model_parallel_world_size()}" + s += f", tp_size={self.tp_size}" s += f", gather_output={self.gather_output}" return s @@ -431,10 +495,18 @@ def __init__( params_dtype: Optional[torch.dtype] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", + tp_rank: Optional[int] = None, + tp_size: Optional[int] = None, + use_presharded_weights: bool = False, ): self.output_sizes = output_sizes - tp_size = get_tensor_model_parallel_world_size() + if tp_rank is None: + tp_rank = get_tensor_model_parallel_rank() + if tp_size is None: + tp_size = get_tensor_model_parallel_world_size() + self.tp_rank, self.tp_size = tp_rank, tp_size assert all(output_size % tp_size == 0 for output_size in output_sizes) + self.use_presharded_weights = use_presharded_weights super().__init__( input_size=input_size, output_size=sum(output_sizes), @@ -444,6 +516,8 @@ def __init__( params_dtype=params_dtype, quant_config=quant_config, prefix=prefix, + tp_rank=tp_rank, + tp_size=tp_size, ) def weight_loader( @@ -463,12 +537,9 @@ def weight_loader( return if is_gguf_weight: - tp_size = 
get_tensor_model_parallel_world_size() - tp_rank = get_tensor_model_parallel_rank() - output_dim = getattr(param, "output_dim", None) - shard_size = loaded_weight.size(output_dim) // tp_size - start_idx = tp_rank * shard_size + shard_size = loaded_weight.size(output_dim) // self.tp_size + start_idx = self.tp_rank * shard_size loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) @@ -494,7 +565,9 @@ def weight_loader( param_data, loaded_weight, 0 ) - assert param_data.shape == loaded_weight.shape + assert ( + param_data.shape == loaded_weight.shape + ), f"{param_data.shape=}, {loaded_weight.shape=}" param_data.copy_(loaded_weight) return current_shard_offset = 0 @@ -522,11 +595,9 @@ def weight_loader( return assert loaded_shard_id < len(self.output_sizes) - tp_rank = get_tensor_model_parallel_rank() - tp_size = get_tensor_model_parallel_world_size() if output_dim is not None: - shard_offset = sum(self.output_sizes[:loaded_shard_id]) // tp_size - shard_size = self.output_sizes[loaded_shard_id] // tp_size + shard_offset = sum(self.output_sizes[:loaded_shard_id]) // self.tp_size + shard_size = self.output_sizes[loaded_shard_id] // self.tp_size # Special case for quantization. # If quantized, we need to adjust the offset and size to account # for the packing. @@ -545,10 +616,10 @@ def weight_loader( shard_offset = loaded_weight.shape[output_dim] * loaded_shard_id param_data = param_data.narrow(output_dim, shard_offset, shard_size) - start_idx = tp_rank * shard_size + start_idx = self.tp_rank * shard_size # bitsandbytes loads the weights of the specific portion # no need to narrow here - if not use_bitsandbytes_4bit: + if not use_bitsandbytes_4bit and not self.use_presharded_weights: loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) # Special case for AQLM codebooks. elif is_metadata: @@ -572,7 +643,9 @@ def weight_loader( "the same for all partitions." 
) - assert param_data.shape == loaded_weight.shape + assert ( + param_data.shape == loaded_weight.shape + ), f"{param_data.shape=}, {loaded_weight.shape=}" param_data.copy_(loaded_weight) def _load_fused_module_from_checkpoint( @@ -629,26 +702,27 @@ def weight_loader_v2( assert loaded_shard_id < len(self.output_sizes) - tp_size = get_tensor_model_parallel_world_size() - if isinstance(param, BlockQuantScaleParameter): weight_block_size = self.quant_method.quant_config.weight_block_size block_n, _ = weight_block_size[0], weight_block_size[1] shard_offset = ( (sum(self.output_sizes[:loaded_shard_id]) + block_n - 1) // block_n - ) // tp_size + ) // self.tp_size shard_size = ( - (self.output_sizes[loaded_shard_id] + block_n - 1) // block_n // tp_size + (self.output_sizes[loaded_shard_id] + block_n - 1) + // block_n + // self.tp_size ) else: - shard_offset = sum(self.output_sizes[:loaded_shard_id]) // tp_size - shard_size = self.output_sizes[loaded_shard_id] // tp_size + shard_offset = sum(self.output_sizes[:loaded_shard_id]) // self.tp_size + shard_size = self.output_sizes[loaded_shard_id] // self.tp_size param.load_merged_column_weight( loaded_weight=loaded_weight, shard_id=loaded_shard_id, shard_offset=shard_offset, shard_size=shard_size, + use_presharded_weights=self.use_presharded_weights, ) @@ -689,6 +763,8 @@ def __init__( params_dtype: Optional[torch.dtype] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", + tp_rank: Optional[int] = None, + tp_size: Optional[int] = None, ): self.hidden_size = hidden_size self.head_size = head_size @@ -697,7 +773,11 @@ def __init__( total_num_kv_heads = total_num_heads self.total_num_kv_heads = total_num_kv_heads # Divide the weight matrix along the last dimension. - tp_size = get_tensor_model_parallel_world_size() + if tp_rank is None: + tp_rank = get_tensor_model_parallel_rank() + if tp_size is None: + tp_size = get_tensor_model_parallel_world_size() + self.tp_rank, self.tp_size = tp_rank, tp_size self.num_heads = divide(self.total_num_heads, tp_size) if tp_size >= self.total_num_kv_heads: self.num_kv_heads = 1 @@ -724,6 +804,8 @@ def __init__( params_dtype=params_dtype, quant_config=quant_config, prefix=prefix, + tp_rank=tp_rank, + tp_size=tp_size, ) def _get_shard_offset_mapping(self, loaded_shard_id: str): @@ -814,13 +896,24 @@ def weight_loader_v2( shard_offset = (shard_offset + block_n - 1) // block_n shard_size = (shard_size + block_n - 1) // block_n - param.load_qkv_weight( - loaded_weight=loaded_weight, - num_heads=self.num_kv_head_replicas, - shard_id=loaded_shard_id, - shard_offset=shard_offset, - shard_size=shard_size, - ) + if isinstance(param, _ColumnvLLMParameter): + load_column_qkv_weight( + param, + loaded_weight, + num_heads=self.num_kv_head_replicas, + shard_id=loaded_shard_id, + shard_offset=shard_offset, + shard_size=shard_size, + tp_rank=self.tp_rank, + ) + else: + param.load_qkv_weight( + loaded_weight=loaded_weight, + num_heads=self.num_kv_head_replicas, + shard_id=loaded_shard_id, + shard_offset=shard_offset, + shard_size=shard_size, + ) def weight_loader( self, @@ -840,12 +933,9 @@ def weight_loader( return if is_gguf_weight: - tp_size = get_tensor_model_parallel_world_size() - tp_rank = get_tensor_model_parallel_rank() - output_dim = getattr(param, "output_dim", None) - shard_size = loaded_weight.size(output_dim) // tp_size - start_idx = tp_rank * shard_size + shard_size = loaded_weight.size(output_dim) // self.tp_size + start_idx = self.tp_rank * shard_size loaded_weight = 
loaded_weight.narrow(output_dim, start_idx, shard_size) @@ -872,7 +962,9 @@ def weight_loader( param_data, loaded_weight, 0 ) - assert param_data.shape == loaded_weight.shape + assert ( + param_data.shape == loaded_weight.shape + ), f"{param_data.shape=}, {loaded_weight.shape=}" param_data.copy_(loaded_weight) return shard_offsets = [ @@ -934,7 +1026,6 @@ def weight_loader( self.weight_loader(param, loaded_weight_shard, shard_id) return - tp_rank = get_tensor_model_parallel_rank() assert loaded_shard_id in ["q", "k", "v"] # If output dim is defined, use the default loading process. @@ -984,9 +1075,9 @@ def weight_loader( param_data = param_data.narrow(output_dim, shard_offset, shard_size) if loaded_shard_id == "q": - shard_id = tp_rank + shard_id = self.tp_rank else: - shard_id = tp_rank // self.num_kv_head_replicas + shard_id = self.tp_rank // self.num_kv_head_replicas start_idx = shard_id * shard_size # bitsandbytes loads the weights of the specific portion @@ -1014,7 +1105,9 @@ def weight_loader( "for all partitions." ) - assert param_data.shape == loaded_weight.shape + assert ( + param_data.shape == loaded_weight.shape + ), f"{param_data.shape=}, {loaded_weight.shape=}" param_data.copy_(loaded_weight) @@ -1055,6 +1148,9 @@ def __init__( reduce_results: bool = True, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", + tp_rank: Optional[int] = None, + tp_size: Optional[int] = None, + use_presharded_weights: bool = False, ): super().__init__( input_size, output_size, skip_bias_add, params_dtype, quant_config, prefix @@ -1064,10 +1160,14 @@ def __init__( self.reduce_results = reduce_results # Divide the weight matrix along the last dimension. - self.tp_rank = get_tensor_model_parallel_rank() - self.tp_size = get_tensor_model_parallel_world_size() + if tp_rank is None: + tp_rank = get_tensor_model_parallel_rank() + if tp_size is None: + tp_size = get_tensor_model_parallel_world_size() + self.tp_rank, self.tp_size = tp_rank, tp_size self.input_size_per_partition = divide(input_size, self.tp_size) assert self.quant_method is not None + self.use_presharded_weights = use_presharded_weights self.quant_method.create_weights( layer=self, @@ -1101,8 +1201,6 @@ def __init__( self.register_parameter("bias", None) def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): - tp_rank = get_tensor_model_parallel_rank() - tp_size = get_tensor_model_parallel_world_size() input_dim = getattr(param, "input_dim", None) use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False) @@ -1116,15 +1214,19 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): if is_gguf_weight and isinstance(param, UninitializedParameter): weight_shape = list(loaded_weight.shape) if input_dim: - weight_shape[input_dim] = weight_shape[input_dim] // tp_size + weight_shape[input_dim] = weight_shape[input_dim] // self.tp_size param.materialize(tuple(weight_shape), dtype=loaded_weight.dtype) param_data = param.data # bitsandbytes loads the weights of the specific portion # no need to narrow here - if input_dim is not None and not use_bitsandbytes_4bit: + if ( + input_dim is not None + and not use_bitsandbytes_4bit + and not self.use_presharded_weights + ): shard_size = param_data.shape[input_dim] - start_idx = tp_rank * shard_size + start_idx = self.tp_rank * shard_size loaded_weight = loaded_weight.narrow(input_dim, start_idx, shard_size) # Special case for loading scales off disk, which often do not @@ -1132,7 +1234,9 @@ def weight_loader(self, param: Parameter, 
loaded_weight: torch.Tensor): if len(loaded_weight.shape) == 0: loaded_weight = loaded_weight.reshape(1) - assert param_data.shape == loaded_weight.shape + assert ( + param_data.shape == loaded_weight.shape + ), f"{param_data.shape=}, {loaded_weight.shape=}" param_data.copy_(loaded_weight) def weight_loader_v2(self, param: BasevLLMParameter, loaded_weight: torch.Tensor): @@ -1143,17 +1247,21 @@ def weight_loader_v2(self, param: BasevLLMParameter, loaded_weight: torch.Tensor assert loaded_weight.numel() == 1 loaded_weight = loaded_weight.reshape(1) - param.load_row_parallel_weight(loaded_weight=loaded_weight) + load_row_parallel_weight( + param, + loaded_weight, + self.tp_rank, + use_presharded_weights=self.use_presharded_weights, + ) def forward(self, input_): if self.input_is_parallel: input_parallel = input_ else: - tp_rank = get_tensor_model_parallel_rank() splitted_input = split_tensor_along_last_dim( input_, num_partitions=self.tp_size ) - input_parallel = splitted_input[tp_rank].contiguous() + input_parallel = splitted_input[self.tp_rank].contiguous() # Matrix multiply. assert self.quant_method is not None diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/layer.py b/python/sglang/srt/layers/moe/fused_moe_triton/layer.py index 96eaf856616f..8d0b7035ee50 100644 --- a/python/sglang/srt/layers/moe/fused_moe_triton/layer.py +++ b/python/sglang/srt/layers/moe/fused_moe_triton/layer.py @@ -204,6 +204,7 @@ def __init__( prefix: str = "", custom_routing_function: Optional[Callable] = None, correction_bias: Optional[torch.Tensor] = None, + use_presharded_weights: bool = False, ): super().__init__() @@ -243,6 +244,7 @@ def __init__( params_dtype=params_dtype, weight_loader=self.weight_loader, ) + self.use_presharded_weights = use_presharded_weights def _load_per_tensor_weight_scale( self, @@ -395,10 +397,7 @@ def weight_loader( weight_name: str, shard_id: str, expert_id: int, - use_presharded_weights: bool = False, ) -> None: - self.use_presharded_weights = use_presharded_weights - # compressed-tensors checkpoints with packed weights are stored flipped # TODO (mgoin): check self.quant_method.quant_config.quant_format # against known CompressionFormat enum values that have this quality diff --git a/python/sglang/srt/layers/parameter.py b/python/sglang/srt/layers/parameter.py new file mode 100644 index 000000000000..435cc69bb51d --- /dev/null +++ b/python/sglang/srt/layers/parameter.py @@ -0,0 +1,431 @@ +""" +Adapted from vLLM (0.6.4.post1). +https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/parameter.py +""" + +import logging +from fractions import Fraction +from typing import Callable, Optional, Union + +import torch +from torch.nn import Parameter +from vllm.distributed import get_tensor_model_parallel_rank + +__all__ = [ + "BasevLLMParameter", + "PackedvLLMParameter", + "PerTensorScaleParameter", + "ModelWeightParameter", + "ChannelQuantScaleParameter", + "GroupQuantScaleParameter", + "PackedColumnParameter", + "RowvLLMParameter", +] + +logger = logging.getLogger(__name__) + + +class BasevLLMParameter(Parameter): + """ + Base parameter for vLLM linear layers. Extends the torch.nn.parameter + by taking in a linear weight loader. Will copy the loaded weight + into the parameter when the provided weight loader is called. 
+ """ + + def __new__(cls, data: torch.Tensor, **kwargs): + + return super().__new__(cls, data=data, requires_grad=False) + + def __init__(self, data: torch.Tensor, weight_loader: Callable): + """ + Initialize the BasevLLMParameter + + :param data: torch tensor with the parameter data + :param weight_loader: weight loader callable + + :returns: a torch.nn.parameter + """ + + self._weight_loader = weight_loader + + @property + def weight_loader(self): + return self._weight_loader + + def _assert_and_load(self, loaded_weight: torch.Tensor): + assert self.data.shape == loaded_weight.shape + self.data.copy_(loaded_weight) + + def load_column_parallel_weight(self, loaded_weight: torch.Tensor): + self._assert_and_load(loaded_weight) + + def load_row_parallel_weight(self, loaded_weight: torch.Tensor): + self._assert_and_load(loaded_weight) + + def load_merged_column_weight(self, loaded_weight: torch.Tensor, **kwargs): + self._assert_and_load(loaded_weight) + + def load_qkv_weight(self, loaded_weight: torch.Tensor, **kwargs): + self._assert_and_load(loaded_weight) + + +class _ColumnvLLMParameter(BasevLLMParameter): + """ + Private class defining weight loading functionality + (load_merged_column_weight, load_qkv_weight) + for parameters being loaded into linear layers with column + parallelism. This includes QKV and MLP layers which are + not already fused on disk. Requires an output dimension + to be defined. Called within the weight loader of + each of the column parallel linear layers. + """ + + def __init__(self, output_dim: int, **kwargs): + self._output_dim = output_dim + super().__init__(**kwargs) + + @property + def output_dim(self): + return self._output_dim + + def load_column_parallel_weight(self, loaded_weight: torch.Tensor): + tp_rank = get_tensor_model_parallel_rank() + shard_size = self.data.shape[self.output_dim] + loaded_weight = loaded_weight.narrow( + self.output_dim, tp_rank * shard_size, shard_size + ) + assert self.data.shape == loaded_weight.shape + self.data.copy_(loaded_weight) + + def load_merged_column_weight(self, loaded_weight: torch.Tensor, **kwargs): + + shard_offset = kwargs.get("shard_offset") + shard_size = kwargs.get("shard_size") + use_presharded_weights = kwargs.get("use_presharded_weights") + if ( + isinstance(self, (PackedColumnParameter, PackedvLLMParameter)) + and self.packed_dim == self.output_dim + ): + shard_size, shard_offset = self.adjust_shard_indexes_for_packing( + shard_offset=shard_offset, shard_size=shard_size + ) + + param_data = self.data + + tp_rank = get_tensor_model_parallel_rank() + param_data = param_data.narrow(self.output_dim, shard_offset, shard_size) + if not use_presharded_weights: + loaded_weight = loaded_weight.narrow( + self.output_dim, tp_rank * shard_size, shard_size + ) + assert param_data.shape == loaded_weight.shape + param_data.copy_(loaded_weight) + + def load_qkv_weight(self, loaded_weight: torch.Tensor, **kwargs): + + shard_offset = kwargs.get("shard_offset") + shard_size = kwargs.get("shard_size") + shard_id = kwargs.get("shard_id") + num_heads = kwargs.get("num_heads") + + if ( + isinstance(self, (PackedColumnParameter, PackedvLLMParameter)) + and self.output_dim == self.packed_dim + ): + shard_size, shard_offset = self.adjust_shard_indexes_for_packing( + shard_offset=shard_offset, shard_size=shard_size + ) + + param_data = self.data + tp_rank = get_tensor_model_parallel_rank() + shard_id = tp_rank if shard_id == "q" else tp_rank // num_heads + param_data = param_data.narrow(self.output_dim, shard_offset, shard_size) + 
loaded_weight = loaded_weight.narrow( + self.output_dim, shard_id * shard_size, shard_size + ) + + assert param_data.shape == loaded_weight.shape + param_data.copy_(loaded_weight) + + +class RowvLLMParameter(BasevLLMParameter): + """ + Parameter class defining weight_loading functionality + (load_row_parallel_weight) for parameters being loaded + into linear layers with row parallel functionality. + Requires an input_dim to be defined. + """ + + def __init__(self, input_dim: int, **kwargs): + self._input_dim = input_dim + super().__init__(**kwargs) + + @property + def input_dim(self): + return self._input_dim + + def load_row_parallel_weight(self, loaded_weight: torch.Tensor, **kwargs): + use_presharded_weights = kwargs.get("use_presharded_weights") + tp_rank = get_tensor_model_parallel_rank() + shard_size = self.data.shape[self.input_dim] + if not use_presharded_weights: + loaded_weight = loaded_weight.narrow( + self.input_dim, tp_rank * shard_size, shard_size + ) + + if len(loaded_weight.shape) == 0: + loaded_weight = loaded_weight.reshape(1) + + assert self.data.shape == loaded_weight.shape + self.data.copy_(loaded_weight) + + +class ModelWeightParameter(_ColumnvLLMParameter, RowvLLMParameter): + """ + Parameter class for linear layer weights. Uses both column and + row parallelism. + """ + + pass + + +class GroupQuantScaleParameter(_ColumnvLLMParameter, RowvLLMParameter): + """ + Parameter class for weight scales loaded for weights with + grouped quantization. Uses both column and row parallelism. + """ + + pass + + +class ChannelQuantScaleParameter(_ColumnvLLMParameter): + """ + Parameter class for weight scales loaded for weights with + channel-wise quantization. Equivalent to _ColumnvLLMParameter. + """ + + pass + + +class PerTensorScaleParameter(BasevLLMParameter): + """ + Parameter class for scales where the number of scales is + equivalent to the number of logical matrices in fused linear + layers (e.g. for QKV, there are 3 scales loaded from disk). + This is relevant to weights with per-tensor quantization. + Adds functionality to map the scalers to a shard during + weight loading. + + Note: additional parameter manipulation may be handled + for each quantization config specifically, within + process_weights_after_loading + """ + + def __init__(self, **kwargs): + self.qkv_idxs = {"q": 0, "k": 1, "v": 2} + super().__init__(**kwargs) + + def _shard_id_as_int(self, shard_id: Union[str, int]) -> int: + if isinstance(shard_id, int): + return shard_id + + # if not int, assume shard_id for qkv + # map to int and return + assert isinstance(shard_id, str) + assert shard_id in self.qkv_idxs + return self.qkv_idxs[shard_id] + + # For row parallel layers, no sharding needed + # load weight into parameter as is + def load_row_parallel_weight(self, *args, **kwargs): + super().load_row_parallel_weight(*args, **kwargs) + + def load_merged_column_weight(self, *args, **kwargs): + self._load_into_shard_id(*args, **kwargs) + + def load_qkv_weight(self, *args, **kwargs): + self._load_into_shard_id(*args, **kwargs) + + def load_column_parallel_weight(self, *args, **kwargs): + super().load_row_parallel_weight(*args, **kwargs) + + def _load_into_shard_id( + self, loaded_weight: torch.Tensor, shard_id: Union[str, int], **kwargs + ): + """ + Slice the parameter data based on the shard id for + loading. 
+ """ + + param_data = self.data + shard_id = self._shard_id_as_int(shard_id) + + # AutoFP8 scales do not have a shape + # compressed-tensors scales do have a shape + if len(loaded_weight.shape) != 0: + assert loaded_weight.shape[0] == 1 + loaded_weight = loaded_weight[0] + + param_data = param_data[shard_id] + assert param_data.shape == loaded_weight.shape + param_data.copy_(loaded_weight) + + +class PackedColumnParameter(_ColumnvLLMParameter): + """ + Parameter for model parameters which are packed on disk + and support column parallelism only. See PackedvLLMParameter + for more details on the packed properties. + """ + + def __init__( + self, + packed_factor: Union[int, Fraction], + packed_dim: int, + marlin_tile_size: Optional[int] = None, + **kwargs + ): + self._packed_factor = packed_factor + self._packed_dim = packed_dim + self._marlin_tile_size = marlin_tile_size + super().__init__(**kwargs) + + @property + def packed_dim(self): + return self._packed_dim + + @property + def packed_factor(self): + return self._packed_factor + + @property + def marlin_tile_size(self): + return self._marlin_tile_size + + def adjust_shard_indexes_for_packing(self, shard_size, shard_offset): + return _adjust_shard_indexes_for_packing( + shard_size=shard_size, + shard_offset=shard_offset, + packed_factor=self.packed_factor, + marlin_tile_size=self.marlin_tile_size, + ) + + +class PackedvLLMParameter(ModelWeightParameter): + """ + Parameter for model weights which are packed on disk. + Example: GPTQ Marlin weights are int4 or int8, packed into int32. + Extends the ModelWeightParameter to take in the + packed factor, the packed dimension, and optionally, marlin + tile size for marlin kernels. Adjusts the shard_size and + shard_offset for fused linear layers model weight loading + by accounting for packing and optionally, marlin tile size. 
+ """ + + def __init__( + self, + packed_factor: Union[int, Fraction], + packed_dim: int, + marlin_tile_size: Optional[int] = None, + **kwargs + ): + self._packed_factor = packed_factor + self._packed_dim = packed_dim + self._marlin_tile_size = marlin_tile_size + super().__init__(**kwargs) + + @property + def packed_dim(self): + return self._packed_dim + + @property + def packed_factor(self): + return self._packed_factor + + @property + def marlin_tile_size(self): + return self._marlin_tile_size + + def adjust_shard_indexes_for_packing(self, shard_size, shard_offset): + return _adjust_shard_indexes_for_packing( + shard_size=shard_size, + shard_offset=shard_offset, + packed_factor=self.packed_factor, + marlin_tile_size=self.marlin_tile_size, + ) + + +def permute_param_layout_( + param: BasevLLMParameter, input_dim: int, output_dim: int, **kwargs +) -> BasevLLMParameter: + """ + Permute a parameter's layout to the specified input and output dimensions, + useful for forcing the parameter into a known layout, for example, if I need + a packed (quantized) weight matrix to be in the layout + {input_dim = 0, output_dim = 1, packed_dim = 0} + then I can call: + permute_param_layout_(x, input_dim=0, output_dim=1, packed_dim=0) + to ensure x is in the correct layout (permuting it to the correct layout if + required, asserting if it cannot get it to the correct layout) + """ + + curr_input_dim = getattr(param, "input_dim", None) + curr_output_dim = getattr(param, "output_dim", None) + + if curr_input_dim is None or curr_output_dim is None: + assert param.data.dim() == 2, ( + "permute_param_layout_ only supports 2D parameters when either " + "input_dim or output_dim is not set" + ) + + # if one of the dimensions is not set, set it to the opposite of the other + # we can only do this since we asserted the parameter is 2D above + if curr_input_dim is None: + assert curr_output_dim is not None, "either input or output dim must be set" + curr_input_dim = (curr_output_dim + 1) % 2 + if curr_output_dim is None: + assert curr_input_dim is not None, "either input or output dim must be set" + curr_output_dim = (curr_input_dim + 1) % 2 + + # create permutation from the current layout to the layout with + # self.input_dim at input_dim and self.output_dim at output_dim preserving + # other dimensions + perm = [ + i for i in range(param.data.dim()) if i not in [curr_input_dim, curr_output_dim] + ] + perm.insert(input_dim, curr_input_dim) + perm.insert(output_dim, curr_output_dim) + + if "packed_dim" in kwargs: + assert ( + hasattr(param, "packed_dim") + and param.packed_dim == perm[kwargs["packed_dim"]] + ), "permute_param_layout_ currently doesn't support repacking" + + param.data = param.data.permute(*perm) + if hasattr(param, "_input_dim"): + param._input_dim = input_dim + if hasattr(param, "_output_dim"): + param._output_dim = output_dim + if "packed_dim" in kwargs and hasattr(param, "_packed_dim"): + param._packed_dim = kwargs["packed_dim"] + + return param + + +def _adjust_shard_indexes_for_marlin(shard_size, shard_offset, marlin_tile_size): + return shard_size * marlin_tile_size, shard_offset * marlin_tile_size + + +def _adjust_shard_indexes_for_packing( + shard_size, shard_offset, packed_factor, marlin_tile_size +): + shard_size = shard_size // packed_factor + shard_offset = shard_offset // packed_factor + if marlin_tile_size is not None: + return _adjust_shard_indexes_for_marlin( + shard_size=shard_size, + shard_offset=shard_offset, + marlin_tile_size=marlin_tile_size, + ) + return shard_size, 
shard_offset diff --git a/python/sglang/srt/layers/quantization/fp8.py b/python/sglang/srt/layers/quantization/fp8.py index a263cb2362a9..f9e4a8a4ff45 100644 --- a/python/sglang/srt/layers/quantization/fp8.py +++ b/python/sglang/srt/layers/quantization/fp8.py @@ -25,9 +25,9 @@ per_tensor_dequantize, requantize_with_max_scale, ) -from vllm.model_executor.parameter import ModelWeightParameter, PerTensorScaleParameter from sglang.srt.layers.linear import LinearMethodBase, UnquantizedLinearMethod +from sglang.srt.layers.parameter import ModelWeightParameter, PerTensorScaleParameter from sglang.srt.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase, diff --git a/python/sglang/srt/layers/vocab_parallel_embedding.py b/python/sglang/srt/layers/vocab_parallel_embedding.py index effea1c6c950..21d973918758 100644 --- a/python/sglang/srt/layers/vocab_parallel_embedding.py +++ b/python/sglang/srt/layers/vocab_parallel_embedding.py @@ -12,8 +12,8 @@ get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce, ) -from vllm.model_executor.parameter import BasevLLMParameter +from sglang.srt.layers.parameter import BasevLLMParameter from sglang.srt.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase, diff --git a/python/sglang/srt/managers/session_controller.py b/python/sglang/srt/managers/session_controller.py index e3e94ce6b655..e9c0c909d52c 100644 --- a/python/sglang/srt/managers/session_controller.py +++ b/python/sglang/srt/managers/session_controller.py @@ -99,7 +99,7 @@ def create_req(self, req: TokenizedGenerateReqInput, tokenizer): if last_req is not None: # trim bos token if it is an append - if req.input_ids[0] == tokenizer.bos_token_id: + if tokenizer is not None and req.input_ids[0] == tokenizer.bos_token_id: req.input_ids = req.input_ids[1:] input_ids = ( diff --git a/python/sglang/srt/model_executor/forward_batch_info.py b/python/sglang/srt/model_executor/forward_batch_info.py index fab8b15a3316..354408ab3433 100644 --- a/python/sglang/srt/model_executor/forward_batch_info.py +++ b/python/sglang/srt/model_executor/forward_batch_info.py @@ -106,6 +106,9 @@ def is_cuda_graph(self): def is_dummy_first(self): return self == ForwardMode.DUMMY_FIRST + def is_decode_or_idle(self): + return self == ForwardMode.DECODE or self == ForwardMode.IDLE + class CaptureHiddenMode(IntEnum): NULL = auto() diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index 7cd9e759a3dc..719db19cd765 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -205,7 +205,7 @@ def init_torch_distributed(self): if self.device == "cuda": backend = "nccl" elif self.device == "xpu": - # TODO(liangan1):Just use gloo to bypass the initilization fail + # TODO(liangan1): Just use gloo to bypass the initilization fail # Need to use xccl for xpu backend in the future backend = "gloo" elif self.device == "hpu": @@ -634,7 +634,6 @@ def init_attention_backend(self): ) def init_double_sparsity_channel_config(self, selected_channel): - selected_channel = "." 
+ selected_channel + "_proj" self.sorted_channels = [] # load channel config diff --git a/python/sglang/srt/models/grok.py b/python/sglang/srt/models/grok.py index 0485b80fc3a2..33a055a8fcb9 100644 --- a/python/sglang/srt/models/grok.py +++ b/python/sglang/srt/models/grok.py @@ -57,6 +57,7 @@ def __init__( quant_config: Optional[QuantizationConfig] = None, prefix: str = "", reduce_results=True, + use_presharded_weights: bool = False, ) -> None: super().__init__() self.gate_up_proj = MergedColumnParallelLinear( @@ -65,6 +66,7 @@ def __init__( bias=False, quant_config=quant_config, prefix=f"{prefix}.gate_up_proj", + use_presharded_weights=use_presharded_weights, ) self.down_proj = RowParallelLinear( intermediate_size, @@ -73,6 +75,7 @@ def __init__( quant_config=quant_config, prefix=f"{prefix}.down_proj", reduce_results=reduce_results, + use_presharded_weights=use_presharded_weights, ) self.act_fn = GeluAndMul(approximate="tanh") @@ -103,6 +106,7 @@ def __init__( quant_config: Optional[QuantizationConfig] = None, tp_size: Optional[int] = None, reduce_results=True, + use_presharded_weights: bool = False, ): super().__init__() self.hidden_size = hidden_size @@ -129,6 +133,7 @@ def __init__( renormalize=False, quant_config=quant_config, tp_size=tp_size, + use_presharded_weights=use_presharded_weights, ) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: @@ -156,6 +161,7 @@ def __init__( max_position: int = 4096 * 32, rope_theta: float = 10000, quant_config: Optional[QuantizationConfig] = None, + reduce_results: bool = True, ) -> None: super().__init__() self.config = config @@ -194,6 +200,7 @@ def __init__( hidden_size, bias=False, quant_config=quant_config, + reduce_results=reduce_results, ) self.rotary_emb = get_rope( self.head_dim, @@ -234,10 +241,12 @@ def __init__( config: PretrainedConfig, layer_id: int = 0, quant_config: Optional[QuantizationConfig] = None, + use_presharded_weights: bool = False, ) -> None: super().__init__() self.num_experts = config.num_local_experts self.hidden_size = config.hidden_size + self.layer_id = layer_id rope_theta = getattr(config, "rope_theta", 10000) self.self_attn = Grok1Attention( @@ -262,6 +271,7 @@ def __init__( ), quant_config=quant_config, reduce_results=True, + use_presharded_weights=use_presharded_weights, ) self.pre_attn_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.post_attn_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) @@ -299,6 +309,7 @@ def __init__( self, config: PretrainedConfig, quant_config: Optional[QuantizationConfig] = None, + use_presharded_weights: bool = False, ) -> None: super().__init__() self.config = config @@ -311,7 +322,12 @@ def __init__( ) self.layers = nn.ModuleList( [ - Grok1DecoderLayer(config, i, quant_config=quant_config) + Grok1DecoderLayer( + config, + i, + quant_config=quant_config, + use_presharded_weights=use_presharded_weights, + ) for i in range(config.num_hidden_layers) ] ) @@ -347,11 +363,7 @@ def __init__( super().__init__() self.config = config self.quant_config = quant_config - self.model = Grok1Model(config, quant_config=quant_config) - self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) - self.logits_processor = LogitsProcessor(config) - # Monkey patch _prepare_weights to load pre-sharded weights if ( self.config.num_local_experts > 0 and get_tensor_model_parallel_world_size() > 1 @@ -361,6 +373,14 @@ def __init__( else: self.use_presharded_weights = False + self.model = Grok1Model( + config, + quant_config=quant_config, + 
use_presharded_weights=self.use_presharded_weights, + ) + self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) + self.logits_processor = LogitsProcessor(config) + def forward( self, input_ids: torch.Tensor, @@ -376,10 +396,7 @@ def forward( def load_weights( self, weights: Iterable[Tuple[str, torch.Tensor]], - use_presharded_weights: Optional[bool] = None, ): - if use_presharded_weights is None: - use_presharded_weights = self.use_presharded_weights num_experts = self.config.num_local_experts stacked_params_mapping = [ @@ -435,20 +452,12 @@ def load_weight_wrapper(name, loaded_weight, *args, **kwargs): continue name = name.replace(weight_name, param_name) - if use_presharded_weights: - extra_kwargs = { - "use_presharded_weights": use_presharded_weights - } - else: - extra_kwargs = {} - load_weight_wrapper( name, loaded_weight, name, shard_id=shard_id, expert_id=expert_id, - **extra_kwargs, ) break else: diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py index f60af5d73153..8fd902818995 100644 --- a/python/sglang/srt/server.py +++ b/python/sglang/srt/server.py @@ -544,7 +544,12 @@ def launch_server( # Send a warmup request t = threading.Thread( - target=_wait_and_warmup, args=(server_args, pipe_finish_writer) + target=_wait_and_warmup, + args=( + server_args, + pipe_finish_writer, + tokenizer_manager.image_token_id, + ), ) t.start() @@ -614,7 +619,7 @@ def sigquit_handler(signum, frame): mp.set_start_method("spawn", force=True) -def _wait_and_warmup(server_args, pipe_finish_writer): +def _wait_and_warmup(server_args, pipe_finish_writer, image_token_text): headers = {} url = server_args.url() if server_args.api_key: diff --git a/python/sglang/srt/speculative/eagle_utils.py b/python/sglang/srt/speculative/eagle_utils.py index 88c88c0724f4..b804e7c6af2e 100644 --- a/python/sglang/srt/speculative/eagle_utils.py +++ b/python/sglang/srt/speculative/eagle_utils.py @@ -14,7 +14,7 @@ from sglang.srt.speculative.spec_info import SpecInfo if TYPE_CHECKING: - from python.sglang.srt.managers.schedule_batch import ScheduleBatch + from sglang.srt.managers.schedule_batch import ScheduleBatch from sglang.srt.server_args import ServerArgs diff --git a/scripts/killall_sglang.sh b/scripts/killall_sglang.sh index 4057d2be2fb4..53d08703e014 100755 --- a/scripts/killall_sglang.sh +++ b/scripts/killall_sglang.sh @@ -7,6 +7,7 @@ nvidia-smi kill -9 $(ps aux | grep 'sglang::' | grep -v 'grep' | awk '{print $2}') 2>/dev/null kill -9 $(ps aux | grep 'sglang.launch_server' | grep -v 'grep' | awk '{print $2}') 2>/dev/null kill -9 $(ps aux | grep 'sglang.bench' | grep -v 'grep' | awk '{print $2}') 2>/dev/null +kill -9 $(ps aux | grep 'sglang.data_parallel' | grep -v 'grep' | awk '{print $2}') 2>/dev/null # Clean all GPU processes if any argument is provided if [ $# -gt 0 ]; then From 977f785dad98540f01bca34abe6c6fd326fd6a7c Mon Sep 17 00:00:00 2001 From: mlmz <54172054+minleminzui@users.noreply.github.com> Date: Wed, 8 Jan 2025 16:02:59 +0800 Subject: [PATCH 004/248] Docs: Rewrite docs for LLama 405B and ModelSpace (#2773) Co-authored-by: Chayenne --- docs/backend/server_arguments.md | 43 -------------------------------- docs/index.rst | 2 ++ docs/references/llama_405B.md | 16 ++++++++++++ docs/references/modelscope.md | 28 +++++++++++++++++++++ 4 files changed, 46 insertions(+), 43 deletions(-) create mode 100644 docs/references/llama_405B.md create mode 100644 docs/references/modelscope.md diff --git a/docs/backend/server_arguments.md b/docs/backend/server_arguments.md index 
a4913b8af6b9..fcee7f88d52a 100644 --- a/docs/backend/server_arguments.md +++ b/docs/backend/server_arguments.md @@ -32,46 +32,3 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 1 ``` -## Use Models From ModelScope -
-<details>
-<summary>More</summary>
-
-To use a model from [ModelScope](https://www.modelscope.cn), set the environment variable SGLANG_USE_MODELSCOPE.
-```
-export SGLANG_USE_MODELSCOPE=true
-```
-Launch [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instruct) Server
-```
-SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --port 30000
-```
-
-Or start it by docker.
-```bash
-docker run --gpus all \
-    -p 30000:30000 \
-    -v ~/.cache/modelscope:/root/.cache/modelscope \
-    --env "SGLANG_USE_MODELSCOPE=true" \
-    --ipc=host \
-    lmsysorg/sglang:latest \
-    python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --host 0.0.0.0 --port 30000
-```
-
-</details>
-
-## Example: Run Llama 3.1 405B
-<details>
-<summary>More</summary>
-
-```bash
-# Run 405B (fp8) on a single node
-python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8
-
-# Run 405B (fp16) on two nodes
-## on the first node, replace the `172.16.4.52:20000` with your own first node ip address and port
-python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0
-
-## on the first node, replace the `172.16.4.52:20000` with your own first node ip address and port
-python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1
-```
-
diff --git a/docs/index.rst b/docs/index.rst index 80a53d1cb3bb..4141161894b4 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -60,3 +60,5 @@ The core features include: references/troubleshooting.md references/faq.md references/learn_more.md + references/llama_405B.md + references/modelscope.md diff --git a/docs/references/llama_405B.md b/docs/references/llama_405B.md new file mode 100644 index 000000000000..3383d1625c86 --- /dev/null +++ b/docs/references/llama_405B.md @@ -0,0 +1,16 @@ +# Example: Run Llama 3.1 405B + +```bash +# Run 405B (fp8) on a single node +python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8 +``` + +```bash +# Run 405B (fp16) on two nodes +## on the first node, replace the `172.16.4.52:20000` with your own first node ip address and port +python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 + +## on the first node, replace the `172.16.4.52:20000` with your own first node ip address and port +python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 +``` + diff --git a/docs/references/modelscope.md b/docs/references/modelscope.md new file mode 100644 index 000000000000..ad7b6151b435 --- /dev/null +++ b/docs/references/modelscope.md @@ -0,0 +1,28 @@ +# Use Models From ModelScope + +To use a model from [ModelScope](https://www.modelscope.cn), set the environment variable `SGLANG_USE_MODELSCOPE`. + +```bash +export SGLANG_USE_MODELSCOPE=true +``` + +We take [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instruct) as an example. Launch the Server: +--- + +```bash +python -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --port 30000 +``` + +Or start it by docker: + +```bash +docker run --gpus all \ + -p 30000:30000 \ + -v ~/.cache/modelscope:/root/.cache/modelscope \ + --env "SGLANG_USE_MODELSCOPE=true" \ + --ipc=host \ + lmsysorg/sglang:latest \ + python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --host 0.0.0.0 --port 30000 +``` + +Note that modelscope uses a different cache directory than huggingface. You may need to set it manually to avoid running out of disk space. 
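To make the cache-directory note above concrete, here is a minimal Python sketch that routes ModelScope downloads to a larger disk before launching the server with the same command shown in the new docs/references/modelscope.md. The `MODELSCOPE_CACHE` variable and the `/data/modelscope` path are assumptions for illustration and may differ in your environment.

```python
import os
import subprocess

# SGLANG_USE_MODELSCOPE is taken from the docs above; MODELSCOPE_CACHE and the
# /data/modelscope path are assumptions about where ModelScope should store
# downloads. Adjust them for your own setup.
env = os.environ.copy()
env["SGLANG_USE_MODELSCOPE"] = "true"
env["MODELSCOPE_CACHE"] = "/data/modelscope"  # hypothetical directory with enough free space

# Same launch command as in docs/references/modelscope.md, run as a subprocess.
subprocess.run(
    [
        "python3",
        "-m",
        "sglang.launch_server",
        "--model-path",
        "qwen/Qwen2-7B-Instruct",
        "--port",
        "30000",
    ],
    env=env,
    check=True,
)
```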
From 2e6346fc2ef9adecda3b71d7415f4d023dc22aff Mon Sep 17 00:00:00 2001 From: Chayenne Date: Wed, 8 Jan 2025 01:07:54 -0800 Subject: [PATCH 005/248] =?UTF-8?q?Docs=EF=BC=9AUpdate=20the=20style=20of?= =?UTF-8?q?=20llma=203.1=20405B=20docs=20(#2789)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/backend/server_arguments.md | 1 - docs/index.rst | 4 ++-- docs/references/llama_405B.md | 17 ++++++++++------- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/docs/backend/server_arguments.md b/docs/backend/server_arguments.md index fcee7f88d52a..90b36a0bdd91 100644 --- a/docs/backend/server_arguments.md +++ b/docs/backend/server_arguments.md @@ -31,4 +31,3 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct # Node 1 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 1 ``` - diff --git a/docs/index.rst b/docs/index.rst index 4141161894b4..ff104808ca99 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -56,9 +56,9 @@ The core features include: references/hyperparameter_tuning.md references/benchmark_and_profiling.md references/custom_chat_template.md + references/llama_405B.md + references/modelscope.md references/contribution_guide.md references/troubleshooting.md references/faq.md references/learn_more.md - references/llama_405B.md - references/modelscope.md diff --git a/docs/references/llama_405B.md b/docs/references/llama_405B.md index 3383d1625c86..4f70e89f6d9a 100644 --- a/docs/references/llama_405B.md +++ b/docs/references/llama_405B.md @@ -1,16 +1,19 @@ -# Example: Run Llama 3.1 405B +# Run Llama 3.1 405B + +## Run 405B (fp8) on a Single Node ```bash -# Run 405B (fp8) on a single node python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8 ``` +## Run 405B (fp16) on Two Nodes + ```bash -# Run 405B (fp16) on two nodes -## on the first node, replace the `172.16.4.52:20000` with your own first node ip address and port +# on the first node, replace 172.16.4.52:20000 with your own node ip address and port + python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 -## on the first node, replace the `172.16.4.52:20000` with your own first node ip address and port -python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 -``` +# on the second node, replace 172.18.45.52:20000 with your own node ip address and port +python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.18.45.52:20000 --nnodes 2 --node-rank 1 +``` From b5fb4ef58a6bbe6c105d533b69e8e8bc2bf4fc3c Mon Sep 17 00:00:00 2001 From: Ke Bao Date: Wed, 8 Jan 2025 18:04:30 +0800 Subject: [PATCH 006/248] Update modelopt config and fix running issue (#2792) --- python/sglang/srt/layers/quantization/__init__.py | 2 +- python/sglang/srt/layers/{ => quantization}/modelopt_quant.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) rename python/sglang/srt/layers/{ => quantization}/modelopt_quant.py (99%) diff --git a/python/sglang/srt/layers/quantization/__init__.py b/python/sglang/srt/layers/quantization/__init__.py index df20a7a4ba47..35b0c4d94edb 100644 --- a/python/sglang/srt/layers/quantization/__init__.py +++ b/python/sglang/srt/layers/quantization/__init__.py @@ -17,12 +17,12 
@@ from vllm.model_executor.layers.quantization.gptq_marlin import GPTQMarlinConfig from vllm.model_executor.layers.quantization.gptq_marlin_24 import GPTQMarlin24Config from vllm.model_executor.layers.quantization.marlin import MarlinConfig -from vllm.model_executor.layers.quantization.modelopt import ModelOptFp8Config from vllm.model_executor.layers.quantization.qqq import QQQConfig from vllm.model_executor.layers.quantization.tpu_int8 import Int8TpuConfig from sglang.srt.layers.quantization.base_config import QuantizationConfig from sglang.srt.layers.quantization.fp8 import Fp8Config +from sglang.srt.layers.quantization.modelopt_quant import ModelOptFp8Config QUANTIZATION_METHODS: Dict[str, Type[QuantizationConfig]] = { "aqlm": AQLMConfig, diff --git a/python/sglang/srt/layers/modelopt_quant.py b/python/sglang/srt/layers/quantization/modelopt_quant.py similarity index 99% rename from python/sglang/srt/layers/modelopt_quant.py rename to python/sglang/srt/layers/quantization/modelopt_quant.py index 2c0887df2391..8ce9d20d1911 100644 --- a/python/sglang/srt/layers/modelopt_quant.py +++ b/python/sglang/srt/layers/quantization/modelopt_quant.py @@ -142,6 +142,7 @@ def create_weights( data=torch.full( (len(output_partition_sizes),), torch.finfo(torch.float32).min, + dtype=torch.float32, ), weight_loader=weight_loader, ), From 656aed58c6622bb03887ce9d2a7f34ba18eaaff3 Mon Sep 17 00:00:00 2001 From: Yunmeng Date: Thu, 9 Jan 2025 17:51:56 +0800 Subject: [PATCH 007/248] Remove vllm dependency in model config (#2809) --- python/sglang/srt/configs/__init__.py | 4 + python/sglang/srt/configs/chatglm.py | 78 ++++++ python/sglang/srt/configs/dbrx.py | 279 +++++++++++++++++++++ python/sglang/srt/hf_transformers_utils.py | 23 +- python/sglang/srt/models/chatglm.py | 2 +- python/sglang/srt/models/dbrx.py | 2 +- 6 files changed, 372 insertions(+), 16 deletions(-) create mode 100644 python/sglang/srt/configs/chatglm.py create mode 100644 python/sglang/srt/configs/dbrx.py diff --git a/python/sglang/srt/configs/__init__.py b/python/sglang/srt/configs/__init__.py index 600b58e49377..3d81c5d4fd50 100644 --- a/python/sglang/srt/configs/__init__.py +++ b/python/sglang/srt/configs/__init__.py @@ -1,3 +1,5 @@ +from sglang.srt.configs.chatglm import ChatGLMConfig +from sglang.srt.configs.dbrx import DbrxConfig from sglang.srt.configs.exaone import ExaoneConfig from sglang.srt.configs.qwen2vl import Qwen2VLConfig, Qwen2VLVisionConfig @@ -5,4 +7,6 @@ "ExaoneConfig", "Qwen2VLConfig", "Qwen2VLVisionConfig", + "ChatGLMConfig", + "DbrxConfig", ] diff --git a/python/sglang/srt/configs/chatglm.py b/python/sglang/srt/configs/chatglm.py new file mode 100644 index 000000000000..9370c218aab8 --- /dev/null +++ b/python/sglang/srt/configs/chatglm.py @@ -0,0 +1,78 @@ +# Adapted from +# https://github.com/THUDM/ChatGLM2-6B +# https://github.com/vllm-project/vllm/blob/main/vllm/transformers_utils/configs/chatglm.py + +# ChatGLM2 and ChatGLM3 share the same config. 
+# ChatGLM4 is officially supported by Huggingface +# transformers >= 4.46.0 is required +# https://huggingface.co/docs/transformers/en/model_doc/glm +from transformers import PretrainedConfig + + +class ChatGLMConfig(PretrainedConfig): + model_type = "chatglm" + attribute_map = { + "num_hidden_layers": "num_layers", + "n_head_kv": "multi_query_group_num", + } + + def __init__( + self, + num_layers=28, + padded_vocab_size=65024, + hidden_size=4096, + ffn_hidden_size=13696, + kv_channels=128, + num_attention_heads=32, + seq_length=2048, + hidden_dropout=0.0, + attention_dropout=0.0, + layernorm_epsilon=1e-5, + rmsnorm=True, + apply_residual_connection_post_layernorm=False, + post_layer_norm=True, + add_bias_linear=False, + add_qkv_bias=False, + interleaved_qkv=False, + bias_dropout_fusion=True, + multi_query_attention=False, + multi_query_group_num=1, + apply_query_key_layer_scaling=True, + attention_softmax_in_fp32=True, + fp32_residual_connection=False, + quantization_bit=0, + pre_seq_len=None, + prefix_projection=False, + **kwargs + ): + self.num_layers = num_layers + self.vocab_size = padded_vocab_size + self.padded_vocab_size = padded_vocab_size + self.hidden_size = hidden_size + self.ffn_hidden_size = ffn_hidden_size + self.kv_channels = kv_channels + self.num_attention_heads = num_attention_heads + self.seq_length = seq_length + # It is to be compatible with long lora. + self.max_position_embeddings = seq_length + self.hidden_dropout = hidden_dropout + self.attention_dropout = attention_dropout + self.layernorm_epsilon = layernorm_epsilon + self.rmsnorm = rmsnorm + self.apply_residual_connection_post_layernorm = ( + apply_residual_connection_post_layernorm + ) + self.post_layer_norm = post_layer_norm + self.add_bias_linear = add_bias_linear + self.add_qkv_bias = add_qkv_bias + self.bias_dropout_fusion = bias_dropout_fusion + self.multi_query_attention = multi_query_attention + self.multi_query_group_num = multi_query_group_num + self.apply_query_key_layer_scaling = apply_query_key_layer_scaling + self.attention_softmax_in_fp32 = attention_softmax_in_fp32 + self.fp32_residual_connection = fp32_residual_connection + self.quantization_bit = quantization_bit + self.pre_seq_len = pre_seq_len + self.prefix_projection = prefix_projection + self.interleaved_qkv = interleaved_qkv + super().__init__(**kwargs) diff --git a/python/sglang/srt/configs/dbrx.py b/python/sglang/srt/configs/dbrx.py new file mode 100644 index 000000000000..75ccbde944ea --- /dev/null +++ b/python/sglang/srt/configs/dbrx.py @@ -0,0 +1,279 @@ +# Adapted from +# https://huggingface.co/databricks/dbrx-base/blob/main/configuration_dbrx.py +# https://github.com/vllm-project/vllm/blob/main/vllm/transformers_utils/configs/dbrx.py +"""Dbrx configuration.""" + +from typing import Any, Optional + +from transformers.configuration_utils import PretrainedConfig +from transformers.utils import logging + +logger = logging.get_logger(__name__) + +DBRX_PRETRAINED_CONFIG_ARCHIVE_MAP = {} # type: ignore + + +class DbrxAttentionConfig(PretrainedConfig): + """Configuration class for Dbrx Attention. + + [`DbrxAttention`] class. It is used to instantiate attention layers + according to the specified arguments, defining the layers architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + attn_pdrop (`float`, *optional*, defaults to 0.0): + The dropout probability for the attention layers. 
+ clip_qkv (`float`, *optional*, defaults to None): + If not `None`, clip the queries, keys, and values in the attention layer to this value. + kv_n_heads (Optional[int]): For grouped_query_attention only, allow user to specify number of kv heads. + rope_theta (float): The base frequency for rope. + """ + + def __init__( + self, + attn_pdrop: float = 0, + clip_qkv: Optional[float] = None, + kv_n_heads: int = 1, + rope_theta: float = 10000.0, + **kwargs: Any, + ): + super().__init__(**kwargs) + self.attn_pdrop = attn_pdrop + self.clip_qkv = clip_qkv + self.kv_n_heads = kv_n_heads + self.rope_theta = rope_theta + + for k in ["model_type"]: + if k in kwargs: + kwargs.pop(k) + if len(kwargs) != 0: + raise ValueError(f"Found unknown {kwargs=}") + + @classmethod + def from_pretrained( + cls, pretrained_model_name_or_path: str, **kwargs: Any + ) -> "PretrainedConfig": + cls._set_token_in_kwargs(kwargs) + + config_dict, kwargs = cls.get_config_dict( + pretrained_model_name_or_path, **kwargs + ) + + if config_dict.get("model_type") == "dbrx": + config_dict = config_dict["attn_config"] + + if ( + "model_type" in config_dict + and hasattr(cls, "model_type") + and config_dict["model_type"] != cls.model_type + ): + logger.warning( + "You are using a model of type %s to instantiate a model of " + "type %s. This is not supported for all configurations of " + "models and can yield errors.", + config_dict["model_type"], + cls.model_type, + ) + + return cls.from_dict(config_dict, **kwargs) + + +class DbrxFFNConfig(PretrainedConfig): + """Configuration class for Dbrx FFN. + + [`DbrxFFN`] class. It is used to instantiate feedforward layers according to + the specified arguments, defining the layers architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + ffn_act_fn (dict, optional): A dict specifying activation function for the FFN. + The dict should have a key 'name' with the value being the name of + the activation function along with any additional keyword arguments. + ffn_hidden_size (int, optional): The hidden size of the feedforward network. + moe_num_experts (int, optional): The number of experts in the mixture of experts layer. + moe_top_k (int, optional): The number of experts to use in the mixture of experts layer. + moe_jitter_eps (float, optional): The jitter epsilon for the mixture of experts layer. + moe_loss_weight (float, optional): The loss weight for the mixture of experts layer. + moe_normalize_expert_weights (float, optional): The normalization factor for the expert weights. + uniform_expert_assignment (bool, optional): Whether to use uniform expert assignment. + This should only be used for benchmarking purposes. 
+ """ + + def __init__( + self, + ffn_act_fn: Optional[dict] = None, + ffn_hidden_size: int = 3584, + moe_num_experts: int = 4, + moe_top_k: int = 1, + moe_jitter_eps: Optional[float] = None, + moe_loss_weight: float = 0.01, + moe_normalize_expert_weights: Optional[float] = 1, + uniform_expert_assignment: bool = False, + **kwargs: Any, + ): + super().__init__() + if ffn_act_fn is None: + ffn_act_fn = {"name": "silu"} + self.ffn_act_fn = ffn_act_fn + self.ffn_hidden_size = ffn_hidden_size + self.moe_num_experts = moe_num_experts + self.moe_top_k = moe_top_k + self.moe_jitter_eps = moe_jitter_eps + self.moe_loss_weight = moe_loss_weight + self.moe_normalize_expert_weights = moe_normalize_expert_weights + self.uniform_expert_assignment = uniform_expert_assignment + + for k in ["model_type"]: + if k in kwargs: + kwargs.pop(k) + if len(kwargs) != 0: + raise ValueError(f"Found unknown {kwargs=}") + + @classmethod + def from_pretrained( + cls, pretrained_model_name_or_path: str, **kwargs: Any + ) -> "PretrainedConfig": + cls._set_token_in_kwargs(kwargs) + + config_dict, kwargs = cls.get_config_dict( + pretrained_model_name_or_path, **kwargs + ) + + if config_dict.get("model_type") == "dbrx": + config_dict = config_dict["ffn_config"] + + if ( + "model_type" in config_dict + and hasattr(cls, "model_type") + and config_dict["model_type"] != cls.model_type + ): + logger.warning( + "You are using a model of type %s to instantiate a model of " + "type %s. This is not supported for all " + "configurations of models and can yield errors.", + config_dict["model_type"], + cls.model_type, + ) + + return cls.from_dict(config_dict, **kwargs) + + +class DbrxConfig(PretrainedConfig): + """Configuration class for Dbrx. + + [`DbrxModel`]. It is used to instantiate a Dbrx model according to the + specified arguments, defining the model architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + d_model (`int`, *optional*, defaults to 6144): + Dimensionality of the embeddings and hidden states. + n_heads (`int`, *optional*, defaults to 48): + Number of attention heads for each attention layer in the Transformer encoder. + n_layers (`int`, *optional*, defaults to 40): + Number of hidden layers in the Transformer encoder. + max_seq_len (`int`, *optional*, defaults to 32768): + The maximum sequence length of the model. + vocab_size (`int`, *optional*, defaults to 100352): + Vocabulary size of the Dbrx model. Defines the maximum number of different tokens that can be represented by + the `inputs_ids` passed when calling [`DbrxModel`]. + resid_pdrop (`float`, *optional*, defaults to 0.0): + The dropout probability applied to the attention output before combining with residual. + emb_pdrop (`float`, *optional*, defaults to 0.0): + The dropout probability for the embedding layer. + attn_config (`dict`, *optional*): + A dictionary used to configure the model's attention module. + ffn_config (`dict`, *optional*): + A dictionary used to configure the model's FFN module. + use_cache (`bool`, *optional*, defaults to `False`): + Whether or not the model should return the last key/values attentions (not used by all models). + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 
+ output_router_logits (`bool`, *optional*, defaults to `False`): + Whether or not the router logits should be returned by the model. Enabling this will also + allow the model to output the auxiliary loss. See [here]() for more details + router_aux_loss_coef (`float`, *optional*, defaults to 0.001): + The aux loss factor for the total loss. + + + Example: + ```python + >>> from transformers import DbrxConfig, DbrxModel + + >>> # Initializing a Dbrx configuration + >>> configuration = DbrxConfig() + + >>> # Initializing a model (with random weights) from the configuration + >>> model = DbrxModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ``` + """ + + model_type = "dbrx" + attribute_map = { + "num_attention_heads": "n_heads", + "hidden_size": "d_model", + "num_hidden_layers": "n_layers", + "max_position_embeddings": "max_seq_len", + } + + def __init__( + self, + d_model: int = 2048, + n_heads: int = 16, + n_layers: int = 24, + max_seq_len: int = 2048, + vocab_size: int = 32000, + resid_pdrop: float = 0.0, + emb_pdrop: float = 0.0, + attn_config: Optional[DbrxAttentionConfig] = None, + ffn_config: Optional[DbrxFFNConfig] = None, + use_cache: bool = True, + initializer_range: float = 0.02, + output_router_logits: bool = False, + router_aux_loss_coef: float = 0.05, + **kwargs: Any, + ): + if attn_config is None: + self.attn_config = DbrxAttentionConfig() + elif isinstance(attn_config, dict): + self.attn_config = DbrxAttentionConfig(**attn_config) + else: + self.attn_config = attn_config + + if ffn_config is None: + self.ffn_config = DbrxFFNConfig() + elif isinstance(ffn_config, dict): + self.ffn_config = DbrxFFNConfig(**ffn_config) + else: + self.ffn_config = ffn_config + + self.d_model = d_model + self.n_heads = n_heads + self.n_layers = n_layers + self.max_seq_len = max_seq_len + self.vocab_size = vocab_size + self.resid_pdrop = resid_pdrop + self.emb_pdrop = emb_pdrop + self.use_cache = use_cache + self.initializer_range = initializer_range + self.output_router_logits = output_router_logits + self.router_aux_loss_coef = router_aux_loss_coef + + tie_word_embeddings = kwargs.pop("tie_word_embeddings", False) + if tie_word_embeddings: + raise ValueError("tie_word_embeddings is not supported for Dbrx models.") + + super().__init__( + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) diff --git a/python/sglang/srt/hf_transformers_utils.py b/python/sglang/srt/hf_transformers_utils.py index 92b01d4524f8..ea39d73f2eea 100644 --- a/python/sglang/srt/hf_transformers_utils.py +++ b/python/sglang/srt/hf_transformers_utils.py @@ -30,20 +30,15 @@ ) from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES -try: - from vllm.transformers_utils.configs import ChatGLMConfig, DbrxConfig - - from sglang.srt.configs import ExaoneConfig, Qwen2VLConfig - - _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = { - ChatGLMConfig.model_type: ChatGLMConfig, - DbrxConfig.model_type: DbrxConfig, - ExaoneConfig.model_type: ExaoneConfig, - Qwen2VLConfig.model_type: Qwen2VLConfig, - } -except ImportError: - # We want this file to run without vllm dependency - _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {} +from sglang.srt.configs import ChatGLMConfig, DbrxConfig, ExaoneConfig, Qwen2VLConfig + +_CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = { + ChatGLMConfig.model_type: ChatGLMConfig, + DbrxConfig.model_type: DbrxConfig, + ExaoneConfig.model_type: ExaoneConfig, + Qwen2VLConfig.model_type: Qwen2VLConfig, +} + for name, 
cls in _CONFIG_REGISTRY.items(): with contextlib.suppress(ValueError): diff --git a/python/sglang/srt/models/chatglm.py b/python/sglang/srt/models/chatglm.py index 9c3bc2ee9e0a..b69a9e11639a 100644 --- a/python/sglang/srt/models/chatglm.py +++ b/python/sglang/srt/models/chatglm.py @@ -23,8 +23,8 @@ from torch.nn import LayerNorm from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.transformers_utils.configs import ChatGLMConfig +from sglang.srt.configs import ChatGLMConfig from sglang.srt.layers.activation import SiluAndMul from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.linear import ( diff --git a/python/sglang/srt/models/dbrx.py b/python/sglang/srt/models/dbrx.py index 852f58a710d6..f838cfa575bb 100644 --- a/python/sglang/srt/models/dbrx.py +++ b/python/sglang/srt/models/dbrx.py @@ -25,8 +25,8 @@ tensor_model_parallel_all_reduce, ) from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.transformers_utils.configs.dbrx import DbrxConfig +from sglang.srt.configs import DbrxConfig from sglang.srt.layers.linear import ( QKVParallelLinear, ReplicatedLinear, From 679c3bcacfd19eb852e8dbf42ad6b756eec56df4 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Thu, 9 Jan 2025 03:03:24 -0800 Subject: [PATCH 008/248] Fix typo in cuda_graph_bs (#2813) --- python/sglang/srt/model_executor/cuda_graph_runner.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/python/sglang/srt/model_executor/cuda_graph_runner.py b/python/sglang/srt/model_executor/cuda_graph_runner.py index deaea33129d1..e4580b5e2ba8 100644 --- a/python/sglang/srt/model_executor/cuda_graph_runner.py +++ b/python/sglang/srt/model_executor/cuda_graph_runner.py @@ -131,11 +131,6 @@ def __init__(self, model_runner: "ModelRunner"): else: self.capture_bs = [1, 2, 4] + [i * 8 for i in range(1, 21)] - if model_runner.server_args.disable_cuda_graph_padding: - self.capture_bs = list(range(1, 33)) + [64, 128] - else: - self.capture_bs = [1, 2, 4] + [i * 8 for i in range(1, 21)] - if max(self.capture_bs) > model_runner.req_to_token_pool.size: # In some case (e.g., with a small GPU or --max-running-requests), the #max-running-requests # is very samll. We add more values here to make sure we capture the maximum bs. 
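To make the batch-size selection that the patch above de-duplicates easier to follow, here is a small, self-contained Python sketch. The two candidate lists are copied from the diff context; the `capture_batch_sizes` helper name and the final clamp to `req_pool_size` are illustrative assumptions rather than the exact upstream behavior, which instead extends the list so the maximum batch size is still captured.

```python
# Sketch of the CUDA-graph capture batch sizes, mirroring the lists kept in
# cuda_graph_runner.py after the duplicate assignment was removed.
def capture_batch_sizes(disable_cuda_graph_padding: bool, req_pool_size: int) -> list:
    if disable_cuda_graph_padding:
        capture_bs = list(range(1, 33)) + [64, 128]
    else:
        capture_bs = [1, 2, 4] + [i * 8 for i in range(1, 21)]
    # Illustrative simplification: drop sizes the request-to-token pool cannot hold.
    return [bs for bs in capture_bs if bs <= req_pool_size]


if __name__ == "__main__":
    print(capture_batch_sizes(False, req_pool_size=48))  # [1, 2, 4, 8, 16, 24, 32, 40, 48]
    print(capture_batch_sizes(True, req_pool_size=48))   # 1 through 32; 64 and 128 dropped
```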
From 4f077c01b8cca17993df1c2c77285dce176742c3 Mon Sep 17 00:00:00 2001 From: sleepcoo <118525388+sleepcoo@users.noreply.github.com> Date: Thu, 9 Jan 2025 22:24:42 +0800 Subject: [PATCH 009/248] minor: support specifying local dataset path for gsm8k and hellaswag (#2816) --- benchmark/gsm8k/bench_sglang.py | 7 +++++-- benchmark/hellaswag/bench_sglang.py | 7 +++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/benchmark/gsm8k/bench_sglang.py b/benchmark/gsm8k/bench_sglang.py index 9fe9b79baaf8..f01734f0afb0 100644 --- a/benchmark/gsm8k/bench_sglang.py +++ b/benchmark/gsm8k/bench_sglang.py @@ -1,6 +1,7 @@ import argparse import ast import json +import os import re import time @@ -46,9 +47,11 @@ def main(args): set_default_backend(select_sglang_backend(args)) # Read data + data_path = args.data_path url = "https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl" - filename = download_and_cache_file(url) - lines = list(read_jsonl(filename)) + if not os.path.isfile(data_path): + data_path = download_and_cache_file(url) + lines = list(read_jsonl(data_path)) # Construct prompts num_questions = args.num_questions diff --git a/benchmark/hellaswag/bench_sglang.py b/benchmark/hellaswag/bench_sglang.py index f09d7256da93..798521f9766d 100644 --- a/benchmark/hellaswag/bench_sglang.py +++ b/benchmark/hellaswag/bench_sglang.py @@ -1,5 +1,6 @@ import argparse import json +import os import time import numpy as np @@ -31,9 +32,11 @@ def main(args): set_default_backend(select_sglang_backend(args)) # Read data + data_path = args.data_path url = "https://raw.githubusercontent.com/rowanz/hellaswag/master/data/hellaswag_val.jsonl" - filename = download_and_cache_file(url) - lines = list(read_jsonl(filename)) + if not os.path.isfile(data_path): + data_path = download_and_cache_file(url) + lines = list(read_jsonl(data_path)) # Construct prompts num_questions = args.num_questions From 11fffbc95a919a2446ae10fc33753d9951374fdf Mon Sep 17 00:00:00 2001 From: Xiaotong Jiang Date: Thu, 9 Jan 2025 13:43:12 -0800 Subject: [PATCH 010/248] [Doc]: Deepseek reference docs (#2787) --- docs/index.rst | 1 + docs/references/deepseek.md | 34 ++++++++++++++++++++++++++++++++++ docs/references/modelscope.md | 4 ++-- 3 files changed, 37 insertions(+), 2 deletions(-) create mode 100644 docs/references/deepseek.md diff --git a/docs/index.rst b/docs/index.rst index ff104808ca99..6ed313a3bd17 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -62,3 +62,4 @@ The core features include: references/troubleshooting.md references/faq.md references/learn_more.md + references/deepseek.md diff --git a/docs/references/deepseek.md b/docs/references/deepseek.md new file mode 100644 index 000000000000..6cf155f46211 --- /dev/null +++ b/docs/references/deepseek.md @@ -0,0 +1,34 @@ +# DeepSeek Model Optimizations in SGLang + +SGLang provides several optimizations specifically designed for the DeepSeek model to boost its inference speed. This document outlines current optimizations for DeepSeek. Additionally, the SGLang team is actively developing enhancements for [DeepSeek-V3](https://github.com/sgl-project/sglang/issues/2591). + + +## Multi-head Latent Attention (MLA) Throughput Optimizations + +**Description**: [MLA](https://arxiv.org/pdf/2405.04434) is an innovative attention mechanism introduced by the DeepSeek team, aimed at improving inference efficiency. 
SGLang has implemented specific optimizations for this, including: + +- **Weight Absorption**: By applying the associative law of matrix multiplication to reorder computation steps, this method balances computation and memory access and improves efficiency in the decoding phase. +- **Triton Decoding Kernel Optimization**: In the MLA decoding kernel, there is only one KV head. This optimization reduces memory access to the KV cache by processing multiple query heads within one block, accelerating the decoding process. +- **FP8 Quantization**: W8A8 FP8 and KV Cache FP8 quantization enables efficient FP8 inference. Additionally, we have implemented Batched Matrix Multiplication (BMM) operator to facilitate FP8 inference in MLA with weight absorption. +- **CUDA Graph & Torch.compile**: Both MLA and Mixture of Experts (MoE) are compatible with CUDA Graph and Torch.compile, which reduces latency and accelerates decoding speed for small batch sizes. + +Overall, with these optimizations, we have achieved up to a 7x acceleration in output throughput compared to the previous version. +![Data Parallelism Attention for DeepSeek Series Models](https://lmsys.org/images/blog/sglang_v0_3/deepseek_mla.svg) + +**Usage**: MLA optimization is enabled by defalut, to disable, use `--disable-mla`. + +**Reference**: Check [Blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/#deepseek-multi-head-latent-attention-mla-throughput-optimizations) and [Slides](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/lmsys_1st_meetup_deepseek_mla.pdf) for more details. + +## Data Parallelism Attention + +**Description**: This optimization involves data parallelism (DP) for the MLA attention mechanism of DeepSeek Series Models, which allows for a significant reduction in the KV cache size, enabling larger batch sizes. Each DP worker independently handles different types of batches (prefill, decode, idle), which are then synchronized before and after processing through the Mixture-of-Experts (MoE) layer. +![Data Parallelism Attention for DeepSeek Series Models](https://lmsys.org/images/blog/sglang_v0_4/dp_attention.svg). + +**Usage**: This optimization is aimed at improving throughput and should be used for scenarios with high QPS (Queries Per Second). Data Parallelism Attention optimization can be enabeld by `--enable-dp-attention` for DeepSeek Series Models. + +**Reference**: Check [Blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/#data-parallelism-attention-for-deepseek-models). + +## Multi Node Tensor Parallelism +**Description**: For users with limited memory on a single node, SGLang supports serving DeepSeek Series Models, including DeepSeek V3, across multiple nodes using tensor parallelism. This approach partitions the model parameters across multiple GPUs or nodes to handle models that are too large for one node's memory. + +**Usage**: Check [here](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3#example-serving-with-2-h208) for usage examples. diff --git a/docs/references/modelscope.md b/docs/references/modelscope.md index ad7b6151b435..4740c2770f9e 100644 --- a/docs/references/modelscope.md +++ b/docs/references/modelscope.md @@ -6,9 +6,9 @@ To use a model from [ModelScope](https://www.modelscope.cn), set the environment export SGLANG_USE_MODELSCOPE=true ``` -We take [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instruct) as an example. 
Launch the Server: ---- +We take [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instruct) as an example. +Launch the Server: ```bash python -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --port 30000 ``` From 5cc1170552bfe1f32d070e802331d1b4b7f699cf Mon Sep 17 00:00:00 2001 From: Chayenne Date: Fri, 10 Jan 2025 00:26:59 -0800 Subject: [PATCH 011/248] Doc: add block-wise FP8 in dpsk model reference (#2830) --- docs/references/deepseek.md | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/docs/references/deepseek.md b/docs/references/deepseek.md index 6cf155f46211..5a95fd9a9418 100644 --- a/docs/references/deepseek.md +++ b/docs/references/deepseek.md @@ -9,11 +9,14 @@ SGLang provides several optimizations specifically designed for the DeepSeek mod - **Weight Absorption**: By applying the associative law of matrix multiplication to reorder computation steps, this method balances computation and memory access and improves efficiency in the decoding phase. - **Triton Decoding Kernel Optimization**: In the MLA decoding kernel, there is only one KV head. This optimization reduces memory access to the KV cache by processing multiple query heads within one block, accelerating the decoding process. + - **FP8 Quantization**: W8A8 FP8 and KV Cache FP8 quantization enables efficient FP8 inference. Additionally, we have implemented Batched Matrix Multiplication (BMM) operator to facilitate FP8 inference in MLA with weight absorption. + - **CUDA Graph & Torch.compile**: Both MLA and Mixture of Experts (MoE) are compatible with CUDA Graph and Torch.compile, which reduces latency and accelerates decoding speed for small batch sizes. Overall, with these optimizations, we have achieved up to a 7x acceleration in output throughput compared to the previous version. -![Data Parallelism Attention for DeepSeek Series Models](https://lmsys.org/images/blog/sglang_v0_3/deepseek_mla.svg) + +![Multi-head Latent Attention for DeepSeek Series Models](https://lmsys.org/images/blog/sglang_v0_3/deepseek_mla.svg) **Usage**: MLA optimization is enabled by defalut, to disable, use `--disable-mla`. @@ -22,6 +25,7 @@ Overall, with these optimizations, we have achieved up to a 7x acceleration in o ## Data Parallelism Attention **Description**: This optimization involves data parallelism (DP) for the MLA attention mechanism of DeepSeek Series Models, which allows for a significant reduction in the KV cache size, enabling larger batch sizes. Each DP worker independently handles different types of batches (prefill, decode, idle), which are then synchronized before and after processing through the Mixture-of-Experts (MoE) layer. + ![Data Parallelism Attention for DeepSeek Series Models](https://lmsys.org/images/blog/sglang_v0_4/dp_attention.svg). **Usage**: This optimization is aimed at improving throughput and should be used for scenarios with high QPS (Queries Per Second). Data Parallelism Attention optimization can be enabeld by `--enable-dp-attention` for DeepSeek Series Models. @@ -29,6 +33,16 @@ Overall, with these optimizations, we have achieved up to a 7x acceleration in o **Reference**: Check [Blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/#data-parallelism-attention-for-deepseek-models). ## Multi Node Tensor Parallelism + **Description**: For users with limited memory on a single node, SGLang supports serving DeepSeek Series Models, including DeepSeek V3, across multiple nodes using tensor parallelism. 
This approach partitions the model parameters across multiple GPUs or nodes to handle models that are too large for one node's memory. **Usage**: Check [here](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3#example-serving-with-2-h208) for usage examples. + +## Block-wise FP8 + +**Description**: SGLang implements block-wise FP8 quantization with two key optimizations: + +- **Activation**: E4M3 format using per-token-per-128-channel sub-vector scales with online casting. +- **Weight**: Per-128x128-block quantization for better numerical stability. + +**Usage**: turn on by default for DeepSeek V3 models. From 2db03a04ca39dd85a5e419a28803bd483528fcc1 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Fri, 10 Jan 2025 03:49:04 -0800 Subject: [PATCH 012/248] Update README.md (#2833) Co-authored-by: Heiner --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index 97ad1e935c68..024fa2761270 100644 --- a/README.md +++ b/README.md @@ -61,5 +61,4 @@ Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s The project is supported by (alphabetically): AMD, Baseten, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, LMSYS.org, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, 01.AI. ## Acknowledgment and Citation -We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql). -Please cite the paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful. +We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql). Please cite the paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful. From 8f157893141ea24ebb581c9e48c27a8eeb9b81fb Mon Sep 17 00:00:00 2001 From: Pratyush Patel Date: Fri, 10 Jan 2025 07:30:44 -0800 Subject: [PATCH 013/248] Add more metrics to serving benchmark. 
(#2819) --- python/sglang/bench_serving.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/python/sglang/bench_serving.py b/python/sglang/bench_serving.py index 4744ad3386ba..941507705e36 100644 --- a/python/sglang/bench_serving.py +++ b/python/sglang/bench_serving.py @@ -514,6 +514,8 @@ class BenchmarkMetrics: p99_itl_ms: float mean_e2e_latency_ms: float median_e2e_latency_ms: float + std_e2e_latency_ms: float + p99_e2e_latency_ms: float SHAREGPT_URL = "https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json" @@ -873,6 +875,8 @@ def calculate_metrics( p99_itl_ms=np.percentile(itls or 0, 99) * 1000, mean_e2e_latency_ms=np.mean(e2e_latencies) * 1000, median_e2e_latency_ms=np.median(e2e_latencies) * 1000, + std_e2e_latency_ms=np.std(e2e_latencies) * 1000, + p99_e2e_latency_ms=np.percentile(e2e_latencies, 99) * 1000, ) return metrics, output_lens @@ -1064,10 +1068,20 @@ async def limited_request_func(request_func_input, pbar): "total_output_tokens_retokenized": metrics.total_output_retokenized, "mean_e2e_latency_ms": metrics.mean_e2e_latency_ms, "median_e2e_latency_ms": metrics.median_e2e_latency_ms, + "std_e2e_latency_ms": metrics.std_e2e_latency_ms, + "p99_e2e_latency_ms": metrics.p99_e2e_latency_ms, "mean_ttft_ms": metrics.mean_ttft_ms, "median_ttft_ms": metrics.median_ttft_ms, + "std_ttft_ms": metrics.std_ttft_ms, + "p99_ttft_ms": metrics.p99_ttft_ms, + "mean_tpot_ms": metrics.mean_tpot_ms, + "median_tpot_ms": metrics.median_tpot_ms, + "std_tpot_ms": metrics.std_tpot_ms, + "p99_tpot_ms": metrics.p99_tpot_ms, "mean_itl_ms": metrics.mean_itl_ms, "median_itl_ms": metrics.median_itl_ms, + "std_itl_ms": metrics.std_itl_ms, + "p99_itl_ms": metrics.p99_itl_ms, "input_throughput": metrics.input_throughput, "output_throughput": metrics.output_throughput, "sharegpt_output_len": args.sharegpt_output_len, From f290bd4332ce4ff4be97d59e82daa013f99c66ca Mon Sep 17 00:00:00 2001 From: Chang Su Date: Fri, 10 Jan 2025 13:14:51 -0800 Subject: [PATCH 014/248] [Bugfix] Fix embedding model hangs with `--enable-metrics` (#2822) --- python/sglang/srt/configs/model_config.py | 2 +- .../sglang/srt/managers/tokenizer_manager.py | 8 +++- .../sglang/srt/model_executor/model_runner.py | 2 +- test/srt/test_openai_server.py | 41 +++++++++++++++++++ 4 files changed, 49 insertions(+), 4 deletions(-) diff --git a/python/sglang/srt/configs/model_config.py b/python/sglang/srt/configs/model_config.py index a2f9b82844e8..072c88b04a78 100644 --- a/python/sglang/srt/configs/model_config.py +++ b/python/sglang/srt/configs/model_config.py @@ -128,7 +128,7 @@ def __init__( self.num_hidden_layers = self.hf_text_config.num_hidden_layers self.vocab_size = self.hf_text_config.vocab_size - # Veirfy quantization + # Verify quantization self._verify_quantization() # Cache attributes diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py index 08dbd02c5ba3..00ef8458ab82 100644 --- a/python/sglang/srt/managers/tokenizer_manager.py +++ b/python/sglang/srt/managers/tokenizer_manager.py @@ -688,7 +688,7 @@ async def handle_loop(self): if self.enable_metrics: completion_tokens = ( recv_obj.completion_tokens[i] - if recv_obj.completion_tokens + if getattr(recv_obj, "completion_tokens", None) else 0 ) @@ -716,7 +716,11 @@ async def handle_loop(self): time.time() - state.created_time ) # Compute time_per_output_token for the non-streaming case - if not state.obj.stream and completion_tokens >= 1: + if ( 
+ hasattr(state.obj, "stream") + and not state.obj.stream + and completion_tokens >= 1 + ): self.metrics_collector.observe_time_per_output_token( (time.time() - state.created_time) / completion_tokens diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index 719db19cd765..efba8c25b504 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -724,7 +724,7 @@ def forward(self, forward_batch: ForwardBatch) -> LogitsProcessorOutput: elif forward_batch.forward_mode.is_idle(): return self.forward_idle(forward_batch) else: - raise ValueError(f"Invaid forward mode: {forward_batch.forward_mode}") + raise ValueError(f"Invalid forward mode: {forward_batch.forward_mode}") def sample( self, logits_output: LogitsProcessorOutput, forward_batch: ForwardBatch diff --git a/test/srt/test_openai_server.py b/test/srt/test_openai_server.py index 379e57f356e9..4bedf7439663 100644 --- a/test/srt/test_openai_server.py +++ b/test/srt/test_openai_server.py @@ -14,6 +14,7 @@ from sglang.srt.hf_transformers_utils import get_tokenizer from sglang.srt.utils import kill_process_tree from sglang.test.test_utils import ( + DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST, DEFAULT_SMALL_MODEL_NAME_FOR_TEST, DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, DEFAULT_URL_FOR_TEST, @@ -675,5 +676,45 @@ def test_function_calling_format(self): ), "Function name should be add for the above response" +class TestOpenAIEmbedding(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.model = DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST + cls.base_url = DEFAULT_URL_FOR_TEST + cls.api_key = "sk-123456" + + # Configure embedding-specific args + other_args = ["--is-embedding", "--enable-metrics"] + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + api_key=cls.api_key, + other_args=other_args, + ) + cls.base_url += "/v1" + + @classmethod + def tearDownClass(cls): + kill_process_tree(cls.process.pid) + + def test_embedding_single(self): + """Test single embedding request""" + client = openai.Client(api_key=self.api_key, base_url=self.base_url) + response = client.embeddings.create(model=self.model, input="Hello world") + self.assertEqual(len(response.data), 1) + self.assertTrue(len(response.data[0].embedding) > 0) + + def test_embedding_batch(self): + """Test batch embedding request""" + client = openai.Client(api_key=self.api_key, base_url=self.base_url) + response = client.embeddings.create( + model=self.model, input=["Hello world", "Test text"] + ) + self.assertEqual(len(response.data), 2) + self.assertTrue(len(response.data[0].embedding) > 0) + self.assertTrue(len(response.data[1].embedding) > 0) + + if __name__ == "__main__": unittest.main() From 5413ec2bbe42de54d244e35c65bd7929b458fd22 Mon Sep 17 00:00:00 2001 From: Muqi Li <642733045@qq.com> Date: Sat, 11 Jan 2025 05:37:00 +0800 Subject: [PATCH 015/248] [Bugfix] Fix bug in fork logic caused by null text_ (#2835) --- python/sglang/lang/interpreter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/sglang/lang/interpreter.py b/python/sglang/lang/interpreter.py index 6d1ca71adab1..4c294781c20e 100644 --- a/python/sglang/lang/interpreter.py +++ b/python/sglang/lang/interpreter.py @@ -347,7 +347,7 @@ def fork( size: int = 1, position_ids_offset: Optional[List[int]] = None, ): - if size > 1: + if size > 1 and str(self.text_): self.submit(SglCommitLazy()) self.sync() From 
b170646991a06cb18b1bd4e74efcd095f5b00c18 Mon Sep 17 00:00:00 2001 From: TianYu GUO Date: Sat, 11 Jan 2025 05:44:32 +0800 Subject: [PATCH 016/248] Fix port number overflow (#2826) --- python/sglang/srt/server_args.py | 5 ++++- python/sglang/srt/utils.py | 2 ++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index ef4df60a5763..09d1a3edebc4 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -928,7 +928,10 @@ def init_new(server_args) -> "PortArgs": while True: if is_port_available(port): break - port += 42 + if port < 60000: + port += 42 + else: + port -= 43 return PortArgs( tokenizer_ipc_name=tempfile.NamedTemporaryFile(delete=False).name, diff --git a/python/sglang/srt/utils.py b/python/sglang/srt/utils.py index 44a5e41a41bd..b07f6f01d184 100644 --- a/python/sglang/srt/utils.py +++ b/python/sglang/srt/utils.py @@ -335,6 +335,8 @@ def is_port_available(port): return True except socket.error: return False + except OverflowError: + return False def decode_video_base64(video_base64): From a47bf39123c4f5bffcf96a80640f234e3f637c4c Mon Sep 17 00:00:00 2001 From: justdoit <24875266+coolhok@users.noreply.github.com> Date: Sat, 11 Jan 2025 06:00:43 +0800 Subject: [PATCH 017/248] [Eagle2] Fix multiple concurrent request crashes (#2730) --- python/sglang/srt/speculative/eagle_utils.py | 17 ++- python/sglang/srt/speculative/eagle_worker.py | 2 + test/srt/test_eagle_infer.py | 119 ++++++++++++++++++ 3 files changed, 134 insertions(+), 4 deletions(-) diff --git a/python/sglang/srt/speculative/eagle_utils.py b/python/sglang/srt/speculative/eagle_utils.py index b804e7c6af2e..1a324000cb28 100644 --- a/python/sglang/srt/speculative/eagle_utils.py +++ b/python/sglang/srt/speculative/eagle_utils.py @@ -245,9 +245,10 @@ def prepare_for_decode(self, batch: ScheduleBatch): ) # (b, topk) topk_cs_index, topk_cs_p = topk_cs.indices, topk_cs.values - selected_input_index = ( - topk_cs_index.flatten() // self.topk - ) # shape: (b * topk) + selected_input_index = topk_cs_index.flatten() // self.topk + torch.arange( + 0, batch.batch_size() * self.topk, step=self.topk, device="cuda" + ).repeat_interleave(self.topk) + batch.spec_info.hidden_states = batch.spec_info.hidden_states[ selected_input_index, : ] @@ -336,6 +337,7 @@ def prepare_extend_after_decode(self, batch: ScheduleBatch): triton.next_power_of_2(self.spec_steps + 1), ) + batch.seq_lens_sum = sum(batch.seq_lens) batch.input_ids = self.verified_id self.verified_id = new_verified_id @@ -439,7 +441,14 @@ def generate_attn_arg_prefill( return kv_indices, cum_kv_seq_len, qo_indptr, None def merge_batch(self, spec_info: EAGLEDraftInput): - + if self.hidden_states is None: + self.hidden_states = spec_info.hidden_states + self.verified_id = spec_info.verified_id + self.sample_output = spec_info.sample_output + self.prev_mode = spec_info.prev_mode + return + if spec_info.hidden_states is None: + return self.hidden_states = torch.cat( [self.hidden_states, spec_info.hidden_states], axis=0 ) diff --git a/python/sglang/srt/speculative/eagle_worker.py b/python/sglang/srt/speculative/eagle_worker.py index 16d54c43bafb..0e53506a8840 100644 --- a/python/sglang/srt/speculative/eagle_worker.py +++ b/python/sglang/srt/speculative/eagle_worker.py @@ -169,6 +169,8 @@ def finish_request(self, reqs: Union[Req, List[Req]]): if not isinstance(reqs, List): reqs = [reqs] for req in reqs: + if req.rid not in self.finish_extend_len: + continue req_len = ( 
len(req.origin_input_ids) + len(req.output_ids) diff --git a/test/srt/test_eagle_infer.py b/test/srt/test_eagle_infer.py index 94ebc79ca743..92127b8ef591 100644 --- a/test/srt/test_eagle_infer.py +++ b/test/srt/test_eagle_infer.py @@ -1,8 +1,18 @@ +import multiprocessing +import random +import time import unittest +import requests from transformers import AutoConfig, AutoTokenizer import sglang as sgl +from sglang.srt.utils import kill_process_tree +from sglang.test.test_utils import ( + DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + DEFAULT_URL_FOR_TEST, + popen_launch_server, +) class TestEAGLEEngine(unittest.TestCase): @@ -64,5 +74,114 @@ def test_eagle_end_check(self): assert tokenizer.eos_token_id not in tokens +prompts = [ + "[INST] <>\\nYou are a helpful assistant.\\n<>\\nToday is a sunny day and I like[/INST]" + '[INST] <>\\nYou are a helpful assistant.\\n<>\\nWhat are the mental triggers in Jeff Walker\'s Product Launch Formula and "Launch" book?[/INST]', + "[INST] <>\\nYou are a helpful assistant.\\n<>\\nSummarize Russell Brunson's Perfect Webinar Script...[/INST]", + "[INST] <>\\nYou are a helpful assistant.\\n<>\\nwho are you?[/INST]", + "[INST] <>\\nYou are a helpful assistant.\\n<>\\nwhere are you from?[/INST]", +] + + +def process(server_url: str): + time.sleep(random.uniform(0, 2)) + for prompt in prompts: + url = server_url + data = { + "model": "base", + "text": prompt, + "sampling_params": { + "temperature": 0, + "max_new_tokens": 1024, + }, + } + response = requests.post(url, json=data) + assert response.status_code == 200 + + +def abort_process(server_url: str): + for prompt in prompts: + try: + time.sleep(1) + url = server_url + data = { + "model": "base", + "text": prompt, + "sampling_params": { + "temperature": 0, + "max_new_tokens": 1024, + }, + } + # set timeout = 1s,mock disconnected + requests.post(url, json=data, timeout=1) + except: + pass + + +class TestEAGLELaunchServer(unittest.TestCase): + @classmethod + def setUpClass(cls): + speculative_draft_model_path = "lmzheng/sglang-EAGLE-llama2-chat-7B" + cls.model = "meta-llama/Llama-2-7b-chat-hf" + cls.base_url = DEFAULT_URL_FOR_TEST + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=[ + "--speculative-algorithm", + "EAGLE", + "--speculative-draft-model-path", + speculative_draft_model_path, + "--speculative-num-steps", + "3", + "--speculative-eagle-topk", + "4", + "--speculative-num-draft-tokens", + "16", + "--served-model-name", + "base", + ], + ) + + @classmethod + def tearDownClass(cls): + kill_process_tree(cls.process.pid) + + def test_eagle_server_concurrency(self): + concurrency = 4 + processes = [ + multiprocessing.Process( + target=process, + kwargs={"server_url": self.base_url + "/generate"}, + ) + for _ in range(concurrency) + ] + for worker in processes: + worker.start() + for p in processes: + p.join() + + def test_eagle_server_request_abort(self): + concurrency = 4 + processes = [ + multiprocessing.Process( + target=process, + kwargs={"server_url": self.base_url + "/generate"}, + ) + for _ in range(concurrency) + ] + [ + multiprocessing.Process( + target=abort_process, + kwargs={"server_url": self.base_url + "/generate"}, + ) + for _ in range(concurrency) + ] + for worker in processes: + worker.start() + for p in processes: + p.join() + + if __name__ == "__main__": unittest.main() From 5d6e9467d4624a66ca64b0714042cb032df72695 Mon Sep 17 00:00:00 2001 From: Zhiqiang Xie Date: Fri, 10 Jan 2025 20:22:01 -0800 Subject: [PATCH 018/248] Cache 
controller for hierarchical caching (#2804) --- .../sglang/srt/managers/cache_controller.py | 307 ++++++++++++++++++ 1 file changed, 307 insertions(+) create mode 100644 python/sglang/srt/managers/cache_controller.py diff --git a/python/sglang/srt/managers/cache_controller.py b/python/sglang/srt/managers/cache_controller.py new file mode 100644 index 000000000000..4560a270870f --- /dev/null +++ b/python/sglang/srt/managers/cache_controller.py @@ -0,0 +1,307 @@ +from __future__ import annotations + +""" +Copyright 2023-2025 SGLang Team +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import logging +import threading +from queue import PriorityQueue, Queue +from typing import Optional + +import torch + +from sglang.srt.mem_cache.memory_pool import MHATokenToKVPool, MLATokenToKVPoolHost + +logger = logging.getLogger(__name__) + + +class CacheOperation: + + counter = 0 + + def __init__( + self, + host_indices: torch.Tensor, + device_indices: torch.Tensor, + node_id: int, + priority: Optional[int] = None, + ): + self.host_indices = host_indices + self.device_indices = device_indices + self.node_ids = [node_id] + self.data = None + + self.id = CacheOperation.counter + CacheOperation.counter += 1 + # default priority is the order of creation + self.priority = priority if priority is not None else self.id + + def merge(self, other: "CacheOperation") -> None: + # multiple operations can be merged into a single operation for batch processing + self.host_indices = torch.cat([self.host_indices, other.host_indices]) + self.device_indices = torch.cat([self.device_indices, other.device_indices]) + self.priority = min(self.priority, other.priority) + self.node_ids.extend(other.node_ids) + + def __lt__(self, other: "CacheOperation"): + return self.priority < other.priority + + +class TransferBuffer: + """ + Overlapping buffer preparation and transfer operations to improve throughput. 
+ """ + + def __init__(self, buffer_count: int = 3, max_buffer_size: int = 1000) -> None: + self.buffers = Queue(maxsize=buffer_count) + # todo: adjust the buffer size based on throughput profile of the system + self.max_buffer_size = max_buffer_size + + def full(self) -> bool: + return self.buffers.full() + + def empty(self) -> bool: + return self.buffers.empty() + + def put(self, item, block=True) -> None: + self.buffers.put(item, block=block) + + def get(self, block=True) -> Optional[CacheOperation]: + try: + return self.buffers.get(block=block) + except Exception as e: + logger.error(e) + + +class HiCacheController: + + def __init__( + self, + mem_pool_device: MHATokenToKVPool, + mem_pool_host: MLATokenToKVPoolHost, + write_policy: str = "write_through_selective", + ): + + self.mem_pool_device = mem_pool_device + self.mem_pool_host = mem_pool_host + self.write_policy = write_policy + + if write_policy not in [ + "write_through", + "write_through_selective", + "write_back", + ]: + raise ValueError(f"Invalid write policy: {write_policy}") + + self.write_queue = PriorityQueue() + self.load_queue = PriorityQueue() + + self.ack_write_queue = Queue() + self.ack_load_queue = Queue() + + self.write_buffer = TransferBuffer() + self.load_buffer = TransferBuffer() + + self.write_stream = torch.cuda.Stream() + self.load_stream = torch.cuda.Stream() + + self.write_thread = threading.Thread( + target=self.write_thread_func_buffer, daemon=True + ) + self.load_thread = threading.Thread( + target=self.load_thread_func_buffer, daemon=True + ) + self.write_thread.start() + self.load_thread.start() + + def write( + self, + device_indices: torch.Tensor, + priority: Optional[int] = None, + node_id: int = 0, + ) -> Optional[torch.Tensor]: + """ + Back up KV caches from device memory to host memory. + """ + host_indices = self.mem_pool_host.alloc(len(device_indices)) + if host_indices is None: + return None + self.write_queue.put( + CacheOperation(host_indices, device_indices, node_id, priority) + ) + self.mem_pool_host.protect_write(host_indices) + return host_indices + + def load( + self, + host_indices: torch.Tensor, + priority: Optional[int] = None, + node_id: int = 0, + ) -> Optional[torch.Tensor]: + """ + Load KV caches from host memory to device memory. + """ + device_indices = self.mem_pool_device.alloc(len(host_indices)) + if device_indices is None: + return None + self.load_queue.put( + CacheOperation(host_indices, device_indices, node_id, priority) + ) + self.mem_pool_host.protect_load(host_indices) + return device_indices + + def write_thread_func_direct(self): + """ + Directly write through KV caches to host memory without buffering. + """ + with torch.cuda.stream(self.write_stream): + while True: + try: + operation = self.write_queue.get(block=True) + operation.data = self.mem_pool_device.get_flat_data( + operation.device_indices + ) + self.mem_pool_host.transfer(operation.host_indices, operation.data) + self.mem_pool_host.complete_io(operation.host_indices) + for node_id in operation.node_ids: + self.ack_write_queue.put(node_id) + except Exception as e: + logger.error(e) + + def load_thread_func_direct(self): + """ + Directly load KV caches from host memory to device memory without buffering. 
+ """ + with torch.cuda.stream(self.load_stream): + while True: + try: + operation = self.load_queue.get(block=True) + operation.data = self.mem_pool_host.get_flat_data( + operation.host_indices + ) + self.mem_pool_device.transfer( + operation.device_indices, operation.data + ) + self.mem_pool_host.complete_io(operation.host_indices) + for node_id in operation.node_ids: + self.ack_load_queue.put(node_id) + except Exception as e: + logger.error(e) + + def write_aux_func(self, no_wait=False): + """ + Auxiliary function to prepare the buffer for write operations. + """ + buffer = None + while True: + try: + operation = self.write_queue.get(block=True) + if buffer is None: + buffer = operation + else: + buffer.merge(operation) + if ( + no_wait + or len(buffer.host_indices) >= self.write_buffer.max_buffer_size + or self.write_queue.empty() + or self.write_buffer.empty() + ): + assert ( + buffer.device_indices.is_cuda + ), "Device indices should be on GPU" + buffer.data = self.mem_pool_device.get_flat_data( + buffer.device_indices + ).contiguous() + self.write_buffer.put(buffer, block=True) + buffer = None + except Exception as e: + logger.error(e) + + def load_aux_func(self): + """ + Auxiliary function to prepare the buffer for load operations. + """ + buffer = None + while True: + try: + operation = self.load_queue.get(block=True) + if buffer is None: + buffer = operation + else: + buffer.merge(operation) + if ( + len(buffer.host_indices) >= self.load_buffer.max_buffer_size + or self.load_queue.empty() + or self.load_buffer.empty() + ): + buffer.data = ( + self.mem_pool_host.get_flat_data(buffer.host_indices) + .contiguous() + .pin_memory() + ) + self.load_buffer.put(buffer, block=True) + buffer = None + except Exception as e: + logger.error(e) + + def write_thread_func_buffer(self): + aux_thread = threading.Thread(target=self.write_aux_func, daemon=True) + aux_thread.start() + with torch.cuda.stream(self.write_stream): + while True: + operation = self.write_buffer.get() + if operation is None: + continue + self.mem_pool_host.transfer(operation.host_indices, operation.data) + self.mem_pool_host.complete_io(operation.host_indices) + for node_id in operation.node_ids: + self.ack_write_queue.put(node_id) + + def load_thread_func_buffer(self): + aux_thread = threading.Thread(target=self.load_aux_func, daemon=True) + aux_thread.start() + with torch.cuda.stream(self.load_stream): + while True: + operation = self.load_buffer.get() + if operation is None: + continue + self.mem_pool_device.transfer(operation.device_indices, operation.data) + self.mem_pool_host.complete_io(operation.host_indices) + for node_id in operation.node_ids: + self.ack_load_queue.put(node_id) + + def evict_device( + self, device_indices: torch.Tensor, host_indices: torch.Tensor + ) -> int: + if self.mem_pool_host.is_synced(host_indices): + self.mem_pool_device.free(device_indices) + self.mem_pool_host.update_backup(host_indices) + return len(device_indices) + else: + raise ValueError( + f"Inconsistent states: {self.mem_pool_host.get_state(host_indices)}" + ) + + def evict_host(self, host_indices: torch.Tensor, backup_only: bool = True) -> int: + if not backup_only: + raise ValueError("Other eviction policies are not supported yet.") + + if self.mem_pool_host.is_backup(host_indices): + self.mem_pool_host.free(host_indices) + return len(host_indices) + else: + raise ValueError( + f"Inconsistent states: {self.mem_pool_host.get_state(host_indices)}" + ) From f1769586d651c701bc5f5b6f3a39d5b0f478eb02 Mon Sep 17 00:00:00 2001 From: 
Lianmin Zheng Date: Fri, 10 Jan 2025 20:37:34 -0800 Subject: [PATCH 019/248] Update threshold in test_nightly_gsm8k_eval.py (#2836) --- test/srt/test_nightly_gsm8k_eval.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/srt/test_nightly_gsm8k_eval.py b/test/srt/test_nightly_gsm8k_eval.py index 7820f6825a9c..2e379c111799 100644 --- a/test/srt/test_nightly_gsm8k_eval.py +++ b/test/srt/test_nightly_gsm8k_eval.py @@ -26,8 +26,8 @@ "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": 0.85, "google/gemma-2-27b-it": 0.92, "meta-llama/Llama-3.1-70B-Instruct": 0.95, - "mistralai/Mixtral-8x7B-Instruct-v0.1": 0.64, - "Qwen/Qwen2-57B-A14B-Instruct": 0.88, + "mistralai/Mixtral-8x7B-Instruct-v0.1": 0.63, + "Qwen/Qwen2-57B-A14B-Instruct": 0.87, "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8": 0.83, "neuralmagic/Mistral-7B-Instruct-v0.3-FP8": 0.54, "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8": 0.84, From f0e15dc6ab6766a8fcdeedb5432b92a18e14979f Mon Sep 17 00:00:00 2001 From: Xiaoyu Zhang <35585791+BBuf@users.noreply.github.com> Date: Sat, 11 Jan 2025 14:34:26 +0800 Subject: [PATCH 020/248] [HotFix] fix fp8 scale load failed in tp>1 (#2837) --- python/sglang/srt/layers/linear.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/python/sglang/srt/layers/linear.py b/python/sglang/srt/layers/linear.py index 9edfa739458b..b839deeb3251 100644 --- a/python/sglang/srt/layers/linear.py +++ b/python/sglang/srt/layers/linear.py @@ -437,7 +437,7 @@ def weight_loader_v2(self, param: Parameter, loaded_weight: torch.Tensor): if len(loaded_weight.shape) == 0: assert loaded_weight.numel() == 1 loaded_weight = loaded_weight.reshape(1) - load_column_parallel_weight(param, loaded_weight, self.tp_rank) + param.load_column_parallel_weight(loaded_weight=loaded_weight) def forward(self, input_): bias = self.bias if not self.skip_bias_add else None @@ -1247,12 +1247,7 @@ def weight_loader_v2(self, param: BasevLLMParameter, loaded_weight: torch.Tensor assert loaded_weight.numel() == 1 loaded_weight = loaded_weight.reshape(1) - load_row_parallel_weight( - param, - loaded_weight, - self.tp_rank, - use_presharded_weights=self.use_presharded_weights, - ) + param.load_row_parallel_weight(loaded_weight=loaded_weight) def forward(self, input_): if self.input_is_parallel: From f624901cdd5da4ad6ffa20a5c29561dcbac0eb4a Mon Sep 17 00:00:00 2001 From: Yineng Zhang Date: Sat, 11 Jan 2025 23:10:02 +0800 Subject: [PATCH 021/248] chore: bump v0.4.1.post5 (#2840) --- benchmark/deepseek_v3/README.md | 4 +++- docker/Dockerfile.rocm | 2 +- docs/developer/setup_github_runner.md | 4 ++-- docs/start/install.md | 10 +++++----- python/pyproject.toml | 2 +- python/sglang/version.py | 2 +- 6 files changed, 13 insertions(+), 11 deletions(-) diff --git a/benchmark/deepseek_v3/README.md b/benchmark/deepseek_v3/README.md index 15cf0b26a244..a4f5bf854bde 100644 --- a/benchmark/deepseek_v3/README.md +++ b/benchmark/deepseek_v3/README.md @@ -4,6 +4,8 @@ The SGLang and DeepSeek teams collaborated to get DeepSeek V3 FP8 running on NVI Special thanks to Meituan's Search & Recommend Platform Team and Baseten's Model Performance Team for implementing the model, and DataCrunch for providing GPU resources. 
+For optimizations made on the DeepSeek series models regarding SGLang, please refer to https://sgl-project.github.io/references/deepseek.html + ## Hardware Recommendation - 8 x NVIDIA H200 GPUs @@ -29,7 +31,7 @@ For high QPS scenarios, add the `--enable-dp-attention` argument to boost throug ### Using pip ```bash # Installation -pip install "sglang[all]>=0.4.1.post3" --find-links https://flashinfer.ai/whl/cu124/torch2.4/flashinfer +pip install "sglang[all]>=0.4.1.post5" --find-links https://flashinfer.ai/whl/cu124/torch2.4/flashinfer # Launch python3 -m sglang.launch_server --model deepseek-ai/DeepSeek-V3 --tp 8 --trust-remote-code diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index 44b3f85b3516..9b1d67b5e4f7 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -1,5 +1,5 @@ # Usage (to build SGLang ROCm docker image): -# docker build --build-arg SGL_BRANCH=v0.4.1.post4 -t v0.4.1.post4-rocm620 -f Dockerfile.rocm . +# docker build --build-arg SGL_BRANCH=v0.4.1.post5 -t v0.4.1.post5-rocm620 -f Dockerfile.rocm . # default base image ARG BASE_IMAGE="rocmshared/vllm-rocm:20241031-tuned" diff --git a/docs/developer/setup_github_runner.md b/docs/developer/setup_github_runner.md index 7b510d72305e..fe856e9d659b 100644 --- a/docs/developer/setup_github_runner.md +++ b/docs/developer/setup_github_runner.md @@ -11,9 +11,9 @@ docker pull nvidia/cuda:12.1.1-devel-ubuntu22.04 # Nvidia docker run --shm-size 128g -it -v /tmp/huggingface:/hf_home --gpus all nvidia/cuda:12.1.1-devel-ubuntu22.04 /bin/bash # AMD -docker run --rm --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 128g -it -v /tmp/huggingface:/hf_home lmsysorg/sglang:v0.4.1.post4-rocm620 /bin/bash +docker run --rm --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 128g -it -v /tmp/huggingface:/hf_home lmsysorg/sglang:v0.4.1.post5-rocm620 /bin/bash # AMD just the last 2 GPUs -docker run --rm --device=/dev/kfd --device=/dev/dri/renderD176 --device=/dev/dri/renderD184 --group-add video --shm-size 128g -it -v /tmp/huggingface:/hf_home lmsysorg/sglang:v0.4.1.post4-rocm620 /bin/bash +docker run --rm --device=/dev/kfd --device=/dev/dri/renderD176 --device=/dev/dri/renderD184 --group-add video --shm-size 128g -it -v /tmp/huggingface:/hf_home lmsysorg/sglang:v0.4.1.post5-rocm620 /bin/bash ``` ### Step 2: Configure the runner by `config.sh` diff --git a/docs/start/install.md b/docs/start/install.md index 8a81bb177974..26b09dfe319f 100644 --- a/docs/start/install.md +++ b/docs/start/install.md @@ -13,7 +13,7 @@ Note: Please check the [FlashInfer installation doc](https://docs.flashinfer.ai/ ## Method 2: From source ``` # Use the last release branch -git clone -b v0.4.1.post4 https://github.com/sgl-project/sglang.git +git clone -b v0.4.1.post5 https://github.com/sgl-project/sglang.git cd sglang pip install --upgrade pip @@ -26,7 +26,7 @@ Note: To AMD ROCm system with Instinct/MI GPUs, do following instead: ``` # Use the last release branch -git clone -b v0.4.1.post4 https://github.com/sgl-project/sglang.git +git clone -b v0.4.1.post5 https://github.com/sgl-project/sglang.git cd sglang pip install --upgrade pip @@ -51,7 +51,7 @@ docker run --gpus all \ Note: To AMD ROCm system with Instinct/MI GPUs, it is recommended to use `docker/Dockerfile.rocm` to build images, example and usage as below: ```bash -docker build --build-arg SGL_BRANCH=v0.4.1.post4 -t v0.4.1.post4-rocm620 -f Dockerfile.rocm . +docker build --build-arg SGL_BRANCH=v0.4.1.post5 -t v0.4.1.post5-rocm620 -f Dockerfile.rocm . 
alias drun='docker run -it --rm --network=host --device=/dev/kfd --device=/dev/dri --ipc=host \ --shm-size 16G --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \ @@ -60,11 +60,11 @@ alias drun='docker run -it --rm --network=host --device=/dev/kfd --device=/dev/d drun -p 30000:30000 \ -v ~/.cache/huggingface:/root/.cache/huggingface \ --env "HF_TOKEN=" \ - v0.4.1.post4-rocm620 \ + v0.4.1.post5-rocm620 \ python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000 # Till flashinfer backend available, --attention-backend triton --sampling-backend pytorch are set by default -drun v0.4.1.post4-rocm620 python3 -m sglang.bench_one_batch --batch-size 32 --input 1024 --output 128 --model amd/Meta-Llama-3.1-8B-Instruct-FP8-KV --tp 8 --quantization fp8 +drun v0.4.1.post5-rocm620 python3 -m sglang.bench_one_batch --batch-size 32 --input 1024 --output 128 --model amd/Meta-Llama-3.1-8B-Instruct-FP8-KV --tp 8 --quantization fp8 ``` ## Method 4: Using docker compose diff --git a/python/pyproject.toml b/python/pyproject.toml index d536f8832e1d..a236469a17c8 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "sglang" -version = "0.4.1.post4" +version = "0.4.1.post5" description = "SGLang is yet another fast serving framework for large language models and vision language models." readme = "README.md" requires-python = ">=3.8" diff --git a/python/sglang/version.py b/python/sglang/version.py index 24e54e5c95d5..51eb3167fae5 100644 --- a/python/sglang/version.py +++ b/python/sglang/version.py @@ -1 +1 @@ -__version__ = "0.4.1.post4" +__version__ = "0.4.1.post5" From 197cbf9bab6aa4d75d7da392bbb8ac9c58ba7c5d Mon Sep 17 00:00:00 2001 From: Yineng Zhang Date: Sat, 11 Jan 2025 23:11:38 +0800 Subject: [PATCH 022/248] docs: update README (#2841) --- benchmark/deepseek_v3/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark/deepseek_v3/README.md b/benchmark/deepseek_v3/README.md index a4f5bf854bde..d14a8d55630c 100644 --- a/benchmark/deepseek_v3/README.md +++ b/benchmark/deepseek_v3/README.md @@ -4,7 +4,7 @@ The SGLang and DeepSeek teams collaborated to get DeepSeek V3 FP8 running on NVI Special thanks to Meituan's Search & Recommend Platform Team and Baseten's Model Performance Team for implementing the model, and DataCrunch for providing GPU resources. -For optimizations made on the DeepSeek series models regarding SGLang, please refer to https://sgl-project.github.io/references/deepseek.html +For optimizations made on the DeepSeek series models regarding SGLang, please refer to [DeepSeek Model Optimizations in SGLang](https://sgl-project.github.io/references/deepseek.html). 
## Hardware Recommendation - 8 x NVIDIA H200 GPUs From c4f9707e16146d7bc85d2744693aa78642e75e18 Mon Sep 17 00:00:00 2001 From: Shi Shuai <126407087+shuaills@users.noreply.github.com> Date: Sat, 11 Jan 2025 23:14:26 +0000 Subject: [PATCH 023/248] Improve: Token-In Token-Out Usage for RLHF (#2843) --- docs/backend/native_api.ipynb | 70 +++++++++++ docs/backend/structured_outputs.ipynb | 6 +- docs/index.rst | 2 +- docs/references/deepseek.md | 10 +- .../srt/managers/detokenizer_manager.py | 2 - python/sglang/srt/managers/io_struct.py | 8 +- python/sglang/srt/managers/scheduler.py | 10 +- .../sglang/srt/managers/tokenizer_manager.py | 7 -- python/sglang/srt/server_args.py | 18 +-- test/srt/run_suite.py | 1 - test/srt/test_engine_token_ids.py | 45 ------- test/srt/test_skip_tokenizer_init.py | 119 ++++++++++++------ 12 files changed, 168 insertions(+), 130 deletions(-) delete mode 100644 test/srt/test_engine_token_ids.py diff --git a/docs/backend/native_api.ipynb b/docs/backend/native_api.ipynb index 26758f7f9759..f6c10d745c5e 100644 --- a/docs/backend/native_api.ipynb +++ b/docs/backend/native_api.ipynb @@ -348,6 +348,76 @@ "source": [ "terminate_process(reward_process)" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Skip Tokenizer and Detokenizer\n", + "\n", + "SGLang Runtime also supports skip tokenizer and detokenizer. This is useful in cases like integrating with RLHF workflow." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tokenizer_free_server_process = execute_shell_command(\n", + " \"\"\"\n", + "python3 -m sglang.launch_server --model-path meta-llama/Llama-3.2-1B-Instruct --port=30010 --skip-tokenizer-init\n", + "\"\"\"\n", + ")\n", + "\n", + "wait_for_server(\"http://localhost:30010\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import AutoTokenizer\n", + "\n", + "tokenizer = AutoTokenizer.from_pretrained(\"meta-llama/Llama-3.2-1B-Instruct\")\n", + "\n", + "input_text = \"What is the capital of France?\"\n", + "\n", + "input_tokens = tokenizer.encode(input_text)\n", + "print_highlight(f\"Input Text: {input_text}\")\n", + "print_highlight(f\"Tokenized Input: {input_tokens}\")\n", + "\n", + "response = requests.post(\n", + " \"http://localhost:30010/generate\",\n", + " json={\n", + " \"input_ids\": input_tokens,\n", + " \"sampling_params\": {\n", + " \"temperature\": 0,\n", + " \"max_new_tokens\": 256,\n", + " \"stop_token_ids\": [tokenizer.eos_token_id],\n", + " },\n", + " \"stream\": False,\n", + " },\n", + ")\n", + "output = response.json()\n", + "output_tokens = output[\"token_ids\"]\n", + "\n", + "output_text = tokenizer.decode(output_tokens, skip_special_tokens=False)\n", + "print_highlight(f\"Tokenized Output: {output_tokens}\")\n", + "print_highlight(f\"Decoded Output: {output_text}\")\n", + "print_highlight(f\"Output Text: {output['meta_info']['finish_reason']}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "terminate_process(tokenizer_free_server_process)" + ] } ], "metadata": { diff --git a/docs/backend/structured_outputs.ipynb b/docs/backend/structured_outputs.ipynb index f017ef863035..55ca0b627f9c 100644 --- a/docs/backend/structured_outputs.ipynb +++ b/docs/backend/structured_outputs.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Structured Outputs (JSON, Regex, EBNF)" + "# Structured 
Outputs" ] }, { @@ -43,6 +43,10 @@ " print_highlight,\n", ")\n", "import openai\n", + "import os\n", + "\n", + "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n", + "\n", "\n", "server_process = execute_shell_command(\n", " \"python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --port 30000 --host 0.0.0.0 --grammar-backend xgrammar\"\n", diff --git a/docs/index.rst b/docs/index.rst index 6ed313a3bd17..51796d4a1071 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -56,10 +56,10 @@ The core features include: references/hyperparameter_tuning.md references/benchmark_and_profiling.md references/custom_chat_template.md + references/deepseek.md references/llama_405B.md references/modelscope.md references/contribution_guide.md references/troubleshooting.md references/faq.md references/learn_more.md - references/deepseek.md diff --git a/docs/references/deepseek.md b/docs/references/deepseek.md index 5a95fd9a9418..913395357e1d 100644 --- a/docs/references/deepseek.md +++ b/docs/references/deepseek.md @@ -1,4 +1,4 @@ -# DeepSeek Model Optimizations in SGLang +# DeepSeek Model Optimizations SGLang provides several optimizations specifically designed for the DeepSeek model to boost its inference speed. This document outlines current optimizations for DeepSeek. Additionally, the SGLang team is actively developing enhancements for [DeepSeek-V3](https://github.com/sgl-project/sglang/issues/2591). @@ -16,7 +16,9 @@ SGLang provides several optimizations specifically designed for the DeepSeek mod Overall, with these optimizations, we have achieved up to a 7x acceleration in output throughput compared to the previous version. -![Multi-head Latent Attention for DeepSeek Series Models](https://lmsys.org/images/blog/sglang_v0_3/deepseek_mla.svg) +

+ Multi-head Latent Attention for DeepSeek Series Models
+
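
A minimal launch sketch for the usage note that follows, assuming an illustrative DeepSeek checkpoint and ports (MLA is on by default; the second process disables it with `--disable-mla` for comparison):

```python
# Minimal sketch, assuming sglang is installed; the checkpoint below is a
# placeholder. MLA is enabled by default, so only the second launch passes
# --disable-mla.
import subprocess

model = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"  # illustrative checkpoint

with_mla = subprocess.Popen(
    ["python3", "-m", "sglang.launch_server",
     "--model-path", model, "--trust-remote-code", "--port", "30000"]
)

without_mla = subprocess.Popen(
    ["python3", "-m", "sglang.launch_server",
     "--model-path", model, "--trust-remote-code", "--port", "30001",
     "--disable-mla"]
)
```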
**Usage**: MLA optimization is enabled by defalut, to disable, use `--disable-mla`. @@ -26,7 +28,9 @@ Overall, with these optimizations, we have achieved up to a 7x acceleration in o **Description**: This optimization involves data parallelism (DP) for the MLA attention mechanism of DeepSeek Series Models, which allows for a significant reduction in the KV cache size, enabling larger batch sizes. Each DP worker independently handles different types of batches (prefill, decode, idle), which are then synchronized before and after processing through the Mixture-of-Experts (MoE) layer. -![Data Parallelism Attention for DeepSeek Series Models](https://lmsys.org/images/blog/sglang_v0_4/dp_attention.svg). +

+ Data Parallelism Attention for DeepSeek Series Models
+
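
A hedged sketch of the high-QPS setup described in the usage note below, assuming the offline engine forwards keyword arguments to the server arguments (the model path and parallelism size are illustrative, not recommendations):

```python
# Minimal sketch: data parallelism attention via the Python engine API.
# Assumes sgl.Engine keyword arguments mirror the CLI flags; model path and
# tp_size are placeholders.
import sglang as sgl

llm = sgl.Engine(
    model_path="deepseek-ai/DeepSeek-V2-Lite",  # illustrative checkpoint
    trust_remote_code=True,
    tp_size=2,                  # size tensor parallelism to the available GPUs
    enable_dp_attention=True,   # the flag documented in this section
)

outputs = llm.generate(
    ["The capital of France is"],
    {"temperature": 0, "max_new_tokens": 8},
)
print(outputs[0]["text"])
llm.shutdown()
```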
**Usage**: This optimization is aimed at improving throughput and should be used for scenarios with high QPS (Queries Per Second). Data Parallelism Attention optimization can be enabeld by `--enable-dp-attention` for DeepSeek Series Models. diff --git a/python/sglang/srt/managers/detokenizer_manager.py b/python/sglang/srt/managers/detokenizer_manager.py index fd77d338edce..b4bc1e7a448d 100644 --- a/python/sglang/srt/managers/detokenizer_manager.py +++ b/python/sglang/srt/managers/detokenizer_manager.py @@ -181,8 +181,6 @@ def event_loop(self): finished_reasons=recv_obj.finished_reasons, output_strs=output_strs, prompt_tokens=recv_obj.prompt_tokens, - origin_input_ids=recv_obj.origin_input_ids, - output_ids=recv_obj.output_ids, completion_tokens=recv_obj.completion_tokens, cached_tokens=recv_obj.cached_tokens, input_token_logprobs_val=recv_obj.input_token_logprobs_val, diff --git a/python/sglang/srt/managers/io_struct.py b/python/sglang/srt/managers/io_struct.py index 1aae28b00b76..6ddc0993f9d7 100644 --- a/python/sglang/srt/managers/io_struct.py +++ b/python/sglang/srt/managers/io_struct.py @@ -323,9 +323,7 @@ class BatchTokenIDOut: decoded_texts: List[str] decode_ids: List[int] read_offsets: List[int] - # Only used when --return-token-ids` is set - origin_input_ids: Optional[List[int]] - # Only used when `--skip-tokenizer-init` or `--return-token-ids` is set + # Only used when `--skip-tokenizer-init` is on output_ids: Optional[List[int]] # Detokenization configs skip_special_tokens: List[bool] @@ -356,10 +354,6 @@ class BatchStrOut: # The output decoded strings output_strs: List[str] - # The token ids - origin_input_ids: Optional[List[int]] - output_ids: Optional[List[int]] - # Token counts # real input and output tokens can be get from # origin_input_ids and output_ids by enabling --return_token_ids diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index 6022a2567343..31c8018e2581 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -1253,7 +1253,6 @@ def stream_output( decode_ids_list = [] read_offsets = [] output_ids = [] - origin_input_ids = [] skip_special_tokens = [] spaces_between_special_tokens = [] @@ -1305,14 +1304,8 @@ def stream_output( decode_ids, read_offset = req.init_incremental_detokenize() decode_ids_list.append(decode_ids) read_offsets.append(read_offset) - if self.skip_tokenizer_init or self.server_args.return_token_ids: + if self.skip_tokenizer_init: output_ids.append(req.output_ids) - else: - output_ids = None - if self.server_args.return_token_ids: - origin_input_ids.append(req.origin_input_ids) - else: - origin_input_ids = None skip_special_tokens.append(req.sampling_params.skip_special_tokens) spaces_between_special_tokens.append( req.sampling_params.spaces_between_special_tokens @@ -1344,7 +1337,6 @@ def stream_output( decoded_texts, decode_ids_list, read_offsets, - origin_input_ids, output_ids, skip_special_tokens, spaces_between_special_tokens, diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py index 00ef8458ab82..9f9c53eaa8ec 100644 --- a/python/sglang/srt/managers/tokenizer_manager.py +++ b/python/sglang/srt/managers/tokenizer_manager.py @@ -663,13 +663,6 @@ async def handle_loop(self): "text": recv_obj.output_strs[i], "meta_info": meta_info, } - if self.server_args.return_token_ids: - out_dict.update( - { - "input_ids": recv_obj.origin_input_ids[i], - "output_ids": recv_obj.output_ids[i], - } - ) elif 
isinstance(recv_obj, BatchTokenIDOut): out_dict = { "token_ids": recv_obj.output_ids[i], diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index 09d1a3edebc4..66739652aa9d 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -55,7 +55,6 @@ class ServerArgs: is_embedding: bool = False revision: Optional[str] = None skip_tokenizer_init: bool = False - return_token_ids: bool = False # Port for the HTTP server host: str = "127.0.0.1" @@ -296,6 +295,11 @@ def add_cli_args(parser: argparse.ArgumentParser): "tokenizer if available, and 'slow' will " "always use the slow tokenizer.", ) + parser.add_argument( + "--skip-tokenizer-init", + action="store_true", + help="If set, skip init tokenizer and pass input_ids in generate request", + ) parser.add_argument( "--load-format", type=str, @@ -404,18 +408,6 @@ def add_cli_args(parser: argparse.ArgumentParser): "name, a tag name, or a commit id. If unspecified, will use " "the default version.", ) - parser.add_argument( - "--skip-tokenizer-init", - action="store_true", - help="If set, skip init tokenizer and pass input_ids in generate request", - ) - parser.add_argument( - "--return-token-ids", - action="store_true", - default=ServerArgs.return_token_ids, - help="Whether to return token IDs in the output, this may introduce additional overhead.", - ) - # Memory and scheduling parser.add_argument( "--mem-fraction-static", diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index 2c1750d363ce..320fea7294e5 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -45,7 +45,6 @@ "test_vision_chunked_prefill.py", "test_vision_openai_server.py", "test_session_control.py", - "test_engine_token_ids.py", ], "nightly": [ "test_nightly_gsm8k_eval.py", diff --git a/test/srt/test_engine_token_ids.py b/test/srt/test_engine_token_ids.py deleted file mode 100644 index 4dee24edc9de..000000000000 --- a/test/srt/test_engine_token_ids.py +++ /dev/null @@ -1,45 +0,0 @@ -import unittest - -from transformers import AutoTokenizer - -import sglang as sgl -from sglang.test.test_utils import DEFAULT_SMALL_MODEL_NAME_FOR_TEST - - -class TestEngineTokenIds(unittest.TestCase): - def test_token_ids_in_generate(self): - llm = sgl.Engine( - model_path=DEFAULT_SMALL_MODEL_NAME_FOR_TEST, return_token_ids=True - ) - tokenizer = AutoTokenizer.from_pretrained(DEFAULT_SMALL_MODEL_NAME_FOR_TEST) - - prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", - ] - - sampling_params = {"temperature": 0, "top_p": 0.95} - outputs = llm.generate(prompts, sampling_params) - - for prompt, output in zip(prompts, outputs): - deocode_input = tokenizer.decode( - output["input_ids"], skip_special_tokens=True - ) - assert (deocode_input in prompt) or ( - prompt in deocode_input - ), f"Decode input: {deocode_input} mismatch for: {prompt}" - - deocode_output = tokenizer.decode( - output["output_ids"], skip_special_tokens=True - ) - assert (deocode_output in output["text"]) or ( - output["text"] in deocode_output - ), f"Decode output: {deocode_output} mismatch for: {output['text']}" - - llm.shutdown() - - -if __name__ == "__main__": - unittest.main() diff --git a/test/srt/test_skip_tokenizer_init.py b/test/srt/test_skip_tokenizer_init.py index eef033ea98cb..db70944091f2 100644 --- a/test/srt/test_skip_tokenizer_init.py +++ b/test/srt/test_skip_tokenizer_init.py @@ -1,11 +1,8 @@ -""" -python3 -m unittest 
test_skip_tokenizer_init.TestSkipTokenizerInit.test_parallel_sample -""" - import json import unittest import requests +from transformers import AutoTokenizer from sglang.srt.utils import kill_process_tree from sglang.test.test_utils import ( @@ -15,35 +12,63 @@ popen_launch_server, ) +_server_process = None +_base_url = None +_tokenizer = None + + +def setUpModule(): + """ + Launch the server once before all tests and initialize the tokenizer. + """ + global _server_process, _base_url, _tokenizer + _server_process = popen_launch_server( + DEFAULT_SMALL_MODEL_NAME_FOR_TEST, + DEFAULT_URL_FOR_TEST, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=["--skip-tokenizer-init"], + ) + _base_url = DEFAULT_URL_FOR_TEST + + _tokenizer = AutoTokenizer.from_pretrained( + DEFAULT_SMALL_MODEL_NAME_FOR_TEST, use_fast=False + ) + print(">>> setUpModule: Server launched, tokenizer ready") + + +def tearDownModule(): + """ + Terminate the server once after all tests have completed. + """ + global _server_process + if _server_process is not None: + kill_process_tree(_server_process.pid) + _server_process = None + print(">>> tearDownModule: Server terminated") -class TestSkipTokenizerInit(unittest.TestCase): - @classmethod - def setUpClass(cls): - cls.model = DEFAULT_SMALL_MODEL_NAME_FOR_TEST - cls.base_url = DEFAULT_URL_FOR_TEST - cls.process = popen_launch_server( - cls.model, - cls.base_url, - timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - other_args=["--skip-tokenizer-init"], - ) - @classmethod - def tearDownClass(cls): - kill_process_tree(cls.process.pid) +class TestSkipTokenizerInit(unittest.TestCase): + def run_decode( + self, + prompt_text="The capital of France is", + max_new_tokens=32, + return_logprob=False, + top_logprobs_num=0, + n=1, + ): + input_ids = _tokenizer(prompt_text, return_tensors="pt")["input_ids"][ + 0 + ].tolist() - def run_decode(self, return_logprob=False, top_logprobs_num=0, n=1): - max_new_tokens = 32 - input_ids = [128000, 791, 6864, 315, 9822, 374] # The capital of France is response = requests.post( - self.base_url + "/generate", + _base_url + "/generate", json={ "input_ids": input_ids, "sampling_params": { "temperature": 0 if n == 1 else 0.5, "max_new_tokens": max_new_tokens, "n": n, - "stop_token_ids": [119690], + "stop_token_ids": [_tokenizer.eos_token_id], }, "stream": False, "return_logprob": return_logprob, @@ -52,25 +77,37 @@ def run_decode(self, return_logprob=False, top_logprobs_num=0, n=1): }, ) ret = response.json() - print(json.dumps(ret)) + print(json.dumps(ret, indent=2)) def assert_one_item(item): - self.assertEqual( - len(item["token_ids"]), item["meta_info"]["completion_tokens"] - ) - self.assertEqual(len(item["token_ids"]), max_new_tokens) - assert item["meta_info"]["prompt_tokens"] == len(input_ids) - - if return_logprob: - assert len(item["meta_info"]["input_token_logprobs"]) == len( - input_ids - ), f'{len(item["meta_info"]["input_token_logprobs"])} vs. 
f{len(input_ids)}' - assert len(item["meta_info"]["output_token_logprobs"]) == max_new_tokens - + if item["meta_info"]["finish_reason"]["type"] == "stop": + self.assertEqual( + item["meta_info"]["finish_reason"]["matched"], + _tokenizer.eos_token_id, + ) + elif item["meta_info"]["finish_reason"]["type"] == "length": + self.assertEqual( + len(item["token_ids"]), item["meta_info"]["completion_tokens"] + ) + self.assertEqual(len(item["token_ids"]), max_new_tokens) + self.assertEqual(item["meta_info"]["prompt_tokens"], len(input_ids)) + + if return_logprob: + self.assertEqual( + len(item["meta_info"]["input_token_logprobs"]), + len(input_ids), + f'{len(item["meta_info"]["input_token_logprobs"])} mismatch with {len(input_ids)}', + ) + self.assertEqual( + len(item["meta_info"]["output_token_logprobs"]), + max_new_tokens, + ) + + # Determine whether to assert a single item or multiple items based on n if n == 1: assert_one_item(ret) else: - assert len(ret) == n + self.assertEqual(len(ret), n) for i in range(n): assert_one_item(ret[i]) @@ -84,10 +121,10 @@ def test_parallel_sample(self): def test_logprob(self): for top_logprobs_num in [0, 3]: - self.run_decode( - return_logprob=True, - top_logprobs_num=top_logprobs_num, - ) + self.run_decode(return_logprob=True, top_logprobs_num=top_logprobs_num) + + def test_eos_behavior(self): + self.run_decode(max_new_tokens=256) if __name__ == "__main__": From e2b16c4716f220a0469cdb424c508c95767fb924 Mon Sep 17 00:00:00 2001 From: Xiaoyu Zhang <35585791+BBuf@users.noreply.github.com> Date: Mon, 13 Jan 2025 11:38:17 +0800 Subject: [PATCH 024/248] add sampling_scaling_penalties kernel (#2846) --- sgl-kernel/CMakeLists.txt | 1 + sgl-kernel/pyproject.toml | 2 +- sgl-kernel/setup.py | 1 + sgl-kernel/src/sgl-kernel/__init__.py | 2 + .../csrc/sampling_scaling_penalties.cu | 64 +++++++++++++++++++ .../src/sgl-kernel/csrc/sgl_kernel_ops.cu | 5 ++ .../src/sgl-kernel/csrc/vectorization.cuh | 30 +++++++++ sgl-kernel/src/sgl-kernel/ops/__init__.py | 7 ++ .../tests/test_sampling_scaling_penalties.py | 39 +++++++++++ 9 files changed, 150 insertions(+), 1 deletion(-) create mode 100644 sgl-kernel/src/sgl-kernel/csrc/sampling_scaling_penalties.cu create mode 100644 sgl-kernel/src/sgl-kernel/csrc/vectorization.cuh create mode 100644 sgl-kernel/tests/test_sampling_scaling_penalties.py diff --git a/sgl-kernel/CMakeLists.txt b/sgl-kernel/CMakeLists.txt index 3c267a4de504..15818d289eae 100644 --- a/sgl-kernel/CMakeLists.txt +++ b/sgl-kernel/CMakeLists.txt @@ -32,6 +32,7 @@ add_library(_kernels SHARED src/sgl-kernel/csrc/trt_reduce_kernel.cu src/sgl-kernel/csrc/moe_align_kernel.cu src/sgl-kernel/csrc/int8_gemm_kernel.cu + src/sgl-kernel/csrc/sampling_scaling_penalties.cu src/sgl-kernel/csrc/sgl_kernel_ops.cu ) diff --git a/sgl-kernel/pyproject.toml b/sgl-kernel/pyproject.toml index 359ffafd70d2..b03b4c02b5e1 100644 --- a/sgl-kernel/pyproject.toml +++ b/sgl-kernel/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "sgl-kernel" -version = "0.0.2.post11" +version = "0.0.2.post12" description = "Kernel Library for SGLang" readme = "README.md" requires-python = ">=3.8" diff --git a/sgl-kernel/setup.py b/sgl-kernel/setup.py index c93e87f6bad3..83025d6d6c6f 100644 --- a/sgl-kernel/setup.py +++ b/sgl-kernel/setup.py @@ -50,6 +50,7 @@ def update_wheel_platform_tag(): "src/sgl-kernel/csrc/trt_reduce_kernel.cu", "src/sgl-kernel/csrc/moe_align_kernel.cu", "src/sgl-kernel/csrc/int8_gemm_kernel.cu", + "src/sgl-kernel/csrc/sampling_scaling_penalties.cu", 
"src/sgl-kernel/csrc/sgl_kernel_ops.cu", ], include_dirs=include_dirs, diff --git a/sgl-kernel/src/sgl-kernel/__init__.py b/sgl-kernel/src/sgl-kernel/__init__.py index 892808f1ee15..62c366731e55 100644 --- a/sgl-kernel/src/sgl-kernel/__init__.py +++ b/sgl-kernel/src/sgl-kernel/__init__.py @@ -4,6 +4,7 @@ init_custom_reduce, int8_scaled_mm, moe_align_block_size, + sampling_scaling_penalties, ) __all__ = [ @@ -12,4 +13,5 @@ "custom_dispose", "custom_reduce", "int8_scaled_mm", + "sampling_scaling_penalties", ] diff --git a/sgl-kernel/src/sgl-kernel/csrc/sampling_scaling_penalties.cu b/sgl-kernel/src/sgl-kernel/csrc/sampling_scaling_penalties.cu new file mode 100644 index 000000000000..30264caa3666 --- /dev/null +++ b/sgl-kernel/src/sgl-kernel/csrc/sampling_scaling_penalties.cu @@ -0,0 +1,64 @@ +#include +#include +#include +#include +#include "utils.hpp" +#include "vectorization.cuh" + +template +__global__ void sampling_scaling_penalties_kernel( + const scalar_t* logits, + const scalar_t* scaling_penalties, + scalar_t* output, + const int32_t numel) { + + const int32_t tid = blockIdx.x * blockDim.x + threadIdx.x; + const int32_t stride = blockDim.x * gridDim.x; + + auto const* vectorized_logits = reinterpret_cast const*>(logits); + auto const* vectorized_penalties = reinterpret_cast const*>(scaling_penalties); + auto* vectorized_output = reinterpret_cast*>(output); + + const int32_t num_vec_elems = numel >> 2; + +#pragma unroll 4 + for (int32_t i = tid; i < num_vec_elems; i += stride) { + vec4_t logits_vec = vectorized_logits[i]; + vec4_t penalties_vec = vectorized_penalties[i]; + vec4_t out_vec; + + out_vec.x = logits_vec.x > 0 ? logits_vec.x / penalties_vec.x : logits_vec.x * penalties_vec.x; + out_vec.y = logits_vec.y > 0 ? logits_vec.y / penalties_vec.y : logits_vec.y * penalties_vec.y; + out_vec.z = logits_vec.z > 0 ? logits_vec.z / penalties_vec.z : logits_vec.z * penalties_vec.z; + out_vec.w = logits_vec.w > 0 ? logits_vec.w / penalties_vec.w : logits_vec.w * penalties_vec.w; + + vectorized_output[i] = out_vec; + } + + const int32_t start_idx = num_vec_elems * 4; + for (int32_t i = start_idx + tid; i < numel; i += stride) { + scalar_t logit = logits[i]; + scalar_t penalty = scaling_penalties[i]; + output[i] = logit > 0 ? 
logit / penalty : logit * penalty; + } +} + +torch::Tensor sampling_scaling_penalties(const torch::Tensor& logits, const torch::Tensor& scaling_penalties) { + auto output = torch::empty_like(logits); + const auto numel = logits.numel(); + const int threads = 512; + + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, + logits.scalar_type(), "sampling_scaling_penalties_kernel", ([&] { + const int blocks = (numel + threads * 4 - 1) / (threads * 4); + sampling_scaling_penalties_kernel<<>>( + logits.data_ptr(), + scaling_penalties.data_ptr(), + output.data_ptr(), + numel); + })); + + return output; +} diff --git a/sgl-kernel/src/sgl-kernel/csrc/sgl_kernel_ops.cu b/sgl-kernel/src/sgl-kernel/csrc/sgl_kernel_ops.cu index 6ed543e6c542..fbfe51442a35 100644 --- a/sgl-kernel/src/sgl-kernel/csrc/sgl_kernel_ops.cu +++ b/sgl-kernel/src/sgl-kernel/csrc/sgl_kernel_ops.cu @@ -12,6 +12,9 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, int64_t b torch::Tensor sorted_token_ids, torch::Tensor experts_ids, torch::Tensor num_tokens_post_pad, torch::Tensor token_cnts_buffer, torch::Tensor cumsum_buffer); +// sampling_scaling_penalties +torch::Tensor sampling_scaling_penalties(const torch::Tensor& logits, const torch::Tensor& scaling_penalties); + // int8_scaled_mm torch::Tensor int8_scaled_mm(const torch::Tensor& mat_a, const torch::Tensor& mat_b, const torch::Tensor& scales_a, const torch::Tensor& scales_b, const torch::Dtype& out_dtype, @@ -24,6 +27,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("all_reduce", &all_reduce, "custom all reduce (CUDA)"); // moe_align_block_size m.def("moe_align_block_size", &moe_align_block_size, "MOE Align Block Size (CUDA)"); + // sampling_scaling_penalties + m.def("sampling_scaling_penalties", &sampling_scaling_penalties, "Sampling scaling penalties (CUDA)"); // int8_scaled_mm m.def("int8_scaled_mm", &int8_scaled_mm, "INT8 scaled matmul (CUDA)"); } diff --git a/sgl-kernel/src/sgl-kernel/csrc/vectorization.cuh b/sgl-kernel/src/sgl-kernel/csrc/vectorization.cuh new file mode 100644 index 000000000000..cb36d0e7a456 --- /dev/null +++ b/sgl-kernel/src/sgl-kernel/csrc/vectorization.cuh @@ -0,0 +1,30 @@ +// Adapted from https://github.com/vllm-project/vllm/blob/main/csrc/quantization/vectorization.cuh +#pragma once +/** + * __device__ datatypes vectorized by 4 + */ + +// Include both AMD and NVIDIA fp8 types to avoid circular import +// TODO(luka/varun) use FP8_TYPE instead after refactoring +#include +#include + +// Vectorization containers +template +struct __align__(8) vec4_t { + scalar_t x; + scalar_t y; + scalar_t z; + scalar_t w; +}; + +template +struct __align__(4) q8x4_t { + static_assert(std::is_same_v || + std::is_same_v || + std::is_same_v); + quant_type_t x; + quant_type_t y; + quant_type_t z; + quant_type_t w; +}; diff --git a/sgl-kernel/src/sgl-kernel/ops/__init__.py b/sgl-kernel/src/sgl-kernel/ops/__init__.py index e388ae35653b..03a8db80fd37 100644 --- a/sgl-kernel/src/sgl-kernel/ops/__init__.py +++ b/sgl-kernel/src/sgl-kernel/ops/__init__.py @@ -3,6 +3,9 @@ from sgl_kernel.ops._kernels import init_custom_ar as _init_custom_ar from sgl_kernel.ops._kernels import int8_scaled_mm as _int8_scaled_mm from sgl_kernel.ops._kernels import moe_align_block_size as _moe_align_block_size +from sgl_kernel.ops._kernels import ( + sampling_scaling_penalties as _sampling_scaling_penalties, +) def init_custom_reduce(rank_id, num_devices, buffers, barrier_in, 
barrier_out): @@ -39,6 +42,10 @@ def moe_align_block_size( ) +def sampling_scaling_penalties(logits, scaling_penalties): + return _sampling_scaling_penalties(logits, scaling_penalties) + + def int8_scaled_mm(mat_a, mat_b, scales_a, scales_b, out_dtype, bias=None): return _int8_scaled_mm( mat_a, diff --git a/sgl-kernel/tests/test_sampling_scaling_penalties.py b/sgl-kernel/tests/test_sampling_scaling_penalties.py new file mode 100644 index 000000000000..4b9746fd7934 --- /dev/null +++ b/sgl-kernel/tests/test_sampling_scaling_penalties.py @@ -0,0 +1,39 @@ +import torch +from sgl_kernel import sampling_scaling_penalties + + +def test_sampling_scaling_penalties(): + batch_sizes = [1, 2, 4, 8, 16, 32, 64, 65] + vocab_sizes = [2048, 4096, 8192, 16384, 32768, 32767] + dtypes = [torch.float32, torch.half, torch.bfloat16] + device = torch.device("cuda") + + for dtype in dtypes: + rtol = 1e-3 + atol = 1e-3 + + for bs in batch_sizes: + for vocab_size in vocab_sizes: + logits = torch.randn(bs, vocab_size, device=device, dtype=dtype) + scaling_penalties = ( + torch.rand(bs, vocab_size, device=device, dtype=dtype) + 0.5 + ) + + ref_output = torch.where( + logits > 0, logits / scaling_penalties, logits * scaling_penalties + ) + + kernel_output = sampling_scaling_penalties(logits, scaling_penalties) + + torch.testing.assert_close( + kernel_output, + ref_output, + rtol=rtol, + atol=atol, + msg=f"Failed for batch_size={bs}, vocab_size={vocab_size}, dtype={dtype}", + ) + + +if __name__ == "__main__": + test_sampling_scaling_penalties() + print("All tests passed!") From a879c2fb4cf5e976f24a4dc95d21b2af99a7624b Mon Sep 17 00:00:00 2001 From: Yineng Zhang Date: Mon, 13 Jan 2025 12:27:17 +0800 Subject: [PATCH 025/248] fix sgl-kernel build (#2850) --- .github/workflows/release-pypi-kernel.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/release-pypi-kernel.yml b/.github/workflows/release-pypi-kernel.yml index f046538a6fad..362088c47fd1 100644 --- a/.github/workflows/release-pypi-kernel.yml +++ b/.github/workflows/release-pypi-kernel.yml @@ -22,6 +22,8 @@ jobs: steps: - uses: actions/checkout@v4 + with: + submodules: 'recursive' - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v5 From 85b2e05770ea453bf619d20b5e41679e4b86efb6 Mon Sep 17 00:00:00 2001 From: Ke Bao Date: Mon, 13 Jan 2025 13:16:58 +0800 Subject: [PATCH 026/248] Add int8 quant kernel (#2848) --- .../kernels/quantization/bench_int8_quant.py | 94 +++++++++++++++++++ .../srt/layers/quantization/int8_kernel.py | 53 +++++++++++ 2 files changed, 147 insertions(+) create mode 100644 benchmark/kernels/quantization/bench_int8_quant.py create mode 100644 python/sglang/srt/layers/quantization/int8_kernel.py diff --git a/benchmark/kernels/quantization/bench_int8_quant.py b/benchmark/kernels/quantization/bench_int8_quant.py new file mode 100644 index 000000000000..94b795690bfc --- /dev/null +++ b/benchmark/kernels/quantization/bench_int8_quant.py @@ -0,0 +1,94 @@ +import argparse + +import torch +import triton +from vllm._custom_ops import scaled_int8_quant as vllm_scaled_int8_quant + +from sglang.srt.layers.quantization.int8_kernel import per_token_quant_int8 + + +@torch.compile(backend="inductor") +def torch_int8_quant(x): + int8_max = torch.iinfo(torch.int8).max + + abs_max = x.abs().max(dim=-1, keepdim=True).values + scales = abs_max.to(torch.float32) / float(int8_max) + + q_x = (x / scales).round().to(torch.int8) + + return q_x, scales + + +def _test_accuracy_once(M, K, input_dtype, device): + x = 
torch.randn(M, K, dtype=input_dtype, device=device) * 5000 + out, scales, _ = vllm_scaled_int8_quant(x, symmetric=True) + out1, scales1 = per_token_quant_int8(x) + out2, scales2 = torch_int8_quant(x) + torch.testing.assert_close(out, out2, atol=1, rtol=0) + torch.testing.assert_close(out, out1, atol=1, rtol=0) + torch.testing.assert_close(scales, scales2) + torch.testing.assert_close(scales1, scales2) + print(f"M: {M}, K: {K}, type: {input_dtype} OK") + + +def test_accuracy(): + Ms = [1, 13, 128, 1024, 2048, 4096] + Ks = [512, 1024, 2048, 8192] + input_dtypes = [torch.float16, torch.bfloat16] + for M in Ms: + for K in Ks: + for input_dtype in input_dtypes: + _test_accuracy_once(M, K, input_dtype, "cuda") + + +@triton.testing.perf_report( + triton.testing.Benchmark( + x_names=["batch_size"], + x_vals=[1, 16, 32, 64, 128, 256, 512, 1024, 2048], + x_log=False, + line_arg="provider", + line_vals=["vllm op", "triton", "torch.compile"], + line_names=["vllm op", "triton", "torch.compile"], + styles=[("blue", "-"), ("orange", "-"), ("red", "-")], + ylabel="ms", + plot_name="int8 per token quant", + args={}, + ) +) +def benchmark(batch_size, provider): + M, K = batch_size, 16384 + x = torch.randn(M, K, dtype=torch.float16, device="cuda") * 1000 + + quantiles = [0.5, 0.2, 0.8] + if provider == "vllm op": + ms, min_ms, max_ms = triton.testing.do_bench( + lambda: vllm_scaled_int8_quant(x, symmetric=True), + quantiles=quantiles, + ) + if provider == "triton": + ms, min_ms, max_ms = triton.testing.do_bench( + lambda: per_token_quant_int8(x), + quantiles=quantiles, + ) + if provider == "torch.compile": + ms, min_ms, max_ms = triton.testing.do_bench( + lambda: torch_int8_quant(x), + quantiles=quantiles, + ) + + return ms, min_ms, max_ms + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--save_path", + type=str, + default="./bench_int8_quant_res", + help="Path to save int8 quant benchmark results", + ) + args = parser.parse_args() + + test_accuracy() + + benchmark.run(print_data=True, show_plots=True, save_path=args.save_path) diff --git a/python/sglang/srt/layers/quantization/int8_kernel.py b/python/sglang/srt/layers/quantization/int8_kernel.py new file mode 100644 index 000000000000..d1e74c6044de --- /dev/null +++ b/python/sglang/srt/layers/quantization/int8_kernel.py @@ -0,0 +1,53 @@ +import torch +import triton +import triton.language as tl + + +@triton.jit +def _per_token_quant_int8( + x_ptr, + xq_ptr, + scale_ptr, + stride_x, + stride_xq, + N, + BLOCK: tl.constexpr, +): + # Adapted from https://github.com/InternLM/lmdeploy/blob/086481ed84b59bee3b8e4274e5fc69620040c048/lmdeploy/pytorch/kernels/cuda/w8a8_triton_kernels.py#L282 + row_id = tl.program_id(0) + + cols = tl.arange(0, BLOCK) + mask = cols < N + + x = tl.load(x_ptr + row_id * stride_x + cols, mask=mask, other=0.0).to(tl.float32) + absmax = tl.maximum(tl.max(tl.abs(x)), 1e-10) + scale_x = absmax / 127 + x_q = tl.extra.cuda.libdevice.round(x / scale_x).to(tl.int8) + + tl.store(xq_ptr + row_id * stride_xq + cols, x_q, mask=mask) + tl.store(scale_ptr + row_id, scale_x) + + +def per_token_quant_int8(x): + M = x.numel() // x.shape[-1] + N = x.shape[-1] + x_q = torch.empty_like(x, device=x.device, dtype=torch.int8) + scales = torch.empty(x.shape[:-1] + (1,), device=x.device, dtype=torch.float32) + BLOCK = triton.next_power_of_2(N) + # heuristics for number of warps + num_warps = min(max(BLOCK // 256, 1), 8) + + assert x.is_contiguous() + _per_token_quant_int8[(M,)]( + x, + x_q, + scales, + 
stride_x=x.stride(-2), + stride_xq=x_q.stride(-2), + N=N, + BLOCK=BLOCK, + num_warps=num_warps, + num_stages=1, + ) + + return x_q, scales From 0bb0f7631114b8a4b614ec8f197327ea7fce645d Mon Sep 17 00:00:00 2001 From: bjmsong Date: Mon, 13 Jan 2025 13:17:11 +0800 Subject: [PATCH 027/248] Support FP8 E4M3 KV Cache (#2786) Co-authored-by: root --- .../layers/attention/flashinfer_backend.py | 16 ++++- python/sglang/srt/layers/radix_attention.py | 2 + python/sglang/srt/mem_cache/memory_pool.py | 10 +-- .../sglang/srt/model_executor/model_runner.py | 27 ++++++++ python/sglang/srt/models/llama.py | 33 +++++++++- python/sglang/srt/server_args.py | 15 ++++- python/sglang/srt/utils.py | 6 ++ test/srt/kv_cache_scales_llama3_1_8b.json | 42 ++++++++++++ test/srt/test_fp8_kvcache.py | 64 +++++++++++++++++++ 9 files changed, 205 insertions(+), 10 deletions(-) create mode 100644 test/srt/kv_cache_scales_llama3_1_8b.json create mode 100644 test/srt/test_fp8_kvcache.py diff --git a/python/sglang/srt/layers/attention/flashinfer_backend.py b/python/sglang/srt/layers/attention/flashinfer_backend.py index fc3455b60774..f038394628fd 100644 --- a/python/sglang/srt/layers/attention/flashinfer_backend.py +++ b/python/sglang/srt/layers/attention/flashinfer_backend.py @@ -353,7 +353,9 @@ def forward_extend( if k is not None: assert v is not None if save_kv_cache: - forward_batch.token_to_kv_pool.set_kv_buffer(layer, cache_loc, k, v) + forward_batch.token_to_kv_pool.set_kv_buffer( + layer, cache_loc, k, v, layer.k_scale, layer.v_scale + ) o = prefill_wrapper_paged.forward( q.contiguous().view(-1, layer.tp_q_head_num, layer.head_dim), @@ -362,6 +364,8 @@ def forward_extend( sm_scale=layer.scaling, window_left=layer.sliding_window_size, logits_soft_cap=logits_soft_cap, + k_scale=layer.k_scale, + v_scale=layer.v_scale, ) else: o1, s1 = self.prefill_wrapper_ragged.forward_return_lse( @@ -387,7 +391,9 @@ def forward_extend( o, _ = merge_state(o1, s1, o2, s2) if save_kv_cache: - forward_batch.token_to_kv_pool.set_kv_buffer(layer, cache_loc, k, v) + forward_batch.token_to_kv_pool.set_kv_buffer( + layer, cache_loc, k, v, layer.k_scale, layer.v_scale + ) return o.view(-1, layer.tp_q_head_num * layer.head_dim) @@ -412,13 +418,17 @@ def forward_decode( if k is not None: assert v is not None if save_kv_cache: - forward_batch.token_to_kv_pool.set_kv_buffer(layer, cache_loc, k, v) + forward_batch.token_to_kv_pool.set_kv_buffer( + layer, cache_loc, k, v, layer.k_scale, layer.v_scale + ) o = decode_wrapper.forward( q.contiguous().view(-1, layer.tp_q_head_num, layer.head_dim), forward_batch.token_to_kv_pool.get_kv_buffer(layer.layer_id), sm_scale=layer.scaling, logits_soft_cap=layer.logit_cap, + k_scale=layer.k_scale, + v_scale=layer.v_scale, ) return o.view(-1, layer.tp_q_head_num * layer.head_dim) diff --git a/python/sglang/srt/layers/radix_attention.py b/python/sglang/srt/layers/radix_attention.py index 4b762c00ba55..a449d7188a46 100644 --- a/python/sglang/srt/layers/radix_attention.py +++ b/python/sglang/srt/layers/radix_attention.py @@ -47,6 +47,8 @@ def __init__( self.logit_cap = logit_cap self.sliding_window_size = sliding_window_size or -1 self.is_cross_attention = is_cross_attention + self.k_scale = 1.0 + self.v_scale = 1.0 def forward( self, diff --git a/python/sglang/srt/mem_cache/memory_pool.py b/python/sglang/srt/mem_cache/memory_pool.py index b67f085b204b..6cb186577238 100644 --- a/python/sglang/srt/mem_cache/memory_pool.py +++ b/python/sglang/srt/mem_cache/memory_pool.py @@ -109,8 +109,8 @@ def __init__( ): self.size = 
size self.dtype = dtype - if dtype == torch.float8_e5m2: - # NOTE: Store as torch.uint8 because Tensor index_put is not implemented for torch.float8_e5m2 + if dtype in (torch.float8_e5m2, torch.float8_e4m3fn): + # NOTE: Store as torch.uint8 because Tensor.index_put is not implemented for torch.float8_e5m2 self.store_dtype = torch.uint8 else: self.store_dtype = dtype @@ -256,11 +256,13 @@ def set_kv_buffer( loc: torch.Tensor, cache_k: torch.Tensor, cache_v: torch.Tensor, + k_scale: float = 1.0, + v_scale: float = 1.0, ): layer_id = layer.layer_id if cache_k.dtype != self.dtype: - cache_k = cache_k.to(self.dtype) - cache_v = cache_v.to(self.dtype) + cache_k = (cache_k / k_scale).to(self.dtype) + cache_v = (cache_v / v_scale).to(self.dtype) if self.store_dtype != self.dtype: self.k_buffer[layer_id][loc] = cache_k.view(self.store_dtype) self.v_buffer[layer_id][loc] = cache_v.view(self.store_dtype) diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index efba8c25b504..d46a2c0dc725 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -54,6 +54,7 @@ enable_show_time_cost, get_available_gpu_memory, init_custom_process_group, + is_cuda, is_hip, monkey_patch_vllm_gguf_config, monkey_patch_vllm_p2p_access_check, @@ -277,6 +278,29 @@ def load_model(self): device_config=DeviceConfig(self.device), ) + if self.server_args.kv_cache_dtype == "fp8_e4m3": + if self.server_args.quantization_param_path is not None: + if callable(getattr(self.model, "load_kv_cache_scales", None)): + self.model.load_kv_cache_scales( + self.server_args.quantization_param_path + ) + logger.info( + "Loaded KV cache scaling factors from %s", + self.server_args.quantization_param_path, + ) + else: + raise RuntimeError( + "Using FP8 KV cache and scaling factors provided but " + "model %s does not support loading scaling factors.", + self.model.__class__, + ) + else: + logger.warning( + "Using FP8 KV cache but no scaling factors " + "provided. Defaulting to scaling factors of 1.0. " + "This may lead to less accurate results!" + ) + # Parse other args self.sliding_window_size = ( self.model.get_attention_sliding_window_size() @@ -516,6 +540,9 @@ def init_memory_pool( self.kv_cache_dtype = torch.float8_e5m2fnuz else: self.kv_cache_dtype = torch.float8_e5m2 + elif self.server_args.kv_cache_dtype == "fp8_e4m3": + if is_cuda(): + self.kv_cache_dtype = torch.float8_e4m3fn else: raise ValueError( f"Unsupported kv_cache_dtype: {self.server_args.kv_cache_dtype}." diff --git a/python/sglang/srt/models/llama.py b/python/sglang/srt/models/llama.py index e1688df01a8c..d606e52f8b8d 100644 --- a/python/sglang/srt/models/llama.py +++ b/python/sglang/srt/models/llama.py @@ -22,8 +22,12 @@ import torch from torch import nn from transformers import LlamaConfig -from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.distributed import ( + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, +) from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.model_loader.weight_utils import kv_cache_scales_loader from sglang.srt.layers.activation import SiluAndMul from sglang.srt.layers.layernorm import RMSNorm @@ -299,6 +303,30 @@ def forward( hidden_states, _ = self.norm(hidden_states, residual) return hidden_states + # If this function is called, it should always initialize KV cache scale + # factors (or else raise an exception). 
Thus, handled exceptions should + # make sure to leave KV cache scale factors in a known good (dummy) state + def load_kv_cache_scales(self, quantization_param_path: str) -> None: + tp_size = get_tensor_model_parallel_world_size() + tp_rank = get_tensor_model_parallel_rank() + for layer_idx, scaling_factor in kv_cache_scales_loader( + quantization_param_path, + tp_rank, + tp_size, + self.config.num_hidden_layers, + self.config.__class__.model_type, + ): + if not isinstance(self.layers[layer_idx], nn.Identity): + layer_self_attn = self.layers[layer_idx].self_attn + + if hasattr(layer_self_attn.attn, "k_scale"): + layer_self_attn.attn.k_scale = scaling_factor + layer_self_attn.attn.v_scale = scaling_factor + else: + raise RuntimeError( + "Self attention has no KV cache scaling " "factor attribute!" + ) + class LlamaForCausalLM(nn.Module): @@ -534,6 +562,9 @@ def set_embed_and_head(self, embed, head): torch.cuda.empty_cache() torch.cuda.synchronize() + def load_kv_cache_scales(self, quantization_param_path: str) -> None: + self.model.load_kv_cache_scales(quantization_param_path) + class Phi3ForCausalLM(LlamaForCausalLM): pass diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index 66739652aa9d..be85a3670d40 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -32,6 +32,7 @@ is_hip, is_ipv6, is_port_available, + nullable_str, ) logger = logging.getLogger(__name__) @@ -47,6 +48,7 @@ class ServerArgs: trust_remote_code: bool = True dtype: str = "auto" kv_cache_dtype: str = "auto" + quantization_param_path: nullable_str = None quantization: Optional[str] = None context_length: Optional[int] = None device: str = "cuda" @@ -350,8 +352,17 @@ def add_cli_args(parser: argparse.ArgumentParser): "--kv-cache-dtype", type=str, default=ServerArgs.kv_cache_dtype, - choices=["auto", "fp8_e5m2"], - help='Data type for kv cache storage. "auto" will use model data type. "fp8_e5m2" is supported for CUDA 11.8+.', + choices=["auto", "fp8_e5m2", "fp8_e4m3"], + help='Data type for kv cache storage. "auto" will use model data type. "fp8_e5m2" and "fp8_e4m3" is supported for CUDA 11.8+.', + ) + parser.add_argument( + "--quantization-param-path", + type=nullable_str, + default=None, + help="Path to the JSON file containing the KV cache " + "scaling factors. This should generally be supplied, when " + "KV cache dtype is FP8. Otherwise, KV cache scaling factors " + "default to 1.0, which may cause accuracy issues. 
", ) parser.add_argument( "--quantization", diff --git a/python/sglang/srt/utils.py b/python/sglang/srt/utils.py index b07f6f01d184..af9bdd60b66f 100644 --- a/python/sglang/srt/utils.py +++ b/python/sglang/srt/utils.py @@ -1375,3 +1375,9 @@ def wrapper(*args, **kwargs): return func(*args, **kwargs) return wrapper + + +def nullable_str(val: str): + if not val or val == "None": + return None + return val diff --git a/test/srt/kv_cache_scales_llama3_1_8b.json b/test/srt/kv_cache_scales_llama3_1_8b.json new file mode 100644 index 000000000000..3e890e50e4af --- /dev/null +++ b/test/srt/kv_cache_scales_llama3_1_8b.json @@ -0,0 +1,42 @@ +{ + "model_type": "llama", + "kv_cache": { + "dtype": "float8_e4m3fn", + "scaling_factor": { + "0": { + "0": 1, + "1": 1, + "2": 1, + "3": 1, + "4": 1, + "5": 1, + "6": 1, + "7": 1, + "8": 1, + "9": 1, + "10": 1, + "11": 1, + "12": 1, + "13": 1, + "14": 1, + "15": 1, + "16": 1, + "17": 1, + "18": 1, + "19": 1, + "20": 1, + "21": 1, + "22": 1, + "23": 1, + "24": 1, + "25": 1, + "26": 1, + "27": 1, + "28": 1, + "29": 1, + "30": 1, + "31": 1 + } + } + } +} diff --git a/test/srt/test_fp8_kvcache.py b/test/srt/test_fp8_kvcache.py new file mode 100644 index 000000000000..0d6602997de5 --- /dev/null +++ b/test/srt/test_fp8_kvcache.py @@ -0,0 +1,64 @@ +import os +import unittest +from types import SimpleNamespace + +from sglang.srt.utils import kill_process_tree +from sglang.test.run_eval import run_eval +from sglang.test.test_utils import ( + DEFAULT_MODEL_NAME_FOR_TEST, + DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + DEFAULT_URL_FOR_TEST, + popen_launch_server, +) + + +class TestFp8Kvcache(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.model = DEFAULT_MODEL_NAME_FOR_TEST + cls.base_url = DEFAULT_URL_FOR_TEST + dirpath = os.path.dirname(__file__) + config_file = os.path.join(dirpath, "kv_cache_scales_llama3_8b_chat.json") + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=[ + "--kv-cache-dtype", + "fp8_e4m3", + "--quantization-param-path", + config_file, + ], + ) + + @classmethod + def tearDownClass(cls): + kill_process_tree(cls.process.pid) + + def test_mgsm_en(self): + args = SimpleNamespace( + base_url=self.base_url, + model=self.model, + eval_name="mgsm_en", + num_examples=None, + num_threads=1024, + ) + + metrics = run_eval(args) + self.assertGreater(metrics["score"], 0.835) + + def test_mmlu(self): + args = SimpleNamespace( + base_url=self.base_url, + model=self.model, + eval_name="mmlu", + num_examples=64, + num_threads=32, + ) + + metrics = run_eval(args) + self.assertGreaterEqual(metrics["score"], 0.65) + + +if __name__ == "__main__": + unittest.main() From a18ab81ddd505fed4b663c1e3b6df81e6613484a Mon Sep 17 00:00:00 2001 From: sogalin <39478626+sogalin@users.noreply.github.com> Date: Mon, 13 Jan 2025 14:39:44 +0800 Subject: [PATCH 028/248] Update base image for ROCm (#2852) Co-authored-by: HAI --- docker/Dockerfile.rocm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index 9b1d67b5e4f7..7e6ae193aae5 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -2,7 +2,7 @@ # docker build --build-arg SGL_BRANCH=v0.4.1.post5 -t v0.4.1.post5-rocm620 -f Dockerfile.rocm . 
# default base image -ARG BASE_IMAGE="rocmshared/vllm-rocm:20241031-tuned" +ARG BASE_IMAGE="rocmshared/vllm-rocm:20250113-tuned-elementwise" FROM $BASE_IMAGE AS base USER root From e808c1df3e046d2c590efa32a22ebcb8741593ed Mon Sep 17 00:00:00 2001 From: kk <43161300+kkHuang-amd@users.noreply.github.com> Date: Mon, 13 Jan 2025 16:23:07 +0800 Subject: [PATCH 029/248] Integrate ROCm ater package for ck moe function feasibility (#2854) Co-authored-by: wunhuang Co-authored-by: Lin, Soga --- docker/Dockerfile.rocm | 9 ++ .../srt/layers/moe/fused_moe_triton/layer.py | 45 ++++-- python/sglang/srt/layers/quantization/fp8.py | 143 ++++++++++++------ python/sglang/srt/utils.py | 19 +++ 4 files changed, 162 insertions(+), 54 deletions(-) diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index 7e6ae193aae5..2ad62d2d493d 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -16,6 +16,10 @@ ARG SGL_BRANCH=${SGL_DEFAULT} ARG TRITON_REPO="https://github.com/triton-lang/triton.git" ARG TRITON_COMMIT="845d75a" + +ARG ATER_REPO="https://github.com/HaiShaw/ater" +ARG CK_COMMITS="fa05ae" + RUN git clone ${SGL_REPO} \ && cd sglang \ && if [ "${SGL_BRANCH}" = ${SGL_DEFAULT} ]; then \ @@ -46,6 +50,11 @@ RUN git clone ${TRITON_REPO} \ && cd python \ && python3 setup.py install +RUN git clone ${ATER_REPO} \ + && cd ater \ + && git submodule update --init --recursive \ + && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py develop + # Performance environment variable. ENV HIP_FORCE_DEV_KERNARG=1 diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/layer.py b/python/sglang/srt/layers/moe/fused_moe_triton/layer.py index 8d0b7035ee50..e1064bcdabd1 100644 --- a/python/sglang/srt/layers/moe/fused_moe_triton/layer.py +++ b/python/sglang/srt/layers/moe/fused_moe_triton/layer.py @@ -1,5 +1,6 @@ # Adapted from https://github.com/vllm-project/vllm/blob/a6221a144af772fd1a68fe7e627935dc53e81738/vllm/model_executor/layers/fused_moe/layer.py +import os from abc import abstractmethod from enum import Enum from typing import Callable, List, Optional, Tuple @@ -18,7 +19,7 @@ QuantizationConfig, QuantizeMethodBase, ) -from sglang.srt.utils import set_weight_attrs +from sglang.srt.utils import is_hip, permute_weight, set_weight_attrs if torch.cuda.is_available(): from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_experts @@ -97,6 +98,20 @@ def create_weights( layer.register_parameter("w2_weight", w2_weight) set_weight_attrs(w2_weight, extra_weight_attrs) + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + if is_hip() and bool(int(os.getenv("CK_MOE", "0"))): + layer.w13_weight = torch.nn.Parameter( + permute_weight(layer.w13_weight.data), + requires_grad=False, + ) + torch.cuda.empty_cache() + layer.w2_weight = torch.nn.Parameter( + permute_weight(layer.w2_weight.data), + requires_grad=False, + ) + torch.cuda.empty_cache() + return + def apply( self, layer: torch.nn.Module, @@ -148,14 +163,26 @@ def forward_cuda( correction_bias=correction_bias, ) - return fused_experts( - hidden_states=x, - w1=layer.w13_weight, - w2=layer.w2_weight, - topk_weights=topk_weights, - topk_ids=topk_ids, - inplace=True, - ) + if is_hip() and bool(int(os.getenv("CK_MOE", "0"))): + import ater + from ater.fused_moe import fused_experts_ck + + return fused_experts_ck( + hidden_states=x, + w1=layer.w13_weight, + w2=layer.w2_weight, + topk_weights=topk_weights, + topk_ids=topk_ids, + ) + else: + return fused_experts( + hidden_states=x, + w1=layer.w13_weight, + w2=layer.w2_weight, + 
topk_weights=topk_weights, + topk_ids=topk_ids, + inplace=True, + ) def forward_cpu(self, *args, **kwargs): raise NotImplementedError("The CPU backend currently does not support MoE.") diff --git a/python/sglang/srt/layers/quantization/fp8.py b/python/sglang/srt/layers/quantization/fp8.py index f9e4a8a4ff45..22a43675bf85 100644 --- a/python/sglang/srt/layers/quantization/fp8.py +++ b/python/sglang/srt/layers/quantization/fp8.py @@ -40,6 +40,7 @@ from sglang.srt.utils import ( get_bool_env_var, is_hip, + permute_weight, print_warning_once, set_weight_attrs, ) @@ -616,18 +617,30 @@ def process_weights_after_loading(self, layer: Module) -> None: layer.w13_weight = torch.nn.Parameter(w13_weight, requires_grad=False) layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False) - # If ROCm, apply weight padding (min. Mem channel contention) only if set - if is_hip() and bool(int(os.getenv("MOE_PADDING", "0"))): - layer.w13_weight = torch.nn.Parameter( - F.pad(layer.w13_weight.data, (0, padding_size), "constant", 0), - requires_grad=False, - ) - torch.cuda.empty_cache() - layer.w2_weight = torch.nn.Parameter( - F.pad(layer.w2_weight.data, (0, padding_size), "constant", 0), - requires_grad=False, - ) - torch.cuda.empty_cache() + if is_hip(): + if bool(int(os.getenv("CK_MOE", "0"))): + layer.w13_weight = torch.nn.Parameter( + permute_weight(layer.w13_weight.data), + requires_grad=False, + ) + torch.cuda.empty_cache() + layer.w2_weight = torch.nn.Parameter( + permute_weight(layer.w2_weight.data), + requires_grad=False, + ) + torch.cuda.empty_cache() + elif bool(int(os.getenv("MOE_PADDING", "0"))): + # If ROCm, apply weight padding (min. Mem channel contention) only if set + layer.w13_weight = torch.nn.Parameter( + F.pad(layer.w13_weight.data, (0, padding_size), "constant", 0), + requires_grad=False, + ) + torch.cuda.empty_cache() + layer.w2_weight = torch.nn.Parameter( + F.pad(layer.w2_weight.data, (0, padding_size), "constant", 0), + requires_grad=False, + ) + torch.cuda.empty_cache() return # If checkpoint is fp8, we need to handle that the @@ -708,18 +721,30 @@ def process_weights_after_loading(self, layer: Module) -> None: max_w13_scales, requires_grad=False ) - # If ROCm, apply weight padding (min. Mem channel contention) only if set - if is_hip() and bool(int(os.getenv("MOE_PADDING", "0"))): - layer.w13_weight = torch.nn.Parameter( - F.pad(layer.w13_weight.data, (0, padding_size), "constant", 0), - requires_grad=False, - ) - torch.cuda.empty_cache() - layer.w2_weight = torch.nn.Parameter( - F.pad(layer.w2_weight.data, (0, padding_size), "constant", 0), - requires_grad=False, - ) - torch.cuda.empty_cache() + if is_hip(): + if bool(int(os.getenv("CK_MOE", "0"))): + layer.w13_weight = torch.nn.Parameter( + permute_weight(layer.w13_weight.data), + requires_grad=False, + ) + torch.cuda.empty_cache() + layer.w2_weight = torch.nn.Parameter( + permute_weight(layer.w2_weight.data), + requires_grad=False, + ) + torch.cuda.empty_cache() + elif bool(int(os.getenv("MOE_PADDING", "0"))): + # If ROCm, apply weight padding (min. 
Mem channel contention) only if set + layer.w13_weight = torch.nn.Parameter( + F.pad(layer.w13_weight.data, (0, padding_size), "constant", 0), + requires_grad=False, + ) + torch.cuda.empty_cache() + layer.w2_weight = torch.nn.Parameter( + F.pad(layer.w2_weight.data, (0, padding_size), "constant", 0), + requires_grad=False, + ) + torch.cuda.empty_cache() return def apply( @@ -752,27 +777,55 @@ def apply( correction_bias=correction_bias, ) - # Expert fusion with FP8 quantization - return fused_experts( - x, - layer.w13_weight, - layer.w2_weight, - topk_weights=topk_weights, - topk_ids=topk_ids, - inplace=True, - use_fp8_w8a8=True, - w1_scale=( - layer.w13_weight_scale_inv - if self.block_quant - else layer.w13_weight_scale - ), - w2_scale=( - layer.w2_weight_scale_inv if self.block_quant else layer.w2_weight_scale - ), - a1_scale=layer.w13_input_scale, - a2_scale=layer.w2_input_scale, - block_shape=self.quant_config.weight_block_size, - ) + if is_hip() and bool(int(os.getenv("CK_MOE", "0"))): + import ater + from ater.fused_moe import fused_experts_ck + + return fused_experts_ck( + x, + layer.w13_weight, + layer.w2_weight, + topk_weights=topk_weights, + topk_ids=topk_ids, + use_fp8_w8a8=True, + w1_scale=( + layer.w13_weight_scale_inv + if self.block_quant + else layer.w13_weight_scale + ), + w2_scale=( + layer.w2_weight_scale_inv + if self.block_quant + else layer.w2_weight_scale + ), + a1_scale=layer.w13_input_scale, + a2_scale=layer.w2_input_scale, + ) + + else: + # Expert fusion with FP8 quantization + return fused_experts( + x, + layer.w13_weight, + layer.w2_weight, + topk_weights=topk_weights, + topk_ids=topk_ids, + inplace=True, + use_fp8_w8a8=True, + w1_scale=( + layer.w13_weight_scale_inv + if self.block_quant + else layer.w13_weight_scale + ), + w2_scale=( + layer.w2_weight_scale_inv + if self.block_quant + else layer.w2_weight_scale + ), + a1_scale=layer.w13_input_scale, + a2_scale=layer.w2_input_scale, + block_shape=self.quant_config.weight_block_size, + ) class Fp8KVCacheMethod(BaseKVCacheMethod): diff --git a/python/sglang/srt/utils.py b/python/sglang/srt/utils.py index af9bdd60b66f..51ca91a96b0d 100644 --- a/python/sglang/srt/utils.py +++ b/python/sglang/srt/utils.py @@ -1340,6 +1340,25 @@ def parse_tool_response(text, tools, **kwargs): return text, call_info_list +def permute_weight(x: torch.Tensor) -> torch.Tensor: + b_ = x.shape[0] + n_ = x.shape[1] + k_ = x.shape[2] + + x_ = x + if x.dtype == torch.bfloat16 or x.dtype == torch.float16: + x_ = x_.view(int(b_), int(n_ / 16), 16, int(k_ / 32), 4, 8) + elif x.dtype == torch.float8_e4m3fnuz or x.dtype == torch.int8: + x_ = x_.view(int(b_), int(n_ / 16), 16, int(k_ / 64), 4, 16) + else: + return x_ + + x_ = x_.permute(0, 1, 3, 4, 2, 5) + x_ = x_.contiguous() + x_ = x_.view(*x.shape) + return x_ + + class MultiprocessingSerializer: @staticmethod def serialize(obj): From 4093aa4660838c42a51f860989450b7d4c480436 Mon Sep 17 00:00:00 2001 From: justdoit <24875266+coolhok@users.noreply.github.com> Date: Mon, 13 Jan 2025 17:01:21 +0800 Subject: [PATCH 030/248] [Fix]eagle2 health_generate is first request,apiserver will core (#2853) --- python/sglang/srt/speculative/eagle_worker.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/sglang/srt/speculative/eagle_worker.py b/python/sglang/srt/speculative/eagle_worker.py index 0e53506a8840..2a6ec96048bb 100644 --- a/python/sglang/srt/speculative/eagle_worker.py +++ b/python/sglang/srt/speculative/eagle_worker.py @@ -40,6 +40,7 @@ def __init__( ) self.target_worker = target_worker 
self.server_args = server_args + self.finish_extend_len = [] # Share the embedding and lm_head embed, head = self.target_worker.model_runner.model.get_embed_and_head() From 72c77763559317b2c8bddfd67e173b67aa1facb0 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Mon, 13 Jan 2025 01:39:14 -0800 Subject: [PATCH 031/248] Fix linear.py and improve weight loading (#2851) Co-authored-by: SangBin Cho --- benchmark/deepseek_v3/README.md | 7 +- docs/references/supported_models.md | 2 +- python/sglang/srt/layers/linear.py | 134 +++++------------- python/sglang/srt/layers/moe/topk.py | 6 +- python/sglang/srt/layers/parameter.py | 40 +++--- .../srt/layers/quantization/fp8_utils.py | 2 +- .../srt/layers/quantization/modelopt_quant.py | 2 +- .../srt/layers/vocab_parallel_embedding.py | 17 ++- python/sglang/srt/managers/scheduler.py | 4 + python/sglang/srt/mem_cache/memory_pool.py | 19 +++ python/sglang/srt/server.py | 3 + test/srt/test_moe_eval_accuracy_large.py | 2 +- 12 files changed, 113 insertions(+), 125 deletions(-) diff --git a/benchmark/deepseek_v3/README.md b/benchmark/deepseek_v3/README.md index d14a8d55630c..5c353bca5c79 100644 --- a/benchmark/deepseek_v3/README.md +++ b/benchmark/deepseek_v3/README.md @@ -39,7 +39,7 @@ python3 -m sglang.launch_server --model deepseek-ai/DeepSeek-V3 --tp 8 --trust-r For high QPS scenarios, add the `--enable-dp-attention` argument to boost throughput. -### Example with OpenAI API +### Example: Sending requests with OpenAI API ```python3 import openai @@ -58,7 +58,8 @@ response = client.chat.completions.create( ) print(response) ``` -### Example serving with 2 H20*8 + +### Example: Serving with two H20*8 nodes For example, there are two H20 nodes, each with 8 GPUs. The first node's IP is `10.0.0.1`, and the second node's IP is `10.0.0.2`. ```bash @@ -71,7 +72,7 @@ python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3 --tp 16 --di If you have two H100 nodes, the usage is similar to the aforementioned H20. -### Example serving with Docker two H200*8 nodes +### Example: Serving with two H200*8 nodes and docker There are two H200 nodes, each with 8 GPUs. The first node's IP is `192.168.114.10`, and the second node's IP is `192.168.114.11`. Configure the endpoint to expose it to another Docker container using `--host 0.0.0.0` and `--port 40000`, and set up communications with `--dist-init-addr 192.168.114.10:20000`. A single H200 with 8 devices can run DeepSeek V3, the dual H200 setup is just to demonstrate multi-node usage. 
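A minimal sketch of the per-node launch commands for this H200 setup, assuming the same multi-node flags as the H20 example above (`--dist-init-addr`, `--nnodes`, `--node-rank`) with the addresses and ports named in this paragraph substituted in; the Docker invocation itself is omitted here and is not taken from this patch.

```bash
# node 1 (192.168.114.10): also exposes the HTTP endpoint on port 40000
python3 -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3 --tp 16 \
    --dist-init-addr 192.168.114.10:20000 --nnodes 2 --node-rank 0 \
    --trust-remote-code --host 0.0.0.0 --port 40000

# node 2 (192.168.114.11)
python3 -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3 --tp 16 \
    --dist-init-addr 192.168.114.10:20000 --nnodes 2 --node-rank 1 \
    --trust-remote-code
```

Requests can then be sent to `http://192.168.114.10:40000` from another container using the OpenAI client example shown earlier in this README.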
diff --git a/docs/references/supported_models.md b/docs/references/supported_models.md index 9dafc3d2a3d7..1cc7b874732d 100644 --- a/docs/references/supported_models.md +++ b/docs/references/supported_models.md @@ -5,7 +5,7 @@ - Mistral / Mixtral / Mistral NeMo - Gemma / Gemma 2 - Qwen / Qwen 2 / Qwen 2 MoE / Qwen 2 VL -- DeepSeek / DeepSeek 2 +- DeepSeek / DeepSeek 2 / [DeepSeek 3](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3) - OLMoE - [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/) - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-7b-ov --port=30000 --chat-template=chatml-llava` diff --git a/python/sglang/srt/layers/linear.py b/python/sglang/srt/layers/linear.py index b839deeb3251..ee9386c13fa3 100644 --- a/python/sglang/srt/layers/linear.py +++ b/python/sglang/srt/layers/linear.py @@ -1,4 +1,4 @@ -# Adapted from https://raw.githubusercontent.com/vllm-project/vllm/v0.5.5/vllm/model_executor/layers/linear.py +"""Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/model_executor/layers/linear.py""" import logging from abc import abstractmethod @@ -16,7 +16,7 @@ tensor_model_parallel_all_reduce, ) -# workaround +# Workaround: many QuantizationConfig still depends on this, so we have to use vLLM's LinearBase now. from vllm.model_executor.layers.linear import LinearBase from sglang.srt.layers.parameter import ( @@ -25,7 +25,6 @@ PackedvLLMParameter, PerTensorScaleParameter, RowvLLMParameter, - _ColumnvLLMParameter, ) from sglang.srt.layers.quantization.base_config import ( QuantizationConfig, @@ -43,9 +42,13 @@ "GPTQMarlinLinearMethod", "Fp8LinearMethod", "MarlinLinearMethod", - "GPTQLinearMethod", "QQQLinearMethod", + "GPTQMarlin24LinearMethod", + "TPUInt8LinearMethod", + "GPTQLinearMethod", + "FBGEMMFp8LinearMethod", "ModelOptFp8LinearMethod", + "IPEXAWQLinearMethod", ] @@ -95,62 +98,6 @@ def adjust_scalar_to_fused_array(param, loaded_weight, shard_id): return param[shard_id], loaded_weight -def load_column_qkv_weight( - self, loaded_weight, num_heads, shard_id, shard_offset, shard_size, tp_rank -): - if ( - isinstance(self, (PackedColumnParameter, PackedvLLMParameter)) - and self.output_dim == self.packed_dim - ): - shard_size, shard_offset = self.adjust_shard_indexes_for_packing( - shard_offset=shard_offset, shard_size=shard_size - ) - - param_data = self.data - shard_id = tp_rank if shard_id == "q" else tp_rank // num_heads - param_data = param_data.narrow(self.output_dim, shard_offset, shard_size) - loaded_weight = loaded_weight.narrow( - self.output_dim, shard_id * shard_size, shard_size - ) - - assert param_data.shape == loaded_weight.shape - param_data.copy_(loaded_weight) - - -def load_column_parallel_weight( - self, loaded_weight: torch.Tensor, tp_rank, use_presharded_weights: bool = False -): - if isinstance(self, _ColumnvLLMParameter): - if not use_presharded_weights: - shard_size = self.data.shape[self.output_dim] - loaded_weight = loaded_weight.narrow( - self.output_dim, tp_rank * shard_size, shard_size - ) - assert self.data.shape == loaded_weight.shape - self.data.copy_(loaded_weight) - else: - self.data.copy_(loaded_weight) - - -def load_row_parallel_weight( - self, loaded_weight: torch.Tensor, tp_rank, use_presharded_weights: bool = False -): - if isinstance(self, RowvLLMParameter): - if not use_presharded_weights: - shard_size = self.data.shape[self.input_dim] - loaded_weight = loaded_weight.narrow( - self.input_dim, tp_rank * shard_size, shard_size - ) - - if 
len(loaded_weight.shape) == 0: - loaded_weight = loaded_weight.reshape(1) - - assert self.data.shape == loaded_weight.shape - self.data.copy_(loaded_weight) - else: - self.data.copy_(loaded_weight) - - class LinearMethodBase(QuantizeMethodBase): """Base class for different (maybe quantized) linear methods.""" @@ -426,9 +373,7 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): if len(loaded_weight.shape) == 0: loaded_weight = loaded_weight.reshape(1) - assert ( - param_data.shape == loaded_weight.shape - ), f"{param_data.shape=}, {loaded_weight.shape=}" + assert param_data.shape == loaded_weight.shape param_data.copy_(loaded_weight) def weight_loader_v2(self, param: Parameter, loaded_weight: torch.Tensor): @@ -437,7 +382,7 @@ def weight_loader_v2(self, param: Parameter, loaded_weight: torch.Tensor): if len(loaded_weight.shape) == 0: assert loaded_weight.numel() == 1 loaded_weight = loaded_weight.reshape(1) - param.load_column_parallel_weight(loaded_weight=loaded_weight) + param.load_column_parallel_weight(loaded_weight, tp_rank=self.tp_rank) def forward(self, input_): bias = self.bias if not self.skip_bias_add else None @@ -565,9 +510,7 @@ def weight_loader( param_data, loaded_weight, 0 ) - assert ( - param_data.shape == loaded_weight.shape - ), f"{param_data.shape=}, {loaded_weight.shape=}" + assert param_data.shape == loaded_weight.shape param_data.copy_(loaded_weight) return current_shard_offset = 0 @@ -643,9 +586,7 @@ def weight_loader( "the same for all partitions." ) - assert ( - param_data.shape == loaded_weight.shape - ), f"{param_data.shape=}, {loaded_weight.shape=}" + assert param_data.shape == loaded_weight.shape param_data.copy_(loaded_weight) def _load_fused_module_from_checkpoint( @@ -697,6 +638,7 @@ def weight_loader_v2( elif type(param) in (RowvLLMParameter, BasevLLMParameter): param.load_merged_column_weight(loaded_weight=loaded_weight) return + # TODO: @dsikka - move to parameter.py self._load_fused_module_from_checkpoint(param, loaded_weight) return @@ -882,6 +824,7 @@ def weight_loader_v2( elif type(param) in (RowvLLMParameter, BasevLLMParameter): param.load_qkv_weight(loaded_weight=loaded_weight) return + # TODO: @dsikka - move to parameter.py self._load_fused_module_from_checkpoint(param, loaded_weight) return @@ -896,24 +839,14 @@ def weight_loader_v2( shard_offset = (shard_offset + block_n - 1) // block_n shard_size = (shard_size + block_n - 1) // block_n - if isinstance(param, _ColumnvLLMParameter): - load_column_qkv_weight( - param, - loaded_weight, - num_heads=self.num_kv_head_replicas, - shard_id=loaded_shard_id, - shard_offset=shard_offset, - shard_size=shard_size, - tp_rank=self.tp_rank, - ) - else: - param.load_qkv_weight( - loaded_weight=loaded_weight, - num_heads=self.num_kv_head_replicas, - shard_id=loaded_shard_id, - shard_offset=shard_offset, - shard_size=shard_size, - ) + param.load_qkv_weight( + loaded_weight=loaded_weight, + num_heads=self.num_kv_head_replicas, + shard_id=loaded_shard_id, + shard_offset=shard_offset, + shard_size=shard_size, + tp_rank=self.tp_rank, + ) def weight_loader( self, @@ -962,9 +895,7 @@ def weight_loader( param_data, loaded_weight, 0 ) - assert ( - param_data.shape == loaded_weight.shape - ), f"{param_data.shape=}, {loaded_weight.shape=}" + assert param_data.shape == loaded_weight.shape param_data.copy_(loaded_weight) return shard_offsets = [ @@ -1105,9 +1036,7 @@ def weight_loader( "for all partitions." 
) - assert ( - param_data.shape == loaded_weight.shape - ), f"{param_data.shape=}, {loaded_weight.shape=}" + assert param_data.shape == loaded_weight.shape param_data.copy_(loaded_weight) @@ -1234,9 +1163,7 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): if len(loaded_weight.shape) == 0: loaded_weight = loaded_weight.reshape(1) - assert ( - param_data.shape == loaded_weight.shape - ), f"{param_data.shape=}, {loaded_weight.shape=}" + assert param_data.shape == loaded_weight.shape param_data.copy_(loaded_weight) def weight_loader_v2(self, param: BasevLLMParameter, loaded_weight: torch.Tensor): @@ -1247,7 +1174,18 @@ def weight_loader_v2(self, param: BasevLLMParameter, loaded_weight: torch.Tensor assert loaded_weight.numel() == 1 loaded_weight = loaded_weight.reshape(1) - param.load_row_parallel_weight(loaded_weight=loaded_weight) + if isinstance(param, BasevLLMParameter): + # This `BasevLLMParameter` is defined in sglang/srt/layers/parameter.py, + # It supports additional parameters like tp_rank and use_presharded_weights. + param.load_row_parallel_weight( + loaded_weight, + tp_rank=self.tp_rank, + use_presharded_weights=self.use_presharded_weights, + ) + else: + # `params` is defined in `vllm/model_executor/parameter.py`, + # It does not support additional parameters. + param.load_row_parallel_weight(loaded_weight) def forward(self, input_): if self.input_is_parallel: diff --git a/python/sglang/srt/layers/moe/topk.py b/python/sglang/srt/layers/moe/topk.py index 8190321988dc..527a7d499b6a 100644 --- a/python/sglang/srt/layers/moe/topk.py +++ b/python/sglang/srt/layers/moe/topk.py @@ -24,7 +24,9 @@ def fused_topk_native( topk: int, renormalize: bool, ): - assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch" + assert ( + hidden_states.shape[0] == gating_output.shape[0] + ), f"Number of tokens mismatch, {hidden_states.shape=} vs {gating_output.shape=}" M, _ = hidden_states.shape topk_weights = torch.empty( M, topk, dtype=torch.float32, device=hidden_states.device @@ -180,7 +182,7 @@ def select_experts( num_expert_group=num_expert_group, topk_group=topk_group, ) - elif torch_native: + elif torch_native and custom_routing_function is None: topk_weights, topk_ids = fused_topk_native( hidden_states=hidden_states, gating_output=router_logits, diff --git a/python/sglang/srt/layers/parameter.py b/python/sglang/srt/layers/parameter.py index 435cc69bb51d..fe999baa2660 100644 --- a/python/sglang/srt/layers/parameter.py +++ b/python/sglang/srt/layers/parameter.py @@ -1,7 +1,4 @@ -""" -Adapted from vLLM (0.6.4.post1). 
-https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/parameter.py -""" +"""Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/model_executor/parameter.py""" import logging from fractions import Fraction @@ -88,12 +85,17 @@ def __init__(self, output_dim: int, **kwargs): def output_dim(self): return self._output_dim - def load_column_parallel_weight(self, loaded_weight: torch.Tensor): - tp_rank = get_tensor_model_parallel_rank() - shard_size = self.data.shape[self.output_dim] - loaded_weight = loaded_weight.narrow( - self.output_dim, tp_rank * shard_size, shard_size - ) + def load_column_parallel_weight( + self, + loaded_weight: torch.Tensor, + tp_rank: int, + use_presharded_weights: bool = False, + ): + if not use_presharded_weights: + shard_size = self.data.shape[self.output_dim] + loaded_weight = loaded_weight.narrow( + self.output_dim, tp_rank * shard_size, shard_size + ) assert self.data.shape == loaded_weight.shape self.data.copy_(loaded_weight) @@ -121,7 +123,7 @@ def load_merged_column_weight(self, loaded_weight: torch.Tensor, **kwargs): assert param_data.shape == loaded_weight.shape param_data.copy_(loaded_weight) - def load_qkv_weight(self, loaded_weight: torch.Tensor, **kwargs): + def load_qkv_weight(self, loaded_weight: torch.Tensor, tp_rank: int, **kwargs): shard_offset = kwargs.get("shard_offset") shard_size = kwargs.get("shard_size") @@ -137,7 +139,6 @@ def load_qkv_weight(self, loaded_weight: torch.Tensor, **kwargs): ) param_data = self.data - tp_rank = get_tensor_model_parallel_rank() shard_id = tp_rank if shard_id == "q" else tp_rank // num_heads param_data = param_data.narrow(self.output_dim, shard_offset, shard_size) loaded_weight = loaded_weight.narrow( @@ -164,11 +165,14 @@ def __init__(self, input_dim: int, **kwargs): def input_dim(self): return self._input_dim - def load_row_parallel_weight(self, loaded_weight: torch.Tensor, **kwargs): - use_presharded_weights = kwargs.get("use_presharded_weights") - tp_rank = get_tensor_model_parallel_rank() - shard_size = self.data.shape[self.input_dim] + def load_row_parallel_weight( + self, + loaded_weight: torch.Tensor, + tp_rank: int, + use_presharded_weights: bool = False, + ): if not use_presharded_weights: + shard_size = self.data.shape[self.input_dim] loaded_weight = loaded_weight.narrow( self.input_dim, tp_rank * shard_size, shard_size ) @@ -238,6 +242,8 @@ def _shard_id_as_int(self, shard_id: Union[str, int]) -> int: # For row parallel layers, no sharding needed # load weight into parameter as is def load_row_parallel_weight(self, *args, **kwargs): + kwargs.pop("tp_rank", None) + kwargs.pop("use_presharded_weights", None) super().load_row_parallel_weight(*args, **kwargs) def load_merged_column_weight(self, *args, **kwargs): @@ -247,6 +253,8 @@ def load_qkv_weight(self, *args, **kwargs): self._load_into_shard_id(*args, **kwargs) def load_column_parallel_weight(self, *args, **kwargs): + kwargs.pop("tp_rank", None) + kwargs.pop("use_presharded_weights", None) super().load_row_parallel_weight(*args, **kwargs) def _load_into_shard_id( diff --git a/python/sglang/srt/layers/quantization/fp8_utils.py b/python/sglang/srt/layers/quantization/fp8_utils.py index 140e70dd9d20..d6ff12ee1635 100644 --- a/python/sglang/srt/layers/quantization/fp8_utils.py +++ b/python/sglang/srt/layers/quantization/fp8_utils.py @@ -1,8 +1,8 @@ from typing import List, Optional, Tuple import torch -from vllm.model_executor.parameter import RowvLLMParameter, _ColumnvLLMParameter +from sglang.srt.layers.parameter import 
RowvLLMParameter, _ColumnvLLMParameter from sglang.srt.layers.quantization.fp8_kernel import ( per_token_group_quant_fp8, w8a8_block_fp8_matmul, diff --git a/python/sglang/srt/layers/quantization/modelopt_quant.py b/python/sglang/srt/layers/quantization/modelopt_quant.py index 8ce9d20d1911..5d65899d6349 100644 --- a/python/sglang/srt/layers/quantization/modelopt_quant.py +++ b/python/sglang/srt/layers/quantization/modelopt_quant.py @@ -11,9 +11,9 @@ cutlass_fp8_supported, requantize_with_max_scale, ) -from vllm.model_executor.parameter import ModelWeightParameter, PerTensorScaleParameter from sglang.srt.layers.linear import LinearMethodBase +from sglang.srt.layers.parameter import ModelWeightParameter, PerTensorScaleParameter from sglang.srt.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase, diff --git a/python/sglang/srt/layers/vocab_parallel_embedding.py b/python/sglang/srt/layers/vocab_parallel_embedding.py index 21d973918758..a346a2cbd1c9 100644 --- a/python/sglang/srt/layers/vocab_parallel_embedding.py +++ b/python/sglang/srt/layers/vocab_parallel_embedding.py @@ -220,6 +220,7 @@ def __init__( quant_config: Optional[QuantizationConfig] = None, prefix: str = "", enable_tp: bool = True, + use_presharded_weights: bool = False, ): super().__init__() self.quant_config = quant_config @@ -236,6 +237,12 @@ def __init__( self.padding_size = padding_size self.org_vocab_size = org_num_embeddings or num_embeddings num_added_embeddings = num_embeddings - self.org_vocab_size + self.use_presharded_weights = use_presharded_weights + if use_presharded_weights: + assert ( + num_added_embeddings == 0 + ), "Lora is not supported with presharded weights." + self.org_vocab_size_padded = pad_vocab_size( self.org_vocab_size, self.padding_size ) @@ -447,10 +454,14 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): start_idx = start_idx // packed_factor shard_size = shard_size // packed_factor else: - assert loaded_weight.shape[output_dim] == self.org_vocab_size + assert loaded_weight.shape[output_dim] == ( + self.org_vocab_size + // (self.tp_size if self.use_presharded_weights else 1) + ) # Copy the data. - loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) + if not self.use_presharded_weights: + loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) param[: loaded_weight.shape[0]].data.copy_(loaded_weight) param[loaded_weight.shape[0] :].data.fill_(0) @@ -514,6 +525,7 @@ def __init__( padding_size: int = DEFAULT_VOCAB_PADDING_SIZE, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", + use_presharded_weights: bool = False, ): super().__init__( num_embeddings, @@ -523,6 +535,7 @@ def __init__( padding_size, quant_config, prefix, + use_presharded_weights=use_presharded_weights, ) self.quant_config = quant_config if bias: diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index 31c8018e2581..1c07ea6adb75 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -13,6 +13,7 @@ # ============================================================================== """A scheduler that manages a tensor parallel GPU worker.""" +import faulthandler import logging import os import signal @@ -399,6 +400,8 @@ def watchdog_thread(self): self.watchdog_last_time = time.time() time.sleep(self.watchdog_timeout / 2) + # Wait sometimes so that the parent process can print the error. 
+ time.sleep(5) self.parent_process.send_signal(signal.SIGQUIT) @torch.no_grad() @@ -1582,6 +1585,7 @@ def run_scheduler_process( pipe_writer, ): setproctitle.setproctitle("sglang::scheduler") + faulthandler.enable() # [For Router] if env var "SGLANG_DP_RANK" exist, set dp_rank to the value of the env var if dp_rank is None and "SGLANG_DP_RANK" in os.environ: diff --git a/python/sglang/srt/mem_cache/memory_pool.py b/python/sglang/srt/mem_cache/memory_pool.py index 6cb186577238..abee7764bebf 100644 --- a/python/sglang/srt/mem_cache/memory_pool.py +++ b/python/sglang/srt/mem_cache/memory_pool.py @@ -27,6 +27,7 @@ from functools import wraps from typing import List, Tuple, Union +import numpy as np import psutil import torch @@ -35,6 +36,8 @@ logger = logging.getLogger(__name__) +GB = 1024 * 1024 * 1024 + class ReqToTokenPool: """A memory pool that maps a request to its token locations.""" @@ -193,6 +196,11 @@ def __init__( self.layer_num = layer_num self._create_buffers() + k_size, v_size = self.get_kv_size_bytes() + logger.info( + f"KV Cache is allocated. K size: {k_size / GB:.2f} GB, V size: {v_size / GB:.2f} GB." + ) + def _create_buffers(self): # [size, head_num, head_dim] for each layer # The padded slot 0 is used for writing dummy outputs from padded tokens. @@ -217,6 +225,17 @@ def _clear_buffers(self): del self.k_buffer del self.v_buffer + def get_kv_size_bytes(self): + assert hasattr(self, "k_buffer") + assert hasattr(self, "v_buffer") + k_size_bytes = 0 + for k_cache in self.k_buffer: + k_size_bytes += np.prod(k_cache.shape) * k_cache.dtype.itemsize + v_size_bytes = 0 + for v_cache in self.v_buffer: + v_size_bytes += np.prod(v_cache.shape) * v_cache.dtype.itemsize + return k_size_bytes, v_size_bytes + # Todo: different memory layout def get_flat_data(self, indices): # prepare a large chunk of contiguous data for efficient transfer diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py index 8fd902818995..fa1625b09595 100644 --- a/python/sglang/srt/server.py +++ b/python/sglang/srt/server.py @@ -611,6 +611,9 @@ def _set_envs_and_config(server_args: ServerArgs): # The child processes will send SIGQUIT to this process when any error happens # This process then clean up the whole process tree def sigquit_handler(signum, frame): + logger.error( + "Received sigquit from a child proces. It usually means the child failed." 
+ ) kill_process_tree(os.getpid()) signal.signal(signal.SIGQUIT, sigquit_handler) diff --git a/test/srt/test_moe_eval_accuracy_large.py b/test/srt/test_moe_eval_accuracy_large.py index 6f3affbba4d7..dc420f00dfaf 100644 --- a/test/srt/test_moe_eval_accuracy_large.py +++ b/test/srt/test_moe_eval_accuracy_large.py @@ -71,7 +71,7 @@ def test_mgsm_en(self): ) metrics = run_eval(args) - self.assertGreater(metrics["score"], 0.62) + self.assertGreater(metrics["score"], 0.61) if __name__ == "__main__": From 42f390996317a162f00571f51b6a54dc5fb3165f Mon Sep 17 00:00:00 2001 From: kk <43161300+kkHuang-amd@users.noreply.github.com> Date: Mon, 13 Jan 2025 18:12:44 +0800 Subject: [PATCH 032/248] Unify sglang coding style (#2856) Co-authored-by: Lin, Soga --- .../srt/layers/moe/fused_moe_triton/layer.py | 9 +++--- python/sglang/srt/layers/quantization/fp8.py | 29 ++++++++++--------- 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/layer.py b/python/sglang/srt/layers/moe/fused_moe_triton/layer.py index e1064bcdabd1..d95498377793 100644 --- a/python/sglang/srt/layers/moe/fused_moe_triton/layer.py +++ b/python/sglang/srt/layers/moe/fused_moe_triton/layer.py @@ -1,6 +1,5 @@ # Adapted from https://github.com/vllm-project/vllm/blob/a6221a144af772fd1a68fe7e627935dc53e81738/vllm/model_executor/layers/fused_moe/layer.py -import os from abc import abstractmethod from enum import Enum from typing import Callable, List, Optional, Tuple @@ -19,7 +18,7 @@ QuantizationConfig, QuantizeMethodBase, ) -from sglang.srt.utils import is_hip, permute_weight, set_weight_attrs +from sglang.srt.utils import get_bool_env_var, is_hip, permute_weight, set_weight_attrs if torch.cuda.is_available(): from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_experts @@ -28,6 +27,8 @@ import logging +is_hip_ = is_hip() + logger = logging.getLogger(__name__) @@ -99,7 +100,7 @@ def create_weights( set_weight_attrs(w2_weight, extra_weight_attrs) def process_weights_after_loading(self, layer: torch.nn.Module) -> None: - if is_hip() and bool(int(os.getenv("CK_MOE", "0"))): + if is_hip_ and get_bool_env_var("CK_MOE"): layer.w13_weight = torch.nn.Parameter( permute_weight(layer.w13_weight.data), requires_grad=False, @@ -163,7 +164,7 @@ def forward_cuda( correction_bias=correction_bias, ) - if is_hip() and bool(int(os.getenv("CK_MOE", "0"))): + if is_hip_ and get_bool_env_var("CK_MOE"): import ater from ater.fused_moe import fused_experts_ck diff --git a/python/sglang/srt/layers/quantization/fp8.py b/python/sglang/srt/layers/quantization/fp8.py index 22a43675bf85..d16a3b0c257b 100644 --- a/python/sglang/srt/layers/quantization/fp8.py +++ b/python/sglang/srt/layers/quantization/fp8.py @@ -1,7 +1,6 @@ # Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/model_executor/layers/quantization/fp8.py import logging -import os from typing import Any, Callable, Dict, List, Optional import torch @@ -47,6 +46,8 @@ ACTIVATION_SCHEMES = ["static", "dynamic"] +is_hip_ = is_hip() + logger = logging.getLogger(__name__) @@ -162,7 +163,7 @@ def __init__(self, quant_config: Fp8Config): # kernel for fast weight-only FP8 quantization self.use_marlin = get_bool_env_var("SGLANG_FORCE_FP8_MARLIN") # Disable marlin for ROCm - if is_hip(): + if is_hip_: self.use_marlin = False self.block_quant = self.quant_config.weight_block_size is not None @@ -274,7 +275,7 @@ def process_weights_after_loading(self, layer: Module) -> None: # Block quant doesn't need to process weights after 
loading if self.block_quant: # If ROCm, normalize the weights and scales to e4m3fnuz - if is_hip(): + if is_hip_: # activation_scheme: dynamic weight, weight_scale, _ = normalize_e4m3fn_to_e4m3fnuz( weight=layer.weight, @@ -331,7 +332,7 @@ def process_weights_after_loading(self, layer: Module) -> None: weight_scale = layer.weight_scale # If ROCm, normalize the weights and scales to e4m3fnuz - if is_hip(): + if is_hip_: weight, weight_scale, input_scale = normalize_e4m3fn_to_e4m3fnuz( weight=weight, weight_scale=weight_scale, @@ -568,7 +569,7 @@ def process_weights_after_loading(self, layer: Module) -> None: # Block quant doesn't need to process weights after loading if self.block_quant: # If ROCm, normalize the weights and scales to e4m3fnuz - if is_hip(): + if is_hip_: # activation_scheme: dynamic w13_weight, w13_weight_scale, _ = normalize_e4m3fn_to_e4m3fnuz( weight=layer.w13_weight, @@ -595,7 +596,7 @@ def process_weights_after_loading(self, layer: Module) -> None: # If checkpoint is fp16 or bfloat16, quantize in place. if not self.quant_config.is_checkpoint_fp8_serialized: # If ROCm, use float8_e4m3fnuz instead (MI300x HW) - fp8_dtype = torch.float8_e4m3fnuz if is_hip() else torch.float8_e4m3fn + fp8_dtype = torch.float8_e4m3fnuz if is_hip_ else torch.float8_e4m3fn w13_weight = torch.empty_like(layer.w13_weight.data, dtype=fp8_dtype) w2_weight = torch.empty_like(layer.w2_weight.data, dtype=fp8_dtype) @@ -617,8 +618,8 @@ def process_weights_after_loading(self, layer: Module) -> None: layer.w13_weight = torch.nn.Parameter(w13_weight, requires_grad=False) layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False) - if is_hip(): - if bool(int(os.getenv("CK_MOE", "0"))): + if is_hip_: + if get_bool_env_var("CK_MOE"): layer.w13_weight = torch.nn.Parameter( permute_weight(layer.w13_weight.data), requires_grad=False, @@ -629,7 +630,7 @@ def process_weights_after_loading(self, layer: Module) -> None: requires_grad=False, ) torch.cuda.empty_cache() - elif bool(int(os.getenv("MOE_PADDING", "0"))): + elif get_bool_env_var("MOE_PADDING"): # If ROCm, apply weight padding (min. Mem channel contention) only if set layer.w13_weight = torch.nn.Parameter( F.pad(layer.w13_weight.data, (0, padding_size), "constant", 0), @@ -671,7 +672,7 @@ def process_weights_after_loading(self, layer: Module) -> None: ) # If ROCm, normalize the weights and scales to e4m3fnuz - if is_hip(): + if is_hip_: # Normalize the weights and scales w13_weight, w13_weight_scale, w13_input_scale = ( normalize_e4m3fn_to_e4m3fnuz( @@ -721,8 +722,8 @@ def process_weights_after_loading(self, layer: Module) -> None: max_w13_scales, requires_grad=False ) - if is_hip(): - if bool(int(os.getenv("CK_MOE", "0"))): + if is_hip_: + if get_bool_env_var("CK_MOE"): layer.w13_weight = torch.nn.Parameter( permute_weight(layer.w13_weight.data), requires_grad=False, @@ -733,7 +734,7 @@ def process_weights_after_loading(self, layer: Module) -> None: requires_grad=False, ) torch.cuda.empty_cache() - elif bool(int(os.getenv("MOE_PADDING", "0"))): + elif get_bool_env_var("MOE_PADDING"): # If ROCm, apply weight padding (min. 
Mem channel contention) only if set layer.w13_weight = torch.nn.Parameter( F.pad(layer.w13_weight.data, (0, padding_size), "constant", 0), @@ -777,7 +778,7 @@ def apply( correction_bias=correction_bias, ) - if is_hip() and bool(int(os.getenv("CK_MOE", "0"))): + if is_hip_ and get_bool_env_var("CK_MOE"): import ater from ater.fused_moe import fused_experts_ck From 20a9f5dfe0b75614f723401072bca8589c781770 Mon Sep 17 00:00:00 2001 From: Yineng Zhang Date: Mon, 13 Jan 2025 18:36:40 +0800 Subject: [PATCH 033/248] fix: not delete CNAME (#2860) --- .github/workflows/release-docs.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/release-docs.yml b/.github/workflows/release-docs.yml index ab2129e3721a..44bdfa0fa1ab 100644 --- a/.github/workflows/release-docs.yml +++ b/.github/workflows/release-docs.yml @@ -49,7 +49,7 @@ jobs: cd _build/html git clone https://$GITHUB_TOKEN@github.com/sgl-project/sgl-project.github.io.git ../sgl-project.github.io --depth 1 - rm -rf ../sgl-project.github.io/* + find ../sgl-project.github.io/ -mindepth 1 -not -name CNAME -delete cp -r * ../sgl-project.github.io cp ../../README.md ../sgl-project.github.io/README.md cd ../sgl-project.github.io From 41d7e5b7e68f3aa0ef741d8774333c3518522d5e Mon Sep 17 00:00:00 2001 From: Yineng Zhang Date: Mon, 13 Jan 2025 18:40:48 +0800 Subject: [PATCH 034/248] docs: update link (#2857) --- README.md | 16 ++++++++-------- benchmark/deepseek_v3/README.md | 2 +- docs/references/contribution_guide.md | 2 +- python/sglang/srt/sampling/sampling_params.py | 2 +- sgl-router/README.md | 2 +- sgl-router/v0.1.0.md | 2 +- 6 files changed, 13 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 024fa2761270..bacdb9fc15f6 100644 --- a/README.md +++ b/README.md @@ -13,9 +13,9 @@ -------------------------------------------------------------------------------- | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) -| [**Documentation**](https://sgl-project.github.io/) -| [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2um0ad92q-LkU19KQTxCGzlCgRiOiQEw) -| [**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing) +| [**Documentation**](https://docs.sglang.ai/) +| [**Join Slack**](https://slack.sglang.ai/) +| [**Join Bi-Weekly Development Meeting**](https://meeting.sglang.ai/) | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) | ## News @@ -45,11 +45,11 @@ The core features include: - **Active Community**: SGLang is open-source and backed by an active community with industry adoption. 
## Getting Started -- [Install SGLang](https://sgl-project.github.io/start/install.html) -- [Quick Start](https://sgl-project.github.io/start/send_request.html) -- [Backend Tutorial](https://sgl-project.github.io/backend/openai_api_completions.html) -- [Frontend Tutorial](https://sgl-project.github.io/frontend/frontend.html) -- [Contribution Guide](https://sgl-project.github.io/references/contribution_guide.html) +- [Install SGLang](https://docs.sglang.ai/start/install.html) +- [Quick Start](https://docs.sglang.ai/start/send_request.html) +- [Backend Tutorial](https://docs.sglang.ai/backend/openai_api_completions.html) +- [Frontend Tutorial](https://docs.sglang.ai/frontend/frontend.html) +- [Contribution Guide](https://docs.sglang.ai/references/contribution_guide.html) ## Benchmark and Performance Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/), [v0.4 blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/) diff --git a/benchmark/deepseek_v3/README.md b/benchmark/deepseek_v3/README.md index 5c353bca5c79..e7ad8d33609c 100644 --- a/benchmark/deepseek_v3/README.md +++ b/benchmark/deepseek_v3/README.md @@ -4,7 +4,7 @@ The SGLang and DeepSeek teams collaborated to get DeepSeek V3 FP8 running on NVI Special thanks to Meituan's Search & Recommend Platform Team and Baseten's Model Performance Team for implementing the model, and DataCrunch for providing GPU resources. -For optimizations made on the DeepSeek series models regarding SGLang, please refer to [DeepSeek Model Optimizations in SGLang](https://sgl-project.github.io/references/deepseek.html). +For optimizations made on the DeepSeek series models regarding SGLang, please refer to [DeepSeek Model Optimizations in SGLang](https://docs.sglang.ai/references/deepseek.html). ## Hardware Recommendation - 8 x NVIDIA H200 GPUs diff --git a/docs/references/contribution_guide.md b/docs/references/contribution_guide.md index b2211f463fb0..b3b7f826894a 100644 --- a/docs/references/contribution_guide.md +++ b/docs/references/contribution_guide.md @@ -14,7 +14,7 @@ git clone https://github.com//sglang.git ### Install Dependencies & Build -Refer to [Install SGLang from Source](https://sgl-project.github.io/start/install.html#method-2-from-source) documentation for more details on setting up the necessary dependencies. +Refer to [Install SGLang from Source](https://docs.sglang.ai/start/install.html#method-2-from-source) documentation for more details on setting up the necessary dependencies. ## Code Formatting with Pre-Commit diff --git a/python/sglang/srt/sampling/sampling_params.py b/python/sglang/srt/sampling/sampling_params.py index 2c3817e1b795..d1d932693c61 100644 --- a/python/sglang/srt/sampling/sampling_params.py +++ b/python/sglang/srt/sampling/sampling_params.py @@ -23,7 +23,7 @@ class SamplingParams: The sampling parameters. See docs/references/sampling_params.md or - https://sgl-project.github.io/references/sampling_params.html + https://docs.sglang.ai/references/sampling_params.html for the documentation. 
""" diff --git a/sgl-router/README.md b/sgl-router/README.md index 617bca5405fe..f39d63625de1 100644 --- a/sgl-router/README.md +++ b/sgl-router/README.md @@ -4,7 +4,7 @@ SGLang router is a standalone module implemented in Rust to achieve data paralle ## User docs -Please check https://sgl-project.github.io/router/router.html +Please check https://docs.sglang.ai/router/router.html ## Developer docs diff --git a/sgl-router/v0.1.0.md b/sgl-router/v0.1.0.md index 9a1ee152f113..747731a71c2d 100644 --- a/sgl-router/v0.1.0.md +++ b/sgl-router/v0.1.0.md @@ -54,7 +54,7 @@ Note: ## Closing remarks: -1. Please read the full usage at https://sgl-project.github.io/router/router.html +1. Please read the full usage at https://docs.sglang.ai/router/router.html 2. The feature is still under active improvement, so please don't hesitate to raise issues or submit PRs if you have any suggestions or feedback. From 4536d7244637f7e62e5892e272d06275bba8b5f1 Mon Sep 17 00:00:00 2001 From: Yineng Zhang Date: Mon, 13 Jan 2025 18:58:56 +0800 Subject: [PATCH 035/248] minor: use ubuntu-latest instead of self-hosted runner for amd build (#2861) --- .github/workflows/release-docker-amd.yml | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/.github/workflows/release-docker-amd.yml b/.github/workflows/release-docker-amd.yml index 866cc5fa5209..c0394e8e57a2 100644 --- a/.github/workflows/release-docker-amd.yml +++ b/.github/workflows/release-docker-amd.yml @@ -10,19 +10,27 @@ on: jobs: publish: if: github.repository == 'sgl-project/sglang' - runs-on: docker-builder-amd + runs-on: ubuntu-latest environment: 'prod' strategy: matrix: rocm_version: ['6.2.0'] build_type: ['all', 'srt'] steps: - - name: Delete huge unnecessary tools folder - run: rm -rf /opt/hostedtoolcache - - name: Checkout repository uses: actions/checkout@v3 + - name: Free disk space + uses: jlumbroso/free-disk-space@main + with: + tool-cache: false + docker-images: false + android: true + dotnet: true + haskell: true + large-packages: true + swap-storage: false + - name: Login to Docker Hub uses: docker/login-action@v2 with: From 67008f4b320d8950803fcb14b1e5dc6e80bf75e4 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Mon, 13 Jan 2025 03:55:33 -0800 Subject: [PATCH 036/248] Use only one GPU for MLA CI tests (#2858) --- .github/workflows/pr-test.yml | 8 +++----- test/srt/run_suite.py | 2 ++ test/srt/test_mla.py | 35 ++++++++++++++++++++++++++++++++++- test/srt/test_mla_fp8.py | 2 -- 4 files changed, 39 insertions(+), 8 deletions(-) diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index f1c7871debb2..274c97c63932 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -87,18 +87,16 @@ jobs: run: | bash scripts/ci_install_dependency.sh - - name: Evaluate data parallelism accuracy (DP=2) + - name: Test data parallelism (DP=2) timeout-minutes: 10 run: | cd test/srt python3 test_data_parallelism.py - - name: Evaluate MLA accuracy (TP=2) + - name: Test data parallelism attention (DP=2) timeout-minutes: 10 run: | cd test/srt - python3 test_mla.py - python3 test_mla_fp8.py python3 test_dp_attention.py - name: Test update weights from distributed @@ -107,7 +105,7 @@ jobs: cd test/srt python3 test_update_weights_from_distributed.py - - name: Evaluate MoE EP accuracy (TP=2) + - name: Test expert parallelism (EP=2) timeout-minutes: 10 run: | cd test/srt diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index 320fea7294e5..d617fcf69e62 100644 --- a/test/srt/run_suite.py +++ 
b/test/srt/run_suite.py @@ -22,6 +22,8 @@ "test_json_constrained.py", "test_large_max_new_tokens.py", "test_metrics.py", + "test_mla.py", + "test_mla_fp8.py", "test_no_chunked_prefill.py", "test_no_overlap_scheduler.py", "test_openai_server.py", diff --git a/test/srt/test_mla.py b/test/srt/test_mla.py index b8105a84af1a..34bc4b446452 100644 --- a/test/srt/test_mla.py +++ b/test/srt/test_mla.py @@ -2,6 +2,7 @@ from types import SimpleNamespace from sglang.srt.utils import kill_process_tree +from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k from sglang.test.run_eval import run_eval from sglang.test.test_utils import ( DEFAULT_MLA_MODEL_NAME_FOR_TEST, @@ -20,7 +21,7 @@ def setUpClass(cls): cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - other_args=["--tp", "2", "--trust-remote-code"], + other_args=["--trust-remote-code"], ) @classmethod @@ -52,5 +53,37 @@ def test_mgsm_en(self): self.assertGreater(metrics["score"], 0.8) +class TestDeepseekV3(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.model = "lmzheng/sglang-ci-dsv3-test" + cls.base_url = DEFAULT_URL_FOR_TEST + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=["--trust-remote-code"], + ) + + @classmethod + def tearDownClass(cls): + kill_process_tree(cls.process.pid) + + def test_mmlu(self): + args = SimpleNamespace( + num_shots=5, + data_path=None, + num_questions=200, + max_new_tokens=512, + parallel=128, + host="http://127.0.0.1", + port=int(self.base_url.split(":")[-1]), + ) + metrics = run_eval_few_shot_gsm8k(args) + print(metrics) + + self.assertGreater(metrics["accuracy"], 0.62) + + if __name__ == "__main__": unittest.main() diff --git a/test/srt/test_mla_fp8.py b/test/srt/test_mla_fp8.py index 769bdf34da87..4fe18b526b1e 100644 --- a/test/srt/test_mla_fp8.py +++ b/test/srt/test_mla_fp8.py @@ -21,8 +21,6 @@ def setUpClass(cls): cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ - "--tp", - "2", "--trust-remote-code", "--kv-cache-dtype", "fp8_e5m2", From 51ab3ccf470ac51c8779091dd5db4c91c11e6c8c Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Mon, 13 Jan 2025 03:57:39 -0800 Subject: [PATCH 037/248] Collect more metrics: num_requests_total (#2859) --- .../sglang/srt/managers/tokenizer_manager.py | 78 +++++++++---------- python/sglang/srt/metrics/collector.py | 15 ++-- test/srt/test_metrics.py | 1 + 3 files changed, 49 insertions(+), 45 deletions(-) diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py index 9f9c53eaa8ec..fb6202932f0f 100644 --- a/python/sglang/srt/managers/tokenizer_manager.py +++ b/python/sglang/srt/managers/tokenizer_manager.py @@ -601,7 +601,7 @@ async def sigterm_watchdog(self): while not self.gracefully_exit: await asyncio.sleep(5) - # drain requests + # Drain requests while True: remain_num_req = len(self.rid_to_state) logger.info( @@ -679,45 +679,7 @@ async def handle_loop(self): state.event.set() if self.enable_metrics: - completion_tokens = ( - recv_obj.completion_tokens[i] - if getattr(recv_obj, "completion_tokens", None) - else 0 - ) - - if state.first_token_time is None: - state.first_token_time = time.time() - self.metrics_collector.observe_time_to_first_token( - state.first_token_time - state.created_time - ) - else: - if completion_tokens >= 2: - # Compute time_per_output_token for the streaming case - self.metrics_collector.observe_time_per_output_token( - (time.time() - 
state.first_token_time) - / (completion_tokens - 1) - ) - - if state.finished: - self.metrics_collector.inc_prompt_tokens( - recv_obj.prompt_tokens[i] - ) - self.metrics_collector.inc_generation_tokens( - completion_tokens - ) - self.metrics_collector.observe_e2e_request_latency( - time.time() - state.created_time - ) - # Compute time_per_output_token for the non-streaming case - if ( - hasattr(state.obj, "stream") - and not state.obj.stream - and completion_tokens >= 1 - ): - self.metrics_collector.observe_time_per_output_token( - (time.time() - state.created_time) - / completion_tokens - ) + self.collect_metrics(state, recv_obj, i) elif isinstance(recv_obj, OpenSessionReqOutput): self.session_futures[recv_obj.session_id].set_result( recv_obj.session_id if recv_obj.success else None @@ -820,6 +782,42 @@ def detokenize_top_logprobs_tokens( ret.append(None) return ret + def collect_metrics(self, state: ReqState, recv_obj: BatchStrOut, i: int): + completion_tokens = ( + recv_obj.completion_tokens[i] + if getattr(recv_obj, "completion_tokens", None) + else 0 + ) + + if state.first_token_time is None: + state.first_token_time = time.time() + self.metrics_collector.observe_time_to_first_token( + state.first_token_time - state.created_time + ) + else: + if completion_tokens >= 2: + # Compute time_per_output_token for the streaming case + self.metrics_collector.observe_time_per_output_token( + (time.time() - state.first_token_time) / (completion_tokens - 1) + ) + + if state.finished: + self.metrics_collector.observe_one_finished_request( + recv_obj.prompt_tokens[i], completion_tokens + ) + self.metrics_collector.observe_e2e_request_latency( + time.time() - state.created_time + ) + # Compute time_per_output_token for the non-streaming case + if ( + hasattr(state.obj, "stream") + and not state.obj.stream + and completion_tokens >= 1 + ): + self.metrics_collector.observe_time_per_output_token( + (time.time() - state.created_time) / completion_tokens + ) + class SignalHandler: def __init__(self, tokenizer_manager): diff --git a/python/sglang/srt/metrics/collector.py b/python/sglang/srt/metrics/collector.py index 9505f012f067..070b405be429 100644 --- a/python/sglang/srt/metrics/collector.py +++ b/python/sglang/srt/metrics/collector.py @@ -109,6 +109,12 @@ def __init__(self, labels: Dict[str, str]) -> None: labelnames=labels.keys(), ) + self.num_requests_total = Counter( + name="sglang:num_requests_total", + documentation="Number of requests processed.", + labelnames=labels.keys(), + ) + self.histogram_time_to_first_token = Histogram( name="sglang:time_to_first_token_seconds", documentation="Histogram of time to first token in seconds.", @@ -185,11 +191,10 @@ def _log_counter(self, counter, data: Union[int, float]) -> None: # Convenience function for logging to counter. 
counter.labels(**self.labels).inc(data) - def inc_prompt_tokens(self, value: int): - self._log_counter(self.prompt_tokens_total, value) - - def inc_generation_tokens(self, value: int): - self._log_counter(self.generation_tokens_total, value) + def observe_one_finished_request(self, prompt_tokens: int, generation_tokens: int): + self.prompt_tokens_total.labels(**self.labels).inc(prompt_tokens) + self.generation_tokens_total.labels(**self.labels).inc(generation_tokens) + self.num_requests_total.labels(**self.labels).inc(1) def observe_time_to_first_token(self, value: Union[float, int]): self._log_histogram(self.histogram_time_to_first_token, value) diff --git a/test/srt/test_metrics.py b/test/srt/test_metrics.py index ccaea5be800e..69babf795f01 100644 --- a/test/srt/test_metrics.py +++ b/test/srt/test_metrics.py @@ -59,6 +59,7 @@ def test_metrics_enabled(self): "sglang:func_latency_seconds", "sglang:prompt_tokens_total", "sglang:generation_tokens_total", + "sglang:num_requests_total", "sglang:time_to_first_token_seconds", "sglang:time_per_output_token_seconds", "sglang:e2e_request_latency_seconds", From 17de02f98d8f28e5affec7c5ff8e28f110d0af42 Mon Sep 17 00:00:00 2001 From: bjmsong Date: Mon, 13 Jan 2025 20:14:16 +0800 Subject: [PATCH 038/248] Integration of TurboMind AWQ (#2828) Co-authored-by: root --- python/pyproject.toml | 2 +- python/sglang/srt/configs/model_config.py | 10 +- python/sglang/srt/layers/linear.py | 1 + .../srt/layers/quantization/__init__.py | 2 + .../srt/layers/quantization/awq_turbomind.py | 287 ++++++++++++++++++ .../layers/quantization/turbomind_utils.py | 63 ++++ python/sglang/srt/server_args.py | 1 + test/srt/test_turbomind_awq.py | 47 +++ 8 files changed, 411 insertions(+), 2 deletions(-) create mode 100644 python/sglang/srt/layers/quantization/awq_turbomind.py create mode 100644 python/sglang/srt/layers/quantization/turbomind_utils.py create mode 100644 test/srt/test_turbomind_awq.py diff --git a/python/pyproject.toml b/python/pyproject.toml index a236469a17c8..c29580b50b1b 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -28,7 +28,7 @@ runtime_common = [ srt = [ "sglang[runtime_common]", "cuda-python", "sgl-kernel>=0.0.2.post11", "torch", "vllm>=0.6.3.post1,<=0.6.4.post1", - "flashinfer==0.1.6" + "flashinfer==0.1.6", "turbomind" ] # HIP (Heterogeneous-computing Interface for Portability) for AMD diff --git a/python/sglang/srt/configs/model_config.py b/python/sglang/srt/configs/model_config.py index 072c88b04a78..28144f139958 100644 --- a/python/sglang/srt/configs/model_config.py +++ b/python/sglang/srt/configs/model_config.py @@ -14,6 +14,7 @@ import json import logging +import sys from enum import IntEnum, auto from typing import List, Optional, Set, Union @@ -230,7 +231,7 @@ def _verify_quantization(self) -> None: # Parse quantization method from the HF model config, if available. 
quant_cfg = self._parse_quant_hf_config() - if quant_cfg is not None: + if quant_cfg is not None and not quantization_in_turbomind(self.quantization): quant_method = quant_cfg.get("quant_method", "").lower() # Detect which checkpoint is it @@ -401,3 +402,10 @@ def is_multimodal_model(model_architectures: List[str]): def is_encoder_decoder_model(model_architectures: List[str]): return "MllamaForConditionalGeneration" in model_architectures + + +def quantization_in_turbomind(quantization: str) -> bool: + if quantization in ["awq_turbomind"]: + return True + else: + return False diff --git a/python/sglang/srt/layers/linear.py b/python/sglang/srt/layers/linear.py index ee9386c13fa3..815255d5c167 100644 --- a/python/sglang/srt/layers/linear.py +++ b/python/sglang/srt/layers/linear.py @@ -48,6 +48,7 @@ "GPTQLinearMethod", "FBGEMMFp8LinearMethod", "ModelOptFp8LinearMethod", + "AWQTurbomindLinearMethod", "IPEXAWQLinearMethod", ] diff --git a/python/sglang/srt/layers/quantization/__init__.py b/python/sglang/srt/layers/quantization/__init__.py index 35b0c4d94edb..faf14d6fdd6b 100644 --- a/python/sglang/srt/layers/quantization/__init__.py +++ b/python/sglang/srt/layers/quantization/__init__.py @@ -20,6 +20,7 @@ from vllm.model_executor.layers.quantization.qqq import QQQConfig from vllm.model_executor.layers.quantization.tpu_int8 import Int8TpuConfig +from sglang.srt.layers.quantization.awq_turbomind import AWQTurbomindConfig from sglang.srt.layers.quantization.base_config import QuantizationConfig from sglang.srt.layers.quantization.fp8 import Fp8Config from sglang.srt.layers.quantization.modelopt_quant import ModelOptFp8Config @@ -37,6 +38,7 @@ "gptq_marlin_24": GPTQMarlin24Config, "gptq_marlin": GPTQMarlinConfig, "awq_marlin": AWQMarlinConfig, + "awq_turbomind": AWQTurbomindConfig, "gptq": GPTQConfig, "compressed-tensors": CompressedTensorsConfig, "bitsandbytes": BitsAndBytesConfig, diff --git a/python/sglang/srt/layers/quantization/awq_turbomind.py b/python/sglang/srt/layers/quantization/awq_turbomind.py new file mode 100644 index 000000000000..007b20420973 --- /dev/null +++ b/python/sglang/srt/layers/quantization/awq_turbomind.py @@ -0,0 +1,287 @@ +import logging +import os +import sys +from typing import Any, Dict, List, Optional + +import torch +import turbomind +from torch.nn import Parameter + +turbomind_dir = os.path.split(turbomind.__file__)[0] +sys.path.append(os.path.join(turbomind_dir, "lib")) +import _turbomind_ext +from vllm.model_executor.layers.linear import LinearBase + +from sglang.srt.layers.linear import LinearMethodBase, UnquantizedLinearMethod +from sglang.srt.layers.parameter import GroupQuantScaleParameter, PackedvLLMParameter +from sglang.srt.layers.quantization.base_config import ( + QuantizationConfig, + QuantizeMethodBase, +) +from sglang.srt.layers.quantization.turbomind_utils import ( + get_u4_slices, + is_layer_skipped_awq, + pack_u4_row, + unpack_awq_gemm, + verify_turbomind_supported, +) +from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead +from sglang.srt.utils import is_cuda, set_weight_attrs + +logger = logging.getLogger(__name__) + + +class AWQTurbomindConfig(QuantizationConfig): + """Config class for AWQ Turbomind""" + + def __init__( + self, + weight_bits: int, + group_size: int, + zero_point: bool, + lm_head_quantized: bool, + modules_to_not_convert: Optional[List[str]] = None, + ) -> None: + self.pack_factor = 32 // weight_bits # packed into int32 + self.group_size = group_size + self.zero_point = zero_point + self.lm_head_quantized = 
lm_head_quantized + self.weight_bits = weight_bits + self.modules_to_not_convert = modules_to_not_convert or [] + + verify_turbomind_supported(self.weight_bits, self.group_size) + + def __repr__(self) -> str: + return ( + f"AWQTurbomindConfig(weight_bits={self.weight_bits}, " + f"group_size={self.group_size}, " + f"zero_point={self.zero_point}, " + f"lm_head_quantized={self.lm_head_quantized}, " + f"modules_to_not_convert={self.modules_to_not_convert})" + ) + + @classmethod + def get_name(cls) -> str: + return "awq_turbomind" + + @classmethod + def get_supported_act_dtypes(cls) -> List[torch.dtype]: + return [torch.half, torch.bfloat16] + + @classmethod + def get_min_capability(cls) -> int: + return 70 + + @classmethod + def get_config_filenames(cls) -> List[str]: + return ["quantize_config.json"] + + @classmethod + def from_config(cls, config: Dict[str, Any]) -> "AWQTurbomindConfig": + weight_bits = cls.get_from_keys(config, ["bits"]) + group_size = cls.get_from_keys(config, ["group_size"]) + zero_point = cls.get_from_keys(config, ["zero_point"]) + lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], default=False) + modules_to_not_convert = cls.get_from_keys_or( + config, ["modules_to_not_convert"], None + ) + return cls( + weight_bits, + group_size, + zero_point, + lm_head_quantized, + modules_to_not_convert, + ) + + @classmethod + def override_quantization_method(cls, hf_quant_cfg, user_quant) -> Optional[str]: + can_convert = cls.is_awq_turbomind_compatible(hf_quant_cfg) + is_valid_user_quant = user_quant is None or user_quant == "awq_turbomind" + + if can_convert and is_valid_user_quant: + msg = f"The model is convertible to {cls.get_name()} during runtime. Using {cls.get_name()} kernel." + logger.info(msg) + return cls.get_name() + + if can_convert and user_quant == "awq": + logger.info( + "Detected that the model can run with awq_turbomind" + ", however you specified quantization=awq explicitly," + " so forcing awq. Use quantization=awq_turbomind for" + " faster inference" + ) + return None + + def get_quant_method( + self, layer: torch.nn.Module, prefix: str + ) -> Optional["QuantizeMethodBase"]: + if isinstance(layer, LinearBase) or ( + isinstance(layer, ParallelLMHead) and self.lm_head_quantized + ): + if is_layer_skipped_awq(prefix, self.modules_to_not_convert): + return UnquantizedLinearMethod() + return AWQTurbomindLinearMethod(self) + + return None + + @classmethod + def is_awq_turbomind_compatible(cls, quant_config: Dict[str, Any]): + if not is_cuda(): + return False + + # Extract data from quant config. + quant_method = quant_config.get("quant_method", "").lower() + num_bits = quant_config.get("bits") + group_size = quant_config.get("group_size") + zero_point = quant_config.get("zero_point") + + if quant_method != "awq": + return False + + # If we cannot find the info needed in the config, cannot convert. + if num_bits is None or group_size is None or zero_point is None: + return False + + return verify_turbomind_supported(quant_bit=num_bits, group_size=group_size) + + def get_scaled_act_names(self) -> List[str]: + return [] + + +class AWQTurbomindLinearMethod(LinearMethodBase): + """Linear method for AWQ Turbomind. + + Args: + quant_config: The AWQ Turbomind quantization config. 
+ """ + + def __init__(self, quant_config: AWQTurbomindConfig) -> None: + self.quant_config = quant_config + + def create_weights( + self, + layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: List[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ) -> None: + + output_size_per_partition = sum(output_partition_sizes) + weight_loader = extra_weight_attrs.get("weight_loader") + + # Normalize group_size + if self.quant_config.group_size != -1: + group_size = self.quant_config.group_size + else: + group_size = input_size + + qweight = PackedvLLMParameter( + data=torch.empty( + input_size_per_partition, + output_size_per_partition // self.quant_config.pack_factor, + dtype=torch.int32, + ), + input_dim=0, + output_dim=1, + packed_dim=1, + packed_factor=self.quant_config.pack_factor, + weight_loader=weight_loader, + ) + + num_groups = input_size_per_partition // group_size + + qzeros = PackedvLLMParameter( + data=torch.empty( + num_groups, + output_size_per_partition // self.quant_config.pack_factor, + dtype=torch.int32, + ), + input_dim=0, + output_dim=1, + packed_dim=1, + packed_factor=self.quant_config.pack_factor, + weight_loader=weight_loader, + ) + + scales = GroupQuantScaleParameter( + data=torch.empty( + num_groups, + output_size_per_partition, + dtype=params_dtype, + ), + input_dim=0, + output_dim=1, + weight_loader=weight_loader, + ) + + layer.register_parameter("qweight", qweight) + layer.register_parameter("qzeros", qzeros) + layer.register_parameter("scales", scales) + + layer.input_size_per_partition = input_size_per_partition + layer.output_size_per_partition = output_size_per_partition + layer.num_groups = num_groups + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + + qweight_turbomind = unpack_awq_gemm(layer.qweight.data) + qzeros_turbomind = unpack_awq_gemm(layer.qzeros.data) + scales_turbomind = layer.scales.data + + qweight_turbomind = pack_u4_row(qweight_turbomind) + qzeros_turbomind = qzeros_turbomind.to(torch.half) + + device_id = layer.qweight.device.index + properties = torch.cuda.get_device_properties(device_id) + + def is_16xx_series(name): + import re + + pattern = r"GTX 16\d\d" + return bool(re.search(pattern, name)) + + simt = is_16xx_series(properties.name) + qweight_turbomind = qweight_turbomind.contiguous() + scales_turbomind = scales_turbomind.contiguous() + qzeros_turbomind = qzeros_turbomind.contiguous() + + self.linear = _turbomind_ext.Linear( + layer.input_size_per_partition, + layer.output_size_per_partition, + self.quant_config.weight_bits, + self.quant_config.group_size, + ) + + self.linear.post_init( + qweight_turbomind, scales_turbomind, qzeros_turbomind, simt + ) + + layer.qweight = Parameter(qweight_turbomind, requires_grad=False) + layer.scales = Parameter(scales_turbomind, requires_grad=False) + layer.qzeros = Parameter(qzeros_turbomind, requires_grad=False) + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + + x = x.view(-1, x.shape[-1]) + out_shape = x.shape[:-1] + (layer.output_size_per_partition,) + out = torch.empty( + (x.shape[0], layer.output_size_per_partition), + dtype=torch.float16, + device=x.device, + ) + stream = torch.cuda.current_stream() + + self.linear.forward(x, out, stream.cuda_stream) + out = torch.from_dlpack(out) + if bias is not None: + out.add_(bias) + + return out.view(out_shape) diff --git 
a/python/sglang/srt/layers/quantization/turbomind_utils.py b/python/sglang/srt/layers/quantization/turbomind_utils.py new file mode 100644 index 000000000000..b8d4b97d00d2 --- /dev/null +++ b/python/sglang/srt/layers/quantization/turbomind_utils.py @@ -0,0 +1,63 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +from typing import List + +import torch + +from sglang.srt.utils import get_device_capability + + +def get_u4_slices(x: torch.Tensor, dtype: torch.dtype) -> List[torch.Tensor]: + assert x.dtype == torch.int32 + xs = [] + for _ in range(8): + xs.append((x & 15).to(dtype)) + x = x >> 4 + return xs + + +def unpack_awq_gemm(x: torch.Tensor) -> torch.Tensor: + """ + The int4 weights are packed into int32: + bit: 31-28 27-24 23-20 19-16 15-12 11-8 7-4 3-0 + weight: int4_1 int4_2 int4_3 int4_4 int4_5 int4_6 int4_7 int4_8 + """ + xs = get_u4_slices(x, torch.uint8) + order = [0, 4, 1, 5, 2, 6, 3, 7] + ys = [xs[i] for i in order] + return torch.stack(ys, dim=-1).view(*x.shape[:-1], -1) + + +def pack_u4_row(x: torch.Tensor) -> torch.Tensor: + assert x.dtype == torch.uint8 + xs = x.view(*x.shape[:-1], -1, 8).split(1, dim=-1) + a = torch.zeros(xs[0].shape, dtype=torch.int32, device=x.device) + for t in reversed(xs): + a = (a << 4) | t + return a.squeeze(dim=-1) + + +def verify_turbomind_supported(quant_bit: int, group_size: int) -> bool: + + if quant_bit not in [4]: + raise NotImplementedError( + f"[Tubomind] Only 4-bit is supported for now, but got {quant_bit} bit" + ) + if group_size != 128: + raise NotImplementedError( + f"[Tubomind] Only group_size 128 is supported for now, " + f"but got group_size {group_size}" + ) + + major, minor = get_device_capability() + capability = major * 10 + minor + if capability < 70: + raise NotImplementedError( + f"[Tubomind] Only capability >= 70 is supported for now, but got {capability}" + ) + + return True + + +def is_layer_skipped_awq(prefix: str, modules_to_not_convert: List[str]): + return any(module_name in prefix for module_name in modules_to_not_convert) diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index be85a3670d40..061d320ef47b 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -375,6 +375,7 @@ def add_cli_args(parser: argparse.ArgumentParser): "marlin", "gptq_marlin", "awq_marlin", + "awq_turbomind", "bitsandbytes", "gguf", "modelopt", diff --git a/test/srt/test_turbomind_awq.py b/test/srt/test_turbomind_awq.py new file mode 100644 index 000000000000..fa2a879d4ff2 --- /dev/null +++ b/test/srt/test_turbomind_awq.py @@ -0,0 +1,47 @@ +import unittest +from types import SimpleNamespace + +from sglang.srt.utils import kill_process_tree +from sglang.test.run_eval import run_eval +from sglang.test.test_utils import ( + DEFAULT_MODEL_NAME_FOR_TEST, + DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + DEFAULT_URL_FOR_TEST, + popen_launch_server, +) + + +class TestMLA(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.model = DEFAULT_MODEL_NAME_FOR_TEST + cls.base_url = DEFAULT_URL_FOR_TEST + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=[ + "--quantization", + "awq_turbomind", + ], + ) + + @classmethod + def tearDownClass(cls): + kill_process_tree(cls.process.pid) + + def test_mgsm_en(self): + args = SimpleNamespace( + base_url=self.base_url, + model=self.model, + eval_name="mgsm_en", + num_examples=None, + num_threads=1024, + ) + + metrics = run_eval(args) + assert metrics["score"] >= 0.5 + + +if __name__ 
== "__main__": + unittest.main() From f3516c28944215c576187f94468d7a4c2546ff61 Mon Sep 17 00:00:00 2001 From: Ke Bao Date: Mon, 13 Jan 2025 20:32:17 +0800 Subject: [PATCH 039/248] Fix quant kernel accuracy issue (#2865) --- python/sglang/srt/layers/quantization/int8_kernel.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/sglang/srt/layers/quantization/int8_kernel.py b/python/sglang/srt/layers/quantization/int8_kernel.py index d1e74c6044de..91b56f9e0e9c 100644 --- a/python/sglang/srt/layers/quantization/int8_kernel.py +++ b/python/sglang/srt/layers/quantization/int8_kernel.py @@ -22,7 +22,8 @@ def _per_token_quant_int8( x = tl.load(x_ptr + row_id * stride_x + cols, mask=mask, other=0.0).to(tl.float32) absmax = tl.maximum(tl.max(tl.abs(x)), 1e-10) scale_x = absmax / 127 - x_q = tl.extra.cuda.libdevice.round(x / scale_x).to(tl.int8) + x_q = x * (127 / absmax) + x_q = tl.extra.cuda.libdevice.round(x_q).to(tl.int8) tl.store(xq_ptr + row_id * stride_xq + cols, x_q, mask=mask) tl.store(scale_ptr + row_id, scale_x) From 6249e4a19ed66afa100d55fa41997b725ff4b296 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Mon, 13 Jan 2025 04:44:39 -0800 Subject: [PATCH 040/248] Revert "Integration of TurboMind AWQ" (#2866) --- python/pyproject.toml | 2 +- python/sglang/srt/configs/model_config.py | 10 +- python/sglang/srt/layers/linear.py | 1 - .../srt/layers/quantization/__init__.py | 2 - .../srt/layers/quantization/awq_turbomind.py | 287 ------------------ .../layers/quantization/turbomind_utils.py | 63 ---- python/sglang/srt/server_args.py | 1 - test/srt/test_turbomind_awq.py | 47 --- 8 files changed, 2 insertions(+), 411 deletions(-) delete mode 100644 python/sglang/srt/layers/quantization/awq_turbomind.py delete mode 100644 python/sglang/srt/layers/quantization/turbomind_utils.py delete mode 100644 test/srt/test_turbomind_awq.py diff --git a/python/pyproject.toml b/python/pyproject.toml index c29580b50b1b..a236469a17c8 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -28,7 +28,7 @@ runtime_common = [ srt = [ "sglang[runtime_common]", "cuda-python", "sgl-kernel>=0.0.2.post11", "torch", "vllm>=0.6.3.post1,<=0.6.4.post1", - "flashinfer==0.1.6", "turbomind" + "flashinfer==0.1.6" ] # HIP (Heterogeneous-computing Interface for Portability) for AMD diff --git a/python/sglang/srt/configs/model_config.py b/python/sglang/srt/configs/model_config.py index 28144f139958..072c88b04a78 100644 --- a/python/sglang/srt/configs/model_config.py +++ b/python/sglang/srt/configs/model_config.py @@ -14,7 +14,6 @@ import json import logging -import sys from enum import IntEnum, auto from typing import List, Optional, Set, Union @@ -231,7 +230,7 @@ def _verify_quantization(self) -> None: # Parse quantization method from the HF model config, if available. 
quant_cfg = self._parse_quant_hf_config() - if quant_cfg is not None and not quantization_in_turbomind(self.quantization): + if quant_cfg is not None: quant_method = quant_cfg.get("quant_method", "").lower() # Detect which checkpoint is it @@ -402,10 +401,3 @@ def is_multimodal_model(model_architectures: List[str]): def is_encoder_decoder_model(model_architectures: List[str]): return "MllamaForConditionalGeneration" in model_architectures - - -def quantization_in_turbomind(quantization: str) -> bool: - if quantization in ["awq_turbomind"]: - return True - else: - return False diff --git a/python/sglang/srt/layers/linear.py b/python/sglang/srt/layers/linear.py index 815255d5c167..ee9386c13fa3 100644 --- a/python/sglang/srt/layers/linear.py +++ b/python/sglang/srt/layers/linear.py @@ -48,7 +48,6 @@ "GPTQLinearMethod", "FBGEMMFp8LinearMethod", "ModelOptFp8LinearMethod", - "AWQTurbomindLinearMethod", "IPEXAWQLinearMethod", ] diff --git a/python/sglang/srt/layers/quantization/__init__.py b/python/sglang/srt/layers/quantization/__init__.py index faf14d6fdd6b..35b0c4d94edb 100644 --- a/python/sglang/srt/layers/quantization/__init__.py +++ b/python/sglang/srt/layers/quantization/__init__.py @@ -20,7 +20,6 @@ from vllm.model_executor.layers.quantization.qqq import QQQConfig from vllm.model_executor.layers.quantization.tpu_int8 import Int8TpuConfig -from sglang.srt.layers.quantization.awq_turbomind import AWQTurbomindConfig from sglang.srt.layers.quantization.base_config import QuantizationConfig from sglang.srt.layers.quantization.fp8 import Fp8Config from sglang.srt.layers.quantization.modelopt_quant import ModelOptFp8Config @@ -38,7 +37,6 @@ "gptq_marlin_24": GPTQMarlin24Config, "gptq_marlin": GPTQMarlinConfig, "awq_marlin": AWQMarlinConfig, - "awq_turbomind": AWQTurbomindConfig, "gptq": GPTQConfig, "compressed-tensors": CompressedTensorsConfig, "bitsandbytes": BitsAndBytesConfig, diff --git a/python/sglang/srt/layers/quantization/awq_turbomind.py b/python/sglang/srt/layers/quantization/awq_turbomind.py deleted file mode 100644 index 007b20420973..000000000000 --- a/python/sglang/srt/layers/quantization/awq_turbomind.py +++ /dev/null @@ -1,287 +0,0 @@ -import logging -import os -import sys -from typing import Any, Dict, List, Optional - -import torch -import turbomind -from torch.nn import Parameter - -turbomind_dir = os.path.split(turbomind.__file__)[0] -sys.path.append(os.path.join(turbomind_dir, "lib")) -import _turbomind_ext -from vllm.model_executor.layers.linear import LinearBase - -from sglang.srt.layers.linear import LinearMethodBase, UnquantizedLinearMethod -from sglang.srt.layers.parameter import GroupQuantScaleParameter, PackedvLLMParameter -from sglang.srt.layers.quantization.base_config import ( - QuantizationConfig, - QuantizeMethodBase, -) -from sglang.srt.layers.quantization.turbomind_utils import ( - get_u4_slices, - is_layer_skipped_awq, - pack_u4_row, - unpack_awq_gemm, - verify_turbomind_supported, -) -from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead -from sglang.srt.utils import is_cuda, set_weight_attrs - -logger = logging.getLogger(__name__) - - -class AWQTurbomindConfig(QuantizationConfig): - """Config class for AWQ Turbomind""" - - def __init__( - self, - weight_bits: int, - group_size: int, - zero_point: bool, - lm_head_quantized: bool, - modules_to_not_convert: Optional[List[str]] = None, - ) -> None: - self.pack_factor = 32 // weight_bits # packed into int32 - self.group_size = group_size - self.zero_point = zero_point - self.lm_head_quantized = 
lm_head_quantized - self.weight_bits = weight_bits - self.modules_to_not_convert = modules_to_not_convert or [] - - verify_turbomind_supported(self.weight_bits, self.group_size) - - def __repr__(self) -> str: - return ( - f"AWQTurbomindConfig(weight_bits={self.weight_bits}, " - f"group_size={self.group_size}, " - f"zero_point={self.zero_point}, " - f"lm_head_quantized={self.lm_head_quantized}, " - f"modules_to_not_convert={self.modules_to_not_convert})" - ) - - @classmethod - def get_name(cls) -> str: - return "awq_turbomind" - - @classmethod - def get_supported_act_dtypes(cls) -> List[torch.dtype]: - return [torch.half, torch.bfloat16] - - @classmethod - def get_min_capability(cls) -> int: - return 70 - - @classmethod - def get_config_filenames(cls) -> List[str]: - return ["quantize_config.json"] - - @classmethod - def from_config(cls, config: Dict[str, Any]) -> "AWQTurbomindConfig": - weight_bits = cls.get_from_keys(config, ["bits"]) - group_size = cls.get_from_keys(config, ["group_size"]) - zero_point = cls.get_from_keys(config, ["zero_point"]) - lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], default=False) - modules_to_not_convert = cls.get_from_keys_or( - config, ["modules_to_not_convert"], None - ) - return cls( - weight_bits, - group_size, - zero_point, - lm_head_quantized, - modules_to_not_convert, - ) - - @classmethod - def override_quantization_method(cls, hf_quant_cfg, user_quant) -> Optional[str]: - can_convert = cls.is_awq_turbomind_compatible(hf_quant_cfg) - is_valid_user_quant = user_quant is None or user_quant == "awq_turbomind" - - if can_convert and is_valid_user_quant: - msg = f"The model is convertible to {cls.get_name()} during runtime. Using {cls.get_name()} kernel." - logger.info(msg) - return cls.get_name() - - if can_convert and user_quant == "awq": - logger.info( - "Detected that the model can run with awq_turbomind" - ", however you specified quantization=awq explicitly," - " so forcing awq. Use quantization=awq_turbomind for" - " faster inference" - ) - return None - - def get_quant_method( - self, layer: torch.nn.Module, prefix: str - ) -> Optional["QuantizeMethodBase"]: - if isinstance(layer, LinearBase) or ( - isinstance(layer, ParallelLMHead) and self.lm_head_quantized - ): - if is_layer_skipped_awq(prefix, self.modules_to_not_convert): - return UnquantizedLinearMethod() - return AWQTurbomindLinearMethod(self) - - return None - - @classmethod - def is_awq_turbomind_compatible(cls, quant_config: Dict[str, Any]): - if not is_cuda(): - return False - - # Extract data from quant config. - quant_method = quant_config.get("quant_method", "").lower() - num_bits = quant_config.get("bits") - group_size = quant_config.get("group_size") - zero_point = quant_config.get("zero_point") - - if quant_method != "awq": - return False - - # If we cannot find the info needed in the config, cannot convert. - if num_bits is None or group_size is None or zero_point is None: - return False - - return verify_turbomind_supported(quant_bit=num_bits, group_size=group_size) - - def get_scaled_act_names(self) -> List[str]: - return [] - - -class AWQTurbomindLinearMethod(LinearMethodBase): - """Linear method for AWQ Turbomind. - - Args: - quant_config: The AWQ Turbomind quantization config. 
- """ - - def __init__(self, quant_config: AWQTurbomindConfig) -> None: - self.quant_config = quant_config - - def create_weights( - self, - layer: torch.nn.Module, - input_size_per_partition: int, - output_partition_sizes: List[int], - input_size: int, - output_size: int, - params_dtype: torch.dtype, - **extra_weight_attrs, - ) -> None: - - output_size_per_partition = sum(output_partition_sizes) - weight_loader = extra_weight_attrs.get("weight_loader") - - # Normalize group_size - if self.quant_config.group_size != -1: - group_size = self.quant_config.group_size - else: - group_size = input_size - - qweight = PackedvLLMParameter( - data=torch.empty( - input_size_per_partition, - output_size_per_partition // self.quant_config.pack_factor, - dtype=torch.int32, - ), - input_dim=0, - output_dim=1, - packed_dim=1, - packed_factor=self.quant_config.pack_factor, - weight_loader=weight_loader, - ) - - num_groups = input_size_per_partition // group_size - - qzeros = PackedvLLMParameter( - data=torch.empty( - num_groups, - output_size_per_partition // self.quant_config.pack_factor, - dtype=torch.int32, - ), - input_dim=0, - output_dim=1, - packed_dim=1, - packed_factor=self.quant_config.pack_factor, - weight_loader=weight_loader, - ) - - scales = GroupQuantScaleParameter( - data=torch.empty( - num_groups, - output_size_per_partition, - dtype=params_dtype, - ), - input_dim=0, - output_dim=1, - weight_loader=weight_loader, - ) - - layer.register_parameter("qweight", qweight) - layer.register_parameter("qzeros", qzeros) - layer.register_parameter("scales", scales) - - layer.input_size_per_partition = input_size_per_partition - layer.output_size_per_partition = output_size_per_partition - layer.num_groups = num_groups - - def process_weights_after_loading(self, layer: torch.nn.Module) -> None: - - qweight_turbomind = unpack_awq_gemm(layer.qweight.data) - qzeros_turbomind = unpack_awq_gemm(layer.qzeros.data) - scales_turbomind = layer.scales.data - - qweight_turbomind = pack_u4_row(qweight_turbomind) - qzeros_turbomind = qzeros_turbomind.to(torch.half) - - device_id = layer.qweight.device.index - properties = torch.cuda.get_device_properties(device_id) - - def is_16xx_series(name): - import re - - pattern = r"GTX 16\d\d" - return bool(re.search(pattern, name)) - - simt = is_16xx_series(properties.name) - qweight_turbomind = qweight_turbomind.contiguous() - scales_turbomind = scales_turbomind.contiguous() - qzeros_turbomind = qzeros_turbomind.contiguous() - - self.linear = _turbomind_ext.Linear( - layer.input_size_per_partition, - layer.output_size_per_partition, - self.quant_config.weight_bits, - self.quant_config.group_size, - ) - - self.linear.post_init( - qweight_turbomind, scales_turbomind, qzeros_turbomind, simt - ) - - layer.qweight = Parameter(qweight_turbomind, requires_grad=False) - layer.scales = Parameter(scales_turbomind, requires_grad=False) - layer.qzeros = Parameter(qzeros_turbomind, requires_grad=False) - - def apply( - self, - layer: torch.nn.Module, - x: torch.Tensor, - bias: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - - x = x.view(-1, x.shape[-1]) - out_shape = x.shape[:-1] + (layer.output_size_per_partition,) - out = torch.empty( - (x.shape[0], layer.output_size_per_partition), - dtype=torch.float16, - device=x.device, - ) - stream = torch.cuda.current_stream() - - self.linear.forward(x, out, stream.cuda_stream) - out = torch.from_dlpack(out) - if bias is not None: - out.add_(bias) - - return out.view(out_shape) diff --git 
a/python/sglang/srt/layers/quantization/turbomind_utils.py b/python/sglang/srt/layers/quantization/turbomind_utils.py deleted file mode 100644 index b8d4b97d00d2..000000000000 --- a/python/sglang/srt/layers/quantization/turbomind_utils.py +++ /dev/null @@ -1,63 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. - -from typing import List - -import torch - -from sglang.srt.utils import get_device_capability - - -def get_u4_slices(x: torch.Tensor, dtype: torch.dtype) -> List[torch.Tensor]: - assert x.dtype == torch.int32 - xs = [] - for _ in range(8): - xs.append((x & 15).to(dtype)) - x = x >> 4 - return xs - - -def unpack_awq_gemm(x: torch.Tensor) -> torch.Tensor: - """ - The int4 weights are packed into int32: - bit: 31-28 27-24 23-20 19-16 15-12 11-8 7-4 3-0 - weight: int4_1 int4_2 int4_3 int4_4 int4_5 int4_6 int4_7 int4_8 - """ - xs = get_u4_slices(x, torch.uint8) - order = [0, 4, 1, 5, 2, 6, 3, 7] - ys = [xs[i] for i in order] - return torch.stack(ys, dim=-1).view(*x.shape[:-1], -1) - - -def pack_u4_row(x: torch.Tensor) -> torch.Tensor: - assert x.dtype == torch.uint8 - xs = x.view(*x.shape[:-1], -1, 8).split(1, dim=-1) - a = torch.zeros(xs[0].shape, dtype=torch.int32, device=x.device) - for t in reversed(xs): - a = (a << 4) | t - return a.squeeze(dim=-1) - - -def verify_turbomind_supported(quant_bit: int, group_size: int) -> bool: - - if quant_bit not in [4]: - raise NotImplementedError( - f"[Tubomind] Only 4-bit is supported for now, but got {quant_bit} bit" - ) - if group_size != 128: - raise NotImplementedError( - f"[Tubomind] Only group_size 128 is supported for now, " - f"but got group_size {group_size}" - ) - - major, minor = get_device_capability() - capability = major * 10 + minor - if capability < 70: - raise NotImplementedError( - f"[Tubomind] Only capability >= 70 is supported for now, but got {capability}" - ) - - return True - - -def is_layer_skipped_awq(prefix: str, modules_to_not_convert: List[str]): - return any(module_name in prefix for module_name in modules_to_not_convert) diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index 061d320ef47b..be85a3670d40 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -375,7 +375,6 @@ def add_cli_args(parser: argparse.ArgumentParser): "marlin", "gptq_marlin", "awq_marlin", - "awq_turbomind", "bitsandbytes", "gguf", "modelopt", diff --git a/test/srt/test_turbomind_awq.py b/test/srt/test_turbomind_awq.py deleted file mode 100644 index fa2a879d4ff2..000000000000 --- a/test/srt/test_turbomind_awq.py +++ /dev/null @@ -1,47 +0,0 @@ -import unittest -from types import SimpleNamespace - -from sglang.srt.utils import kill_process_tree -from sglang.test.run_eval import run_eval -from sglang.test.test_utils import ( - DEFAULT_MODEL_NAME_FOR_TEST, - DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - DEFAULT_URL_FOR_TEST, - popen_launch_server, -) - - -class TestMLA(unittest.TestCase): - @classmethod - def setUpClass(cls): - cls.model = DEFAULT_MODEL_NAME_FOR_TEST - cls.base_url = DEFAULT_URL_FOR_TEST - cls.process = popen_launch_server( - cls.model, - cls.base_url, - timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - other_args=[ - "--quantization", - "awq_turbomind", - ], - ) - - @classmethod - def tearDownClass(cls): - kill_process_tree(cls.process.pid) - - def test_mgsm_en(self): - args = SimpleNamespace( - base_url=self.base_url, - model=self.model, - eval_name="mgsm_en", - num_examples=None, - num_threads=1024, - ) - - metrics = run_eval(args) - assert metrics["score"] >= 0.5 - - -if 
__name__ == "__main__": - unittest.main() From 3b141e15097d5e436f0c5ded65a364aba3d7c043 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Mon, 13 Jan 2025 04:51:56 -0800 Subject: [PATCH 041/248] Dump requests (#2862) --- .../sglang/srt/managers/tokenizer_manager.py | 28 +++++++++++++++++++ python/sglang/srt/server_args.py | 10 +++++-- 2 files changed, 36 insertions(+), 2 deletions(-) diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py index fb6202932f0f..d12ed8c575b8 100644 --- a/python/sglang/srt/managers/tokenizer_manager.py +++ b/python/sglang/srt/managers/tokenizer_manager.py @@ -18,10 +18,12 @@ import dataclasses import logging import os +import pickle import signal import sys import time import uuid +from datetime import datetime from typing import Any, Awaitable, Dict, Generic, List, Optional, Tuple, TypeVar, Union import fastapi @@ -105,6 +107,7 @@ def __init__( # Parse args self.server_args = server_args self.enable_metrics = server_args.enable_metrics + self.dump_requsts_folder = server_args.dump_requests_folder # Init inter-process communication context = zmq.asyncio.Context(2) @@ -163,6 +166,7 @@ def __init__( # Store states self.to_create_loop = True self.rid_to_state: Dict[str, ReqState] = {} + self.dump_request_list: List[Tuple] = [] # The event to notify the weight sync is finished. self.model_update_lock = RWLock() @@ -680,6 +684,9 @@ async def handle_loop(self): if self.enable_metrics: self.collect_metrics(state, recv_obj, i) + if self.dump_requsts_folder and state.finished: + self.dump_requests(state, out_dict) + elif isinstance(recv_obj, OpenSessionReqOutput): self.session_futures[recv_obj.session_id].set_result( recv_obj.session_id if recv_obj.success else None @@ -818,6 +825,27 @@ def collect_metrics(self, state: ReqState, recv_obj: BatchStrOut, i: int): (time.time() - state.created_time) / completion_tokens ) + def dump_requests(self, state: ReqState, out_dict: dict): + self.dump_request_list.append( + (state.obj, out_dict, state.created_time, time.time()) + ) + + if len(self.dump_request_list) > int( + os.environ.get("SGLANG_DUMP_REQUESTS_THRESHOLD", "1000") + ): + to_dump = self.dump_request_list + self.dump_request_list = [] + + def background_task(): + os.makedirs(self.dump_requsts_folder, exist_ok=True) + current_time = datetime.now() + filename = current_time.strftime("%Y-%m-%d_%H-%M-%S") + ".pkl" + with open(os.path.join(self.dump_requsts_folder, filename), "wb") as f: + pickle.dump(to_dump, f) + + # Schedule the task to run in the background without awaiting it + asyncio.create_task(asyncio.to_thread(background_task)) + class SignalHandler: def __init__(self, tokenizer_manager): diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index be85a3670d40..e5c423a35188 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -23,7 +23,6 @@ import torch from sglang.srt.hf_transformers_utils import check_gguf_file -from sglang.srt.speculative.spec_info import SpeculativeAlgorithm from sglang.srt.utils import ( get_amdgpu_memory_capacity, get_hpu_memory_capacity, @@ -89,6 +88,7 @@ class ServerArgs: show_time_cost: bool = False enable_metrics: bool = False decode_log_interval: int = 40 + dump_requests_folder: str = "" # API related api_key: Optional[str] = None @@ -554,7 +554,13 @@ def add_cli_args(parser: argparse.ArgumentParser): "--decode-log-interval", type=int, default=ServerArgs.decode_log_interval, - help="The log interval of decode batch", + 
help="The log interval of decode batch.", + ) + parser.add_argument( + "--dump-requests-folder", + type=str, + default=ServerArgs.decode_log_interval, + help="Dump raw requests to a folder for replay.", ) # API related From 336ff5b9f564a1af2d8b4f1a22caf4c17c0cbbdc Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Mon, 13 Jan 2025 05:13:02 -0800 Subject: [PATCH 042/248] Fix typos in io_struct.py (#2867) --- python/sglang/srt/managers/io_struct.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/python/sglang/srt/managers/io_struct.py b/python/sglang/srt/managers/io_struct.py index 6ddc0993f9d7..26b8921c493f 100644 --- a/python/sglang/srt/managers/io_struct.py +++ b/python/sglang/srt/managers/io_struct.py @@ -355,9 +355,6 @@ class BatchStrOut: output_strs: List[str] # Token counts - # real input and output tokens can be get from - # origin_input_ids and output_ids by enabling --return_token_ids - # TODO (Shuai): Rename this to clarify the meaning. prompt_tokens: List[int] completion_tokens: List[int] cached_tokens: List[int] From d855653bd42ad8b037a6843e53171a6bb21ea420 Mon Sep 17 00:00:00 2001 From: Yineng Zhang Date: Mon, 13 Jan 2025 21:18:39 +0800 Subject: [PATCH 043/248] minor: fix release docs (#2868) --- .github/workflows/release-docs.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/release-docs.yml b/.github/workflows/release-docs.yml index 44bdfa0fa1ab..c200f5313e65 100644 --- a/.github/workflows/release-docs.yml +++ b/.github/workflows/release-docs.yml @@ -49,7 +49,7 @@ jobs: cd _build/html git clone https://$GITHUB_TOKEN@github.com/sgl-project/sgl-project.github.io.git ../sgl-project.github.io --depth 1 - find ../sgl-project.github.io/ -mindepth 1 -not -name CNAME -delete + find ../sgl-project.github.io/ -mindepth 1 -not -path "../sgl-project.github.io/.git*" -not -name CNAME -delete cp -r * ../sgl-project.github.io cp ../../README.md ../sgl-project.github.io/README.md cd ../sgl-project.github.io From 6ec75e626d5949dfca49069cd778cd4eb29d02b1 Mon Sep 17 00:00:00 2001 From: Lzhang-hub <57925599+Lzhang-hub@users.noreply.github.com> Date: Mon, 13 Jan 2025 21:29:33 +0800 Subject: [PATCH 044/248] add qwen2 eagle model (#2863) --- python/sglang/srt/models/qwen2.py | 11 ++ python/sglang/srt/models/qwen2_eagle.py | 131 ++++++++++++++++++++++++ 2 files changed, 142 insertions(+) create mode 100644 python/sglang/srt/models/qwen2_eagle.py diff --git a/python/sglang/srt/models/qwen2.py b/python/sglang/srt/models/qwen2.py index 2a20d6c50de1..e42559bbc00c 100644 --- a/python/sglang/srt/models/qwen2.py +++ b/python/sglang/srt/models/qwen2.py @@ -362,5 +362,16 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + def get_embed_and_head(self): + return self.model.embed_tokens.weight, self.lm_head.weight + + def set_embed_and_head(self, embed, head): + del self.model.embed_tokens.weight + del self.lm_head.weight + self.model.embed_tokens.weight = embed + self.lm_head.weight = head + torch.cuda.empty_cache() + torch.cuda.synchronize() + EntryClass = Qwen2ForCausalLM diff --git a/python/sglang/srt/models/qwen2_eagle.py b/python/sglang/srt/models/qwen2_eagle.py new file mode 100644 index 000000000000..01069ef482cd --- /dev/null +++ b/python/sglang/srt/models/qwen2_eagle.py @@ -0,0 +1,131 @@ +""" +Copyright 2023-2024 SGLang Team +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in 
compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +# Adapted from +# https://github.com/SafeAILab/EAGLE/blob/main/eagle/model/cnets.py +"""Inference-only LLaMA-EAGLE model compatible with HuggingFace weights.""" + +from typing import Iterable, Optional, Tuple + +import torch +from torch import nn + +from sglang.srt.layers.logits_processor import LogitsProcessor +from sglang.srt.layers.quantization.base_config import QuantizationConfig +from sglang.srt.layers.vocab_parallel_embedding import ( + ParallelLMHead, + VocabParallelEmbedding, +) +from sglang.srt.model_executor.forward_batch_info import ForwardBatch +from sglang.srt.models.qwen2 import Qwen2DecoderLayer, Qwen2ForCausalLM + +Qwen2Config = None + + +class Qwen2DecoderLayer(Qwen2DecoderLayer): + def __init__( + self, + config: Qwen2Config, + layer_id: int = 0, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__(config, layer_id, quant_config) + + # Skip the input_layernorm + # https://github.com/SafeAILab/EAGLE/blob/35c78f6cdc19a73e05cf5c330b4c358dad970c6a/eagle/model/cnets.py#L427 + if layer_id == 0: + del self.input_layernorm + setattr(self, "input_layernorm", lambda x: x) + + +class Qwen2Model(nn.Module): + def __init__( + self, + config: Qwen2Config, + quant_config: Optional[QuantizationConfig] = None, + ) -> None: + super().__init__() + self.config = config + self.vocab_size = config.vocab_size + self.embed_tokens = VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + ) + self.layers = nn.ModuleList( + [ + Qwen2DecoderLayer( + config, i, quant_config=quant_config, prefix=f"model.layers.{i}" + ) + for i in range(config.num_hidden_layers) + ] + ) + self.fc = torch.nn.Linear(config.hidden_size * 2, config.hidden_size) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + forward_batch: ForwardBatch, + input_embeds: torch.Tensor = None, + ) -> torch.Tensor: + if input_embeds is None: + hidden_states = self.embed_tokens(input_ids) + else: + hidden_states = input_embeds + + hidden_states = self.fc( + torch.cat((hidden_states, forward_batch.spec_info.hidden_states), dim=-1) + ) + + residual = None + for i in range(len(self.layers)): + layer = self.layers[i] + hidden_states, residual = layer( + positions, + hidden_states, + forward_batch, + residual, + ) + return hidden_states + residual + + +class Qwen2ForCausalLMEagle(Qwen2ForCausalLM): + def __init__( + self, + config: Qwen2Config, + quant_config: Optional[QuantizationConfig] = None, + cache_config=None, + ) -> None: + nn.Module.__init__(self) + self.config = config + self.quant_config = quant_config + self.model = Qwen2Model(config, quant_config=quant_config) + if self.config.tie_word_embeddings: + self.lm_head = self.model.embed_tokens + else: + self.lm_head = ParallelLMHead( + config.vocab_size, config.hidden_size, quant_config=quant_config + ) + self.logits_processor = LogitsProcessor(config) + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + for name, loaded_weight in weights: + if "lm_head" not in name: + name = "model." 
+ name + super().load_weights([(name, loaded_weight)]) + + +EntryClass = [Qwen2ForCausalLMEagle] From c1e097ca669838f2bc09655612cc9d38fc55a275 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Mon, 13 Jan 2025 06:21:25 -0800 Subject: [PATCH 045/248] Revert "Dump requests to a folder" (#2869) --- .../sglang/srt/managers/tokenizer_manager.py | 28 ------------------- python/sglang/srt/server_args.py | 10 ++----- 2 files changed, 2 insertions(+), 36 deletions(-) diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py index d12ed8c575b8..fb6202932f0f 100644 --- a/python/sglang/srt/managers/tokenizer_manager.py +++ b/python/sglang/srt/managers/tokenizer_manager.py @@ -18,12 +18,10 @@ import dataclasses import logging import os -import pickle import signal import sys import time import uuid -from datetime import datetime from typing import Any, Awaitable, Dict, Generic, List, Optional, Tuple, TypeVar, Union import fastapi @@ -107,7 +105,6 @@ def __init__( # Parse args self.server_args = server_args self.enable_metrics = server_args.enable_metrics - self.dump_requsts_folder = server_args.dump_requests_folder # Init inter-process communication context = zmq.asyncio.Context(2) @@ -166,7 +163,6 @@ def __init__( # Store states self.to_create_loop = True self.rid_to_state: Dict[str, ReqState] = {} - self.dump_request_list: List[Tuple] = [] # The event to notify the weight sync is finished. self.model_update_lock = RWLock() @@ -684,9 +680,6 @@ async def handle_loop(self): if self.enable_metrics: self.collect_metrics(state, recv_obj, i) - if self.dump_requsts_folder and state.finished: - self.dump_requests(state, out_dict) - elif isinstance(recv_obj, OpenSessionReqOutput): self.session_futures[recv_obj.session_id].set_result( recv_obj.session_id if recv_obj.success else None @@ -825,27 +818,6 @@ def collect_metrics(self, state: ReqState, recv_obj: BatchStrOut, i: int): (time.time() - state.created_time) / completion_tokens ) - def dump_requests(self, state: ReqState, out_dict: dict): - self.dump_request_list.append( - (state.obj, out_dict, state.created_time, time.time()) - ) - - if len(self.dump_request_list) > int( - os.environ.get("SGLANG_DUMP_REQUESTS_THRESHOLD", "1000") - ): - to_dump = self.dump_request_list - self.dump_request_list = [] - - def background_task(): - os.makedirs(self.dump_requsts_folder, exist_ok=True) - current_time = datetime.now() - filename = current_time.strftime("%Y-%m-%d_%H-%M-%S") + ".pkl" - with open(os.path.join(self.dump_requsts_folder, filename), "wb") as f: - pickle.dump(to_dump, f) - - # Schedule the task to run in the background without awaiting it - asyncio.create_task(asyncio.to_thread(background_task)) - class SignalHandler: def __init__(self, tokenizer_manager): diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index e5c423a35188..be85a3670d40 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -23,6 +23,7 @@ import torch from sglang.srt.hf_transformers_utils import check_gguf_file +from sglang.srt.speculative.spec_info import SpeculativeAlgorithm from sglang.srt.utils import ( get_amdgpu_memory_capacity, get_hpu_memory_capacity, @@ -88,7 +89,6 @@ class ServerArgs: show_time_cost: bool = False enable_metrics: bool = False decode_log_interval: int = 40 - dump_requests_folder: str = "" # API related api_key: Optional[str] = None @@ -554,13 +554,7 @@ def add_cli_args(parser: argparse.ArgumentParser): "--decode-log-interval", type=int, 
default=ServerArgs.decode_log_interval, - help="The log interval of decode batch.", - ) - parser.add_argument( - "--dump-requests-folder", - type=str, - default=ServerArgs.decode_log_interval, - help="Dump raw requests to a folder for replay.", + help="The log interval of decode batch", ) # API related From d08c77c434981534267d13ef78c22a817ac08775 Mon Sep 17 00:00:00 2001 From: Xiaoyu Zhang <35585791+BBuf@users.noreply.github.com> Date: Mon, 13 Jan 2025 23:09:00 +0800 Subject: [PATCH 046/248] Sampling penalties memory interface (#2870) --- ... benchmark_deepseekv3_moe_align_blocks.py} | 3 +- python/pyproject.toml | 2 +- .../penalizers/repetition_penalty.py | 20 ++- .../srt/sampling/sampling_batch_info.py | 19 ++- python/sglang/srt/utils.py | 4 + .../benchmark_sampling_scaling_penalties.py | 159 ++++++++++++++++++ sgl-kernel/tests/test_moe_align.py | 95 +++++++---- 7 files changed, 256 insertions(+), 46 deletions(-) rename benchmark/kernels/fused_moe_triton/{benchmark_moe_align_blocks.py => benchmark_deepseekv3_moe_align_blocks.py} (98%) create mode 100644 sgl-kernel/benchmark/benchmark_sampling_scaling_penalties.py diff --git a/benchmark/kernels/fused_moe_triton/benchmark_moe_align_blocks.py b/benchmark/kernels/fused_moe_triton/benchmark_deepseekv3_moe_align_blocks.py similarity index 98% rename from benchmark/kernels/fused_moe_triton/benchmark_moe_align_blocks.py rename to benchmark/kernels/fused_moe_triton/benchmark_deepseekv3_moe_align_blocks.py index 92547ea95ae2..0a6049a1200c 100644 --- a/benchmark/kernels/fused_moe_triton/benchmark_moe_align_blocks.py +++ b/benchmark/kernels/fused_moe_triton/benchmark_deepseekv3_moe_align_blocks.py @@ -222,8 +222,9 @@ def calculate_diff(batch_size, seq_len): def benchmark(batch_size, seq_len, provider): num_experts = 256 block_size = 128 + topk = 8 topk_ids = torch.randint( - 0, num_experts, (batch_size, seq_len), dtype=torch.int32, device="cuda" + 0, num_experts, (batch_size * seq_len, topk), dtype=torch.int32, device="cuda" ) max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1) diff --git a/python/pyproject.toml b/python/pyproject.toml index a236469a17c8..4b627ae94785 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -27,7 +27,7 @@ runtime_common = [ ] srt = [ "sglang[runtime_common]", "cuda-python", - "sgl-kernel>=0.0.2.post11", "torch", "vllm>=0.6.3.post1,<=0.6.4.post1", + "sgl-kernel>=0.0.2.post12", "torch", "vllm>=0.6.3.post1,<=0.6.4.post1", "flashinfer==0.1.6" ] diff --git a/python/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py b/python/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py index 4c293b89520d..fcd5ff71c233 100644 --- a/python/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +++ b/python/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py @@ -3,6 +3,11 @@ import torch from sglang.srt.sampling.penaltylib.orchestrator import _BatchedPenalizer, _TokenIDs +from sglang.srt.utils import is_cuda_available + +is_cuda = is_cuda_available() +if is_cuda: + from sgl_kernel import sampling_scaling_penalties class BatchedRepetitionPenalizer(_BatchedPenalizer): @@ -56,11 +61,16 @@ def _cumulate_output_tokens(self, output_ids: _TokenIDs): self.cumulated_repetition_penalties[mask] = self.repetition_penalties[mask] def _apply(self, logits: torch.Tensor) -> torch.Tensor: - return torch.where( - logits > 0, - logits / self.cumulated_repetition_penalties, - logits * self.cumulated_repetition_penalties, - ) + if is_cuda: + return sampling_scaling_penalties( + 
logits, self.cumulated_repetition_penalties + ) + else: + return torch.where( + logits > 0, + logits / self.cumulated_repetition_penalties, + logits * self.cumulated_repetition_penalties, + ) def _filter(self, indices_to_keep: List[int], indices_tensor_to_keep: torch.Tensor): self.repetition_penalties = self.repetition_penalties[indices_tensor_to_keep] diff --git a/python/sglang/srt/sampling/sampling_batch_info.py b/python/sglang/srt/sampling/sampling_batch_info.py index 9497e53d3092..6eda63c706a3 100644 --- a/python/sglang/srt/sampling/sampling_batch_info.py +++ b/python/sglang/srt/sampling/sampling_batch_info.py @@ -7,6 +7,12 @@ import torch +from sglang.srt.utils import is_cuda_available + +is_cuda = is_cuda_available() +if is_cuda: + from sgl_kernel import sampling_scaling_penalties + import sglang.srt.sampling.penaltylib as penaltylib logger = logging.getLogger(__name__) @@ -245,11 +251,14 @@ def apply_logits_bias(self, logits: torch.Tensor): # repetition if self.scaling_penalties is not None: - logits[:] = torch.where( - logits > 0, - logits / self.scaling_penalties, - logits * self.scaling_penalties, - ) + if is_cuda: + logits[:] = sampling_scaling_penalties(logits, self.scaling_penalties) + else: + logits[:] = torch.where( + logits > 0, + logits / self.scaling_penalties, + logits * self.scaling_penalties, + ) # Apply regex vocab_mask if self.vocab_mask is not None: diff --git a/python/sglang/srt/utils.py b/python/sglang/srt/utils.py index 51ca91a96b0d..e70e6b42526d 100644 --- a/python/sglang/srt/utils.py +++ b/python/sglang/srt/utils.py @@ -97,6 +97,10 @@ def is_flashinfer_available(): return torch.cuda.is_available() and torch.version.cuda +def is_cuda_available(): + return torch.cuda.is_available() and torch.version.cuda + + def is_ipv6(address): try: ipaddress.IPv6Address(address) diff --git a/sgl-kernel/benchmark/benchmark_sampling_scaling_penalties.py b/sgl-kernel/benchmark/benchmark_sampling_scaling_penalties.py new file mode 100644 index 000000000000..000dab0d8e9a --- /dev/null +++ b/sgl-kernel/benchmark/benchmark_sampling_scaling_penalties.py @@ -0,0 +1,159 @@ +import itertools + +import torch +import triton +from sgl_kernel import sampling_scaling_penalties + + +def sampling_scaling_penalties_naive(logits, scaling_penalties): + return torch.where( + logits > 0, logits / scaling_penalties, logits * scaling_penalties + ) + + +def sampling_scaling_penalties_kernel(logits, scaling_penalties): + return sampling_scaling_penalties(logits, scaling_penalties) + + +def test_memory(func, _iter): + total_mem = [] + + for _ in range(_iter): + torch.cuda.memory.reset_peak_memory_stats() + func() + mem = torch.cuda.max_memory_allocated() / (2**20) + total_mem.append(mem) + + return sum(total_mem) / len(total_mem) + + +def calculate_diff(batch_size, vocab_size): + dtype = torch.bfloat16 + device = torch.device("cuda") + + logits = torch.randn(batch_size, vocab_size, device=device, dtype=dtype) + scaling_penalties = ( + torch.rand(batch_size, vocab_size, device=device, dtype=dtype) + 0.5 + ) + + output_naive = sampling_scaling_penalties_naive( + logits.clone(), scaling_penalties.clone() + ) + output_kernel = sampling_scaling_penalties_kernel( + logits.clone(), scaling_penalties.clone() + ) + + print(f"Naive output={output_naive}") + print(f"Kernel output={output_kernel}") + + if torch.allclose(output_naive, output_kernel, atol=1e-2, rtol=1e-2): + print("✅ Both implementations match") + else: + print("❌ Implementations differ") + + +batch_size_range = [2**i for i in range(0, 12)] 
+vocab_size_range = [2**i for i in range(10, 17)] +configs = list(itertools.product(batch_size_range, vocab_size_range)) + + +@triton.testing.perf_report( + triton.testing.Benchmark( + x_names=["batch_size", "vocab_size"], + x_vals=[list(_) for _ in configs], + line_arg="provider", + line_vals=["naive", "kernel"], + line_names=["PyTorch Naive", "SGL Kernel"], + styles=[("blue", "-"), ("red", "-")], + ylabel="us", + plot_name="sampling-scaling-penalties-performance", + args={}, + ) +) +def benchmark(batch_size, vocab_size, provider): + dtype = torch.bfloat16 + device = torch.device("cuda") + + logits = torch.randn(batch_size, vocab_size, device=device, dtype=dtype) + scaling_penalties = ( + torch.rand(batch_size, vocab_size, device=device, dtype=dtype) + 0.5 + ) + + quantiles = [0.5, 0.2, 0.8] + + if provider == "naive": + ms, min_ms, max_ms = triton.testing.do_bench( + lambda: sampling_scaling_penalties_naive( + logits.clone(), + scaling_penalties.clone(), + ), + quantiles=quantiles, + ) + else: + ms, min_ms, max_ms = triton.testing.do_bench( + lambda: sampling_scaling_penalties_kernel( + logits.clone(), + scaling_penalties.clone(), + ), + quantiles=quantiles, + ) + + return 1000 * ms, 1000 * max_ms, 1000 * min_ms + + +@triton.testing.perf_report( + triton.testing.Benchmark( + x_names=["batch_size", "vocab_size"], + x_vals=[list(_) for _ in configs], + line_arg="provider", + line_vals=["naive", "kernel"], + line_names=["PyTorch Naive", "SGL Kernel"], + styles=[("blue", "-"), ("red", "-")], + ylabel="GPU memory usage (MB)", + plot_name="sampling-scaling-penalties-memory", + args={}, + ) +) +def benchmark_memory(batch_size, vocab_size, provider): + dtype = torch.bfloat16 + device = torch.device("cuda") + + print( + f"Running memory benchmark with batch_size={batch_size}, vocab_size={vocab_size}, provider={provider}" + ) + + def run_kernel(): + logits = torch.randn(batch_size, vocab_size, device=device, dtype=dtype) + scaling_penalties = ( + torch.rand(batch_size, vocab_size, device=device, dtype=dtype) + 0.5 + ) + + if provider == "naive": + return sampling_scaling_penalties_naive(logits, scaling_penalties) + else: + return sampling_scaling_penalties_kernel(logits, scaling_penalties) + + mem = test_memory(run_kernel, _iter=10) + return mem + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument( + "--save_path", + type=str, + default="./configs/benchmark_ops/sampling_scaling_penalties/", + help="Path to save sampling_scaling_penalties benchmark results", + ) + args = parser.parse_args() + + # Run correctness test + calculate_diff(batch_size=4, vocab_size=4096) + + # Run performance benchmark + benchmark.run(print_data=True, save_path=args.save_path) + + # Run memory benchmark + benchmark_memory.run(print_data=True, save_path=args.save_path) diff --git a/sgl-kernel/tests/test_moe_align.py b/sgl-kernel/tests/test_moe_align.py index 92596a47e5db..2fca90b2f561 100644 --- a/sgl-kernel/tests/test_moe_align.py +++ b/sgl-kernel/tests/test_moe_align.py @@ -3,38 +3,65 @@ def test_moe_align_block_size(): + # For DeepSeek V3, we have 256 experts num_experts = 256 - block_size = 128 - topk_ids = torch.randint(0, num_experts, (3, 4), dtype=torch.int32, device="cuda") - - max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1) - sorted_ids = torch.empty( - (max_num_tokens_padded,), dtype=torch.int32, device=topk_ids.device - ) - sorted_ids.fill_(topk_ids.numel()) - max_num_m_blocks = max_num_tokens_padded // block_size - expert_ids = 
torch.empty( - (max_num_m_blocks,), dtype=torch.int32, device=topk_ids.device - ) - num_tokens_post_pad = torch.empty((1), dtype=torch.int32, device=topk_ids.device) - - token_cnts_buffer = torch.empty( - (num_experts + 1) * num_experts, dtype=torch.int32, device=topk_ids.device - ) - cumsum_buffer = torch.empty( - num_experts + 1, dtype=torch.int32, device=topk_ids.device - ) - - moe_align_block_size( - topk_ids, - num_experts, - block_size, - sorted_ids, - expert_ids, - num_tokens_post_pad, - token_cnts_buffer, - cumsum_buffer, - ) - - -test_moe_align_block_size() + + # Test different combinations of block_size, num_tokens and topk + for block_size in [32, 64, 128, 256]: + print(f"\nTesting block_size={block_size}") + for num_tokens in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096]: + for topk in [1, 2, 4, 8, 16, 32, 64]: + print( + f"Testing block_size={block_size}, num_tokens={num_tokens}, topk={topk}" + ) + + # Create random topk_ids with shape [num_tokens, topk] + topk_ids = torch.randint( + 0, num_experts, (num_tokens, topk), dtype=torch.int32, device="cuda" + ) + + max_num_tokens_padded = topk_ids.numel() + num_experts * ( + block_size - 1 + ) + sorted_ids = torch.empty( + (max_num_tokens_padded,), dtype=torch.int32, device=topk_ids.device + ) + sorted_ids.fill_(topk_ids.numel()) + max_num_m_blocks = max_num_tokens_padded // block_size + expert_ids = torch.empty( + (max_num_m_blocks,), dtype=torch.int32, device=topk_ids.device + ) + num_tokens_post_pad = torch.empty( + (1), dtype=torch.int32, device=topk_ids.device + ) + + token_cnts_buffer = torch.empty( + (num_experts + 1) * num_experts, + dtype=torch.int32, + device=topk_ids.device, + ) + cumsum_buffer = torch.empty( + num_experts + 1, dtype=torch.int32, device=topk_ids.device + ) + + try: + moe_align_block_size( + topk_ids, + num_experts, + block_size, + sorted_ids, + expert_ids, + num_tokens_post_pad, + token_cnts_buffer, + cumsum_buffer, + ) + except Exception as e: + print( + f"Error occurred with block_size={block_size}, num_tokens={num_tokens}, topk={topk}" + ) + print(f"Error message: {str(e)}") + raise e + + +if __name__ == "__main__": + test_moe_align_block_size() From 923f518337ed4ec878a215ecc6193f8634e3b785 Mon Sep 17 00:00:00 2001 From: fzyzcjy <5236035+fzyzcjy@users.noreply.github.com> Date: Tue, 14 Jan 2025 03:38:51 +0800 Subject: [PATCH 047/248] CUDA-graph-compatible releasing and resuming KV cache and model weight memory (#2630) --- python/pyproject.toml | 1 + python/sglang/srt/managers/io_struct.py | 24 +++- python/sglang/srt/managers/scheduler.py | 43 ++++++ .../sglang/srt/managers/tokenizer_manager.py | 32 +++++ python/sglang/srt/mem_cache/memory_pool.py | 130 +++++++++++------- .../sglang/srt/model_executor/model_runner.py | 22 ++- python/sglang/srt/server.py | 48 ++++++- python/sglang/srt/server_args.py | 7 +- python/sglang/torch_memory_saver_adapter.py | 59 ++++++++ scripts/ci_install_dependency.sh | 3 +- test/srt/run_suite.py | 1 + test/srt/test_release_memory_occupation.py | 98 +++++++++++++ 12 files changed, 407 insertions(+), 61 deletions(-) create mode 100644 python/sglang/torch_memory_saver_adapter.py create mode 100644 test/srt/test_release_memory_occupation.py diff --git a/python/pyproject.toml b/python/pyproject.toml index 4b627ae94785..61a36e34132e 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -44,6 +44,7 @@ srt_hpu = ["sglang[runtime_common]"] openai = ["openai>=1.0", "tiktoken"] anthropic = ["anthropic>=0.20.0"] litellm = ["litellm>=1.0.0"] +torch_memory_saver = 
["torch_memory_saver"] test = [ "jsonlines", "matplotlib", diff --git a/python/sglang/srt/managers/io_struct.py b/python/sglang/srt/managers/io_struct.py index 26b8921c493f..ec45696bf5fd 100644 --- a/python/sglang/srt/managers/io_struct.py +++ b/python/sglang/srt/managers/io_struct.py @@ -19,9 +19,7 @@ import uuid from dataclasses import dataclass from enum import Enum -from typing import Dict, List, Optional, Tuple, Union - -import torch +from typing import Dict, List, Optional, Union from sglang.srt.managers.schedule_batch import BaseFinishReason from sglang.srt.sampling.sampling_params import SamplingParams @@ -459,6 +457,26 @@ class GetWeightsByNameReqOutput: parameter: list +@dataclass +class ReleaseMemoryOccupationReqInput: + pass + + +@dataclass +class ReleaseMemoryOccupationReqOutput: + pass + + +@dataclass +class ResumeMemoryOccupationReqInput: + pass + + +@dataclass +class ResumeMemoryOccupationReqOutput: + pass + + @dataclass class AbortReq: # The request id diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index 1c07ea6adb75..b9e74aa9d93d 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -47,6 +47,10 @@ OpenSessionReqInput, OpenSessionReqOutput, ProfileReq, + ReleaseMemoryOccupationReqInput, + ReleaseMemoryOccupationReqOutput, + ResumeMemoryOccupationReqInput, + ResumeMemoryOccupationReqOutput, TokenizedEmbeddingReqInput, TokenizedGenerateReqInput, UpdateWeightFromDiskReqInput, @@ -88,6 +92,7 @@ set_random_seed, suppress_other_loggers, ) +from sglang.torch_memory_saver_adapter import TorchMemorySaverAdapter from sglang.utils import get_exception_traceback logger = logging.getLogger(__name__) @@ -357,6 +362,10 @@ def __init__( t.start() self.parent_process = psutil.Process().parent() + self.memory_saver_adapter = TorchMemorySaverAdapter.create( + enable=server_args.enable_memory_saver + ) + # Init profiler if os.getenv("SGLANG_TORCH_PROFILER_DIR", "") == "": self.profiler = None @@ -519,6 +528,12 @@ def process_input_requests(self, recv_reqs: List): elif isinstance(recv_req, GetWeightsByNameReqInput): parameter = self.get_weights_by_name(recv_req) self.send_to_tokenizer.send_pyobj(GetWeightsByNameReqOutput(parameter)) + elif isinstance(recv_req, ReleaseMemoryOccupationReqInput): + self.release_memory_occupation() + self.send_to_tokenizer.send_pyobj(ReleaseMemoryOccupationReqOutput()) + elif isinstance(recv_req, ResumeMemoryOccupationReqInput): + self.resume_memory_occupation() + self.send_to_tokenizer.send_pyobj(ResumeMemoryOccupationReqOutput()) elif isinstance(recv_req, ProfileReq): if recv_req == ProfileReq.START_PROFILE: self.start_profile() @@ -1538,6 +1553,20 @@ def get_weights_by_name(self, recv_req: GetWeightsByNameReqInput): parameter = self.tp_worker.get_weights_by_name(recv_req) return parameter + def release_memory_occupation(self): + self.stashed_model_static_state = _export_static_state( + self.tp_worker.worker.model_runner.model + ) + self.memory_saver_adapter.pause() + self.flush_cache() + + def resume_memory_occupation(self): + self.memory_saver_adapter.resume() + _import_static_state( + self.tp_worker.worker.model_runner.model, self.stashed_model_static_state + ) + del self.stashed_model_static_state + def start_profile(self) -> None: if self.profiler is None: raise RuntimeError("Profiler is not enabled.") @@ -1576,6 +1605,20 @@ def close_session(self, recv_req: CloseSessionReqInput): del self.sessions[session_id] +def _export_static_state(model): + return dict( + 
buffers=[ + (name, buffer.detach().clone()) for name, buffer in model.named_buffers() + ] + ) + + +def _import_static_state(model, static_params): + self_named_buffers = dict(model.named_buffers()) + for name, tensor in static_params["buffers"]: + self_named_buffers[name][...] = tensor + + def run_scheduler_process( server_args: ServerArgs, port_args: PortArgs, diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py index fb6202932f0f..33968e34fe47 100644 --- a/python/sglang/srt/managers/tokenizer_manager.py +++ b/python/sglang/srt/managers/tokenizer_manager.py @@ -53,6 +53,10 @@ OpenSessionReqInput, OpenSessionReqOutput, ProfileReq, + ReleaseMemoryOccupationReqInput, + ReleaseMemoryOccupationReqOutput, + ResumeMemoryOccupationReqInput, + ResumeMemoryOccupationReqOutput, SessionParams, TokenizedEmbeddingReqInput, TokenizedGenerateReqInput, @@ -188,6 +192,12 @@ def __init__( self.get_weights_by_name_communicator = _Communicator( self.send_to_scheduler, server_args.dp_size ) + self.release_memory_occupation_communicator = _Communicator( + self.send_to_scheduler, server_args.dp_size + ) + self.resume_memory_occupation_communicator = _Communicator( + self.send_to_scheduler, server_args.dp_size + ) # Metrics if self.enable_metrics: @@ -548,6 +558,22 @@ async def get_weights_by_name( else: return all_parameters + async def release_memory_occupation( + self, + obj: ReleaseMemoryOccupationReqInput, + request: Optional[fastapi.Request] = None, + ): + self.auto_create_handle_loop() + await self.release_memory_occupation_communicator(obj) + + async def resume_memory_occupation( + self, + obj: ResumeMemoryOccupationReqInput, + request: Optional[fastapi.Request] = None, + ): + self.auto_create_handle_loop() + await self.resume_memory_occupation_communicator(obj) + async def open_session( self, obj: OpenSessionReqInput, request: Optional[fastapi.Request] = None ): @@ -627,6 +653,8 @@ async def handle_loop(self): UpdateWeightsFromDistributedReqOutput, GetWeightsByNameReqOutput, InitWeightsUpdateGroupReqOutput, + ReleaseMemoryOccupationReqOutput, + ResumeMemoryOccupationReqOutput, ] = await self.recv_from_detokenizer.recv_pyobj() if isinstance(recv_obj, (BatchStrOut, BatchEmbeddingOut, BatchTokenIDOut)): @@ -709,6 +737,10 @@ async def handle_loop(self): self.update_weights_from_tensor_communicator.handle_recv(recv_obj) elif isinstance(recv_obj, GetWeightsByNameReqOutput): self.get_weights_by_name_communicator.handle_recv(recv_obj) + elif isinstance(recv_obj, ReleaseMemoryOccupationReqOutput): + self.release_memory_occupation_communicator.handle_recv(recv_obj) + elif isinstance(recv_obj, ResumeMemoryOccupationReqOutput): + self.resume_memory_occupation_communicator.handle_recv(recv_obj) else: raise ValueError(f"Invalid object: {recv_obj=}") diff --git a/python/sglang/srt/mem_cache/memory_pool.py b/python/sglang/srt/mem_cache/memory_pool.py index abee7764bebf..0761169e40e5 100644 --- a/python/sglang/srt/mem_cache/memory_pool.py +++ b/python/sglang/srt/mem_cache/memory_pool.py @@ -13,6 +13,8 @@ limitations under the License. """ +from sglang.torch_memory_saver_adapter import TorchMemorySaverAdapter + """ Memory pool. 
@@ -42,13 +44,25 @@ class ReqToTokenPool: """A memory pool that maps a request to its token locations.""" - def __init__(self, size: int, max_context_len: int, device: str, use_records: bool): + def __init__( + self, + size: int, + max_context_len: int, + device: str, + use_records: bool, + enable_memory_saver: bool, + ): + memory_saver_adapter = TorchMemorySaverAdapter.create( + enable=enable_memory_saver + ) + self.size = size self.max_context_len = max_context_len self.device = device - self.req_to_token = torch.zeros( - (size, max_context_len), dtype=torch.int32, device=device - ) + with memory_saver_adapter.region(): + self.req_to_token = torch.zeros( + (size, max_context_len), dtype=torch.int32, device=device + ) self.free_slots = list(range(size)) self.write_records = [] self.use_records = use_records @@ -189,8 +203,14 @@ def __init__( head_dim: int, layer_num: int, device: str, + enable_memory_saver: bool, ): super().__init__(size, dtype, device) + + self.memory_saver_adapter = TorchMemorySaverAdapter.create( + enable=enable_memory_saver + ) + self.head_num = head_num self.head_dim = head_dim self.layer_num = layer_num @@ -202,24 +222,25 @@ def __init__( ) def _create_buffers(self): - # [size, head_num, head_dim] for each layer - # The padded slot 0 is used for writing dummy outputs from padded tokens. - self.k_buffer = [ - torch.empty( - (self.size + 1, self.head_num, self.head_dim), - dtype=self.store_dtype, - device=self.device, - ) - for _ in range(self.layer_num) - ] - self.v_buffer = [ - torch.empty( - (self.size + 1, self.head_num, self.head_dim), - dtype=self.store_dtype, - device=self.device, - ) - for _ in range(self.layer_num) - ] + with self.memory_saver_adapter.region(): + # [size, head_num, head_dim] for each layer + # The padded slot 0 is used for writing dummy outputs from padded tokens. + self.k_buffer = [ + torch.empty( + (self.size + 1, self.head_num, self.head_dim), + dtype=self.store_dtype, + device=self.device, + ) + for _ in range(self.layer_num) + ] + self.v_buffer = [ + torch.empty( + (self.size + 1, self.head_num, self.head_dim), + dtype=self.store_dtype, + device=self.device, + ) + for _ in range(self.layer_num) + ] def _clear_buffers(self): del self.k_buffer @@ -307,19 +328,26 @@ def __init__( qk_rope_head_dim: int, layer_num: int, device: str, + enable_memory_saver: bool, ): super().__init__(size, dtype, device) self.kv_lora_rank = kv_lora_rank - # The padded slot 0 is used for writing dummy outputs from padded tokens. - self.kv_buffer = [ - torch.empty( - (size + 1, 1, kv_lora_rank + qk_rope_head_dim), - dtype=self.store_dtype, - device=device, - ) - for _ in range(layer_num) - ] + + memory_saver_adapter = TorchMemorySaverAdapter.create( + enable=enable_memory_saver + ) + + with memory_saver_adapter.region(): + # The padded slot 0 is used for writing dummy outputs from padded tokens. 
+ self.kv_buffer = [ + torch.empty( + (size + 1, 1, kv_lora_rank + qk_rope_head_dim), + dtype=self.store_dtype, + device=device, + ) + for _ in range(layer_num) + ] def get_key_buffer(self, layer_id: int): if self.store_dtype != self.dtype: @@ -360,26 +388,32 @@ def __init__( layer_num: int, device: str, heavy_channel_num: int, + enable_memory_saver: bool, ): super().__init__(size, dtype, device) - # [size, head_num, head_dim] for each layer - self.k_buffer = [ - torch.empty((size + 1, head_num, head_dim), dtype=dtype, device=device) - for _ in range(layer_num) - ] - self.v_buffer = [ - torch.empty((size + 1, head_num, head_dim), dtype=dtype, device=device) - for _ in range(layer_num) - ] - - # [size, head_num, heavy_channel_num] for each layer - self.label_buffer = [ - torch.empty( - (size + 1, head_num, heavy_channel_num), dtype=dtype, device=device - ) - for _ in range(layer_num) - ] + memory_saver_adapter = TorchMemorySaverAdapter.create( + enable=enable_memory_saver + ) + + with memory_saver_adapter.region(): + # [size, head_num, head_dim] for each layer + self.k_buffer = [ + torch.empty((size + 1, head_num, head_dim), dtype=dtype, device=device) + for _ in range(layer_num) + ] + self.v_buffer = [ + torch.empty((size + 1, head_num, head_dim), dtype=dtype, device=device) + for _ in range(layer_num) + ] + + # [size, head_num, heavy_channel_num] for each layer + self.label_buffer = [ + torch.empty( + (size + 1, head_num, heavy_channel_num), dtype=dtype, device=device + ) + for _ in range(layer_num) + ] def get_key_buffer(self, layer_id: int): return self.k_buffer[layer_id] diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index d46a2c0dc725..190427649312 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -60,6 +60,7 @@ monkey_patch_vllm_p2p_access_check, set_cpu_offload_max_bytes, ) +from sglang.torch_memory_saver_adapter import TorchMemorySaverAdapter logger = logging.getLogger(__name__) @@ -166,6 +167,10 @@ def __init__( # Get memory before model loading min_per_gpu_memory = self.init_torch_distributed() + self.memory_saver_adapter = TorchMemorySaverAdapter.create( + enable=self.server_args.enable_memory_saver + ) + # Load the model self.sampler = Sampler() self.load_model() @@ -272,11 +277,12 @@ def load_model(self): monkey_patch_vllm_gguf_config() # Load the model - self.model = get_model( - model_config=self.model_config, - load_config=self.load_config, - device_config=DeviceConfig(self.device), - ) + with self.memory_saver_adapter.region(): + self.model = get_model( + model_config=self.model_config, + load_config=self.load_config, + device_config=DeviceConfig(self.device), + ) if self.server_args.kv_cache_dtype == "fp8_e4m3": if self.server_args.quantization_param_path is not None: @@ -417,7 +423,7 @@ def init_weights_update_group( logger.info( f"init custom process group: master_address={master_address}, master_port={master_port}, " - f"rank_offset={rank_offset}, world_size={world_size}, group_name={group_name}, backend={backend}" + f"rank_offset={rank_offset}, rank={rank}, world_size={world_size}, group_name={group_name}, backend={backend}" ) try: @@ -590,6 +596,7 @@ def init_memory_pool( max_context_len=self.model_config.context_len + 4, device=self.device, use_records=False, + enable_memory_saver=self.server_args.enable_memory_saver, ) if ( self.model_config.attention_arch == AttentionArch.MLA @@ -602,6 +609,7 @@ def init_memory_pool( 
qk_rope_head_dim=self.model_config.qk_rope_head_dim, layer_num=self.model_config.num_hidden_layers, device=self.device, + enable_memory_saver=self.server_args.enable_memory_saver, ) elif self.server_args.enable_double_sparsity: self.token_to_kv_pool = DoubleSparseTokenToKVPool( @@ -612,6 +620,7 @@ def init_memory_pool( layer_num=self.model_config.num_hidden_layers, device=self.device, heavy_channel_num=self.server_args.ds_heavy_channel_num, + enable_memory_saver=self.server_args.enable_memory_saver, ) else: self.token_to_kv_pool = MHATokenToKVPool( @@ -621,6 +630,7 @@ def init_memory_pool( head_dim=self.model_config.head_dim, layer_num=self.model_config.num_hidden_layers, device=self.device, + enable_memory_saver=self.server_args.enable_memory_saver, ) logger.info( f"Memory pool end. " diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py index fa1625b09595..4e837e5389ba 100644 --- a/python/sglang/srt/server.py +++ b/python/sglang/srt/server.py @@ -31,6 +31,8 @@ import torch +from sglang.torch_memory_saver_adapter import TorchMemorySaverAdapter + # Fix a bug of Python threading setattr(threading, "_register_atexit", lambda *args, **kwargs: None) @@ -57,6 +59,8 @@ GetWeightsByNameReqInput, InitWeightsUpdateGroupReqInput, OpenSessionReqInput, + ReleaseMemoryOccupationReqInput, + ResumeMemoryOccupationReqInput, UpdateWeightFromDiskReqInput, UpdateWeightsFromDistributedReqInput, UpdateWeightsFromTensorReqInput, @@ -255,6 +259,28 @@ async def get_weights_by_name(obj: GetWeightsByNameReqInput, request: Request): return _create_error_response(e) +@app.api_route("/release_memory_occupation", methods=["GET", "POST"]) +async def release_memory_occupation( + obj: ReleaseMemoryOccupationReqInput, request: Request +): + """Release GPU occupation temporarily""" + try: + await tokenizer_manager.release_memory_occupation(obj, request) + except Exception as e: + return _create_error_response(e) + + +@app.api_route("/resume_memory_occupation", methods=["GET", "POST"]) +async def resume_memory_occupation( + obj: ResumeMemoryOccupationReqInput, request: Request +): + """Resume GPU occupation""" + try: + await tokenizer_manager.resume_memory_occupation(obj, request) + except Exception as e: + return _create_error_response(e) + + @app.api_route("/open_session", methods=["GET", "POST"]) async def open_session(obj: OpenSessionReqInput, request: Request): """Open a session, and return its unique session id.""" @@ -438,6 +464,10 @@ def launch_engine( server_args.model_path, server_args.tokenizer_path ) + memory_saver_adapter = TorchMemorySaverAdapter.create( + enable=server_args.enable_memory_saver + ) + if server_args.dp_size == 1: # Launch tensor parallel scheduler processes scheduler_procs = [] @@ -454,7 +484,8 @@ def launch_engine( target=run_scheduler_process, args=(server_args, port_args, gpu_id, tp_rank, None, writer), ) - proc.start() + with memory_saver_adapter.configure_subprocess(): + proc.start() scheduler_procs.append(proc) scheduler_pipe_readers.append(reader) @@ -471,7 +502,8 @@ def launch_engine( target=run_data_parallel_controller_process, args=(server_args, port_args, writer), ) - proc.start() + with memory_saver_adapter.configure_subprocess(): + proc.start() # Launch detokenizer process detoken_proc = mp.Process( @@ -897,6 +929,18 @@ def get_weights_by_name(self, name, truncate_size=100): loop = asyncio.get_event_loop() return loop.run_until_complete(tokenizer_manager.get_weights_by_name(obj, None)) + def release_memory_occupation(self): + """Release GPU occupation temporarily""" 
+ obj = ReleaseMemoryOccupationReqInput() + loop = asyncio.get_event_loop() + loop.run_until_complete(tokenizer_manager.release_memory_occupation(obj, None)) + + def resume_memory_occupation(self): + """Resume GPU occupation""" + obj = ResumeMemoryOccupationReqInput() + loop = asyncio.get_event_loop() + loop.run_until_complete(tokenizer_manager.resume_memory_occupation(obj, None)) + class Runtime: """ diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index be85a3670d40..4f44d5c877dc 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -23,7 +23,6 @@ import torch from sglang.srt.hf_transformers_utils import check_gguf_file -from sglang.srt.speculative.spec_info import SpeculativeAlgorithm from sglang.srt.utils import ( get_amdgpu_memory_capacity, get_hpu_memory_capacity, @@ -157,6 +156,7 @@ class ServerArgs: triton_attention_num_kv_splits: int = 8 num_continuous_decode_steps: int = 1 delete_ckpt_after_loading: bool = False + enable_memory_saver: bool = False def __post_init__(self): # Set missing default values @@ -854,6 +854,11 @@ def add_cli_args(parser: argparse.ArgumentParser): action="store_true", help="Delete the model checkpoint after loading the model.", ) + parser.add_argument( + "--enable-memory-saver", + action="store_true", + help="Allow saving memory using release_memory_occupation and resume_memory_occupation", + ) @classmethod def from_cli_args(cls, args: argparse.Namespace): diff --git a/python/sglang/torch_memory_saver_adapter.py b/python/sglang/torch_memory_saver_adapter.py new file mode 100644 index 000000000000..31f8ebf2f077 --- /dev/null +++ b/python/sglang/torch_memory_saver_adapter.py @@ -0,0 +1,59 @@ +from abc import ABC +from contextlib import contextmanager + +try: + import torch_memory_saver + + _primary_memory_saver = torch_memory_saver.TorchMemorySaver() +except ImportError: + pass + + +class TorchMemorySaverAdapter(ABC): + @staticmethod + def create(enable: bool): + return ( + _TorchMemorySaverAdapterReal() if enable else _TorchMemorySaverAdapterNoop() + ) + + def configure_subprocess(self): + raise NotImplementedError + + def region(self): + raise NotImplementedError + + def pause(self): + raise NotImplementedError + + def resume(self): + raise NotImplementedError + + +class _TorchMemorySaverAdapterReal(TorchMemorySaverAdapter): + def configure_subprocess(self): + return torch_memory_saver.configure_subprocess() + + def region(self): + return _primary_memory_saver.region() + + def pause(self): + return _primary_memory_saver.pause() + + def resume(self): + return _primary_memory_saver.resume() + + +class _TorchMemorySaverAdapterNoop(TorchMemorySaverAdapter): + @contextmanager + def configure_subprocess(self): + yield + + @contextmanager + def region(self): + yield + + def pause(self): + pass + + def resume(self): + pass diff --git a/scripts/ci_install_dependency.sh b/scripts/ci_install_dependency.sh index 26c34879e9ba..66b113f61976 100755 --- a/scripts/ci_install_dependency.sh +++ b/scripts/ci_install_dependency.sh @@ -12,8 +12,9 @@ bash "${SCRIPT_DIR}/killall_sglang.sh" pip install --upgrade pip pip install -e "python[all]" --find-links https://flashinfer.ai/whl/cu124/torch2.4/flashinfer/ -# Force reinstall flashinfer +# Force reinstall flashinfer and torch_memory_saver pip install flashinfer==0.1.6 --find-links ${FLASHINFER_REPO} --force-reinstall --no-deps +pip install torch_memory_saver --force-reinstall pip install transformers==4.45.2 sentence_transformers accelerate peft diff --git 
a/test/srt/run_suite.py b/test/srt/run_suite.py index d617fcf69e62..658b3d2f8158 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -29,6 +29,7 @@ "test_openai_server.py", "test_pytorch_sampling_backend.py", "test_radix_attention.py", + "test_release_memory_occupation.py", "test_retract_decode.py", "test_server_args.py", "test_session_control.py", diff --git a/test/srt/test_release_memory_occupation.py b/test/srt/test_release_memory_occupation.py new file mode 100644 index 000000000000..c84b64e77dfe --- /dev/null +++ b/test/srt/test_release_memory_occupation.py @@ -0,0 +1,98 @@ +import time +import unittest + +import torch +from transformers import AutoModelForCausalLM + +import sglang as sgl +from sglang.test.test_utils import DEFAULT_SMALL_MODEL_NAME_FOR_TEST + +# (temporarily) set to true to observe memory usage in nvidia-smi more clearly +_DEBUG_EXTRA = True + + +class TestReleaseMemoryOccupation(unittest.TestCase): + def test_release_and_resume_occupation(self): + prompt = "Today is a sunny day and I like" + sampling_params = {"temperature": 0, "max_new_tokens": 8} + model_name = DEFAULT_SMALL_MODEL_NAME_FOR_TEST + expect_output = " to spend it outdoors. I decided to" + + engine = sgl.Engine( + model_path=model_name, + random_seed=42, + enable_memory_saver=True, + # disable_cuda_graph=True, # for debugging only + ) + hf_model_new = AutoModelForCausalLM.from_pretrained( + model_name, torch_dtype="bfloat16" + ) + + print("generate (#1)") + outputs = engine.generate(prompt, sampling_params)["text"] + self.assertEqual(outputs, expect_output) + + if _DEBUG_EXTRA: + time.sleep(3) + + self.assertEqual( + _try_allocate_big_tensor(), + False, + "Should not be able to allocate big tensors before releasing", + ) + + print("release_memory_occupation start") + t = time.time() + engine.release_memory_occupation() + if _DEBUG_EXTRA: + print("release_memory_occupation", time.time() - t) + + if _DEBUG_EXTRA: + time.sleep(5) + + self.assertEqual( + _try_allocate_big_tensor(), + True, + "Should be able to allocate big tensors aftre releasing", + ) + + if _DEBUG_EXTRA: + time.sleep(5) + + print("resume_memory_occupation start") + t = time.time() + engine.resume_memory_occupation() + if _DEBUG_EXTRA: + print("resume_memory_occupation", time.time() - t) + + self.assertEqual( + _try_allocate_big_tensor(), + False, + "Should not be able to allocate big tensors after resuming", + ) + + print("update_weights_from_tensor") + # As if: PPO has updated hf model's weights, and now we sync it to SGLang + engine.update_weights_from_tensor(list(hf_model_new.named_parameters())) + + print("generate (#2)") + outputs = engine.generate(prompt, sampling_params)["text"] + self.assertEqual(outputs, expect_output) + + if _DEBUG_EXTRA: + time.sleep(4) + + engine.shutdown() + + +def _try_allocate_big_tensor(size: int = 20_000_000_000): + try: + torch.empty((size,), dtype=torch.uint8, device="cuda") + torch.cuda.empty_cache() + return True + except torch.cuda.OutOfMemoryError: + return False + + +if __name__ == "__main__": + unittest.main() From 46d44318894a13dc6d018892b32dd4a7e09f20f7 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Mon, 13 Jan 2025 14:24:00 -0800 Subject: [PATCH 048/248] Add a new api configure_logging to allow dumping the requests (#2875) --- 3rdparty/amd/profiling/PROFILING.md | 2 +- 3rdparty/amd/profiling/server.sh | 2 +- 3rdparty/amd/tuning/TUNING.md | 2 +- benchmark/blog_v0_2/405b_sglang.sh | 2 +- .../sglang/srt/managers/configure_logging.py | 43 ++++++ 
python/sglang/srt/managers/io_struct.py | 7 + python/sglang/srt/managers/scheduler.py | 2 +- .../sglang/srt/managers/tokenizer_manager.py | 41 +++++- python/sglang/srt/mem_cache/memory_pool.py | 2 +- .../sglang/srt/model_executor/model_runner.py | 2 +- python/sglang/srt/server.py | 126 +++++++++--------- python/sglang/srt/server_args.py | 4 +- .../{ => srt}/torch_memory_saver_adapter.py | 0 13 files changed, 164 insertions(+), 71 deletions(-) create mode 100644 python/sglang/srt/managers/configure_logging.py rename python/sglang/{ => srt}/torch_memory_saver_adapter.py (100%) diff --git a/3rdparty/amd/profiling/PROFILING.md b/3rdparty/amd/profiling/PROFILING.md index 79bc75b503bc..7e15ec844f2b 100644 --- a/3rdparty/amd/profiling/PROFILING.md +++ b/3rdparty/amd/profiling/PROFILING.md @@ -336,7 +336,7 @@ loadTracer.sh python3 -m sglang.launch_server \ --model-path /sgl-workspace/sglang/dummy_grok1 \ --tokenizer-path Xenova/grok-1-tokenizer \ --load-format dummy \ - --quant fp8 \ + --quantization fp8 \ --tp 8 \ --port 30000 \ --disable-radix-cache 2>&1 | tee "$LOGFILE" diff --git a/3rdparty/amd/profiling/server.sh b/3rdparty/amd/profiling/server.sh index aa574f64c940..f877e6c7acd4 100755 --- a/3rdparty/amd/profiling/server.sh +++ b/3rdparty/amd/profiling/server.sh @@ -14,7 +14,7 @@ loadTracer.sh python3 -m sglang.launch_server \ --model-path /sgl-workspace/sglang/dummy_grok1 \ --tokenizer-path Xenova/grok-1-tokenizer \ --load-format dummy \ - --quant fp8 \ + --quantization fp8 \ --tp 8 \ --port 30000 \ --disable-radix-cache 2>&1 | tee "$LOGFILE" diff --git a/3rdparty/amd/tuning/TUNING.md b/3rdparty/amd/tuning/TUNING.md index a38a16d4f7a5..0638041c9743 100644 --- a/3rdparty/amd/tuning/TUNING.md +++ b/3rdparty/amd/tuning/TUNING.md @@ -104,7 +104,7 @@ To maximize moe kernel efficiency, need to use below scripts to find out the bes ```bash #Tuning -#for example, we have one case like this "python3 -m sglang.bench_latency --model dummy_grok1/ --load-format dummy --tokenizer-path Xenova/grok-1-tokenizer --tp 8 --batch-size 32 --input 1024 --output 8 --attention-backend triton --sampling-backend pytorch --quant fp" to run, it defined batch-size 32 input lenth 1024 and output length 8, from "--batch" in moe view point, the prefill batch is 32*1024 = 32768, the decode batch is 32*1(only one output token generated in each run). +#for example, we have one case like this "python3 -m sglang.bench_latency --model dummy_grok1/ --load-format dummy --tokenizer-path Xenova/grok-1-tokenizer --tp 8 --batch-size 32 --input 1024 --output 8 --attention-backend triton --sampling-backend pytorch --quantization fp8" to run, it defined batch-size 32 input lenth 1024 and output length 8, from "--batch" in moe view point, the prefill batch is 32*1024 = 32768, the decode batch is 32*1(only one output token generated in each run). 
#so we can tune decode moe use below command python benchmark_moe_rocm.py --model grok1 --tp-size 8 --dtype float8 --batch "32" # and use this command to tune prefill moe diff --git a/benchmark/blog_v0_2/405b_sglang.sh b/benchmark/blog_v0_2/405b_sglang.sh index 4e3372ae8c70..491853782805 100644 --- a/benchmark/blog_v0_2/405b_sglang.sh +++ b/benchmark/blog_v0_2/405b_sglang.sh @@ -6,7 +6,7 @@ # wget https://huggingface.co/neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8/resolve/main/tokenizer_config.json # Launch sglang -# python -m sglang.launch_server --model-path ~/llama-3.1-405b-fp8-dummy/ --load-format dummy --tp 8 --quant fp8 --disable-radix --mem-frac 0.87 +# python -m sglang.launch_server --model-path ~/llama-3.1-405b-fp8-dummy/ --load-format dummy --tp 8 --quantization fp8 --disable-radix --mem-frac 0.87 # offline python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 3000 --random-input 1024 --random-output 1024 > sglang_log11 diff --git a/python/sglang/srt/managers/configure_logging.py b/python/sglang/srt/managers/configure_logging.py new file mode 100644 index 000000000000..3351cdc400ce --- /dev/null +++ b/python/sglang/srt/managers/configure_logging.py @@ -0,0 +1,43 @@ +""" +Copyright 2023-2025 SGLang Team +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +""" +Configure the logging settings of a server. 
+ +Usage: +python3 -m sglang.srt.managers.configure_logging --url http://localhost:30000 +""" + +import argparse + +import requests + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--url", type=str, default="http://localhost:30000") + parser.add_argument( + "--dump-requests-folder", type=str, default="/tmp/sglang_request_dump" + ) + parser.add_argument("--dump-requests-threshold", type=int, default=1000) + args = parser.parse_args() + + response = requests.post( + args.url + "/configure_logging", + json={ + "dump_requests_folder": args.dump_requests_folder, + "dump_requests_threshold": args.dump_requests_threshold, + }, + ) + assert response.status_code == 200 diff --git a/python/sglang/srt/managers/io_struct.py b/python/sglang/srt/managers/io_struct.py index ec45696bf5fd..075693c7bc90 100644 --- a/python/sglang/srt/managers/io_struct.py +++ b/python/sglang/srt/managers/io_struct.py @@ -488,6 +488,13 @@ class ProfileReq(Enum): STOP_PROFILE = 2 +@dataclass +class ConfigureLoggingReq: + log_requests: Optional[bool] = None + dump_requests_folder: Optional[str] = None + dump_requests_threshold: Optional[int] = None + + @dataclass class OpenSessionReqInput: capacity_of_str_len: int diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index b9e74aa9d93d..187216353171 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -82,6 +82,7 @@ from sglang.srt.model_executor.forward_batch_info import ForwardMode from sglang.srt.server_args import PortArgs, ServerArgs from sglang.srt.speculative.spec_info import SpeculativeAlgorithm +from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter from sglang.srt.utils import ( broadcast_pyobj, configure_logger, @@ -92,7 +93,6 @@ set_random_seed, suppress_other_loggers, ) -from sglang.torch_memory_saver_adapter import TorchMemorySaverAdapter from sglang.utils import get_exception_traceback logger = logging.getLogger(__name__) diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py index 33968e34fe47..acd3b674a455 100644 --- a/python/sglang/srt/managers/tokenizer_manager.py +++ b/python/sglang/srt/managers/tokenizer_manager.py @@ -18,10 +18,12 @@ import dataclasses import logging import os +import pickle import signal import sys import time import uuid +from datetime import datetime from typing import Any, Awaitable, Dict, Generic, List, Optional, Tuple, TypeVar, Union import fastapi @@ -43,6 +45,7 @@ BatchStrOut, BatchTokenIDOut, CloseSessionReqInput, + ConfigureLoggingReq, EmbeddingReqInput, FlushCacheReq, GenerateReqInput, @@ -109,6 +112,7 @@ def __init__( # Parse args self.server_args = server_args self.enable_metrics = server_args.enable_metrics + self.log_requests = server_args.log_requests # Init inter-process communication context = zmq.asyncio.Context(2) @@ -167,6 +171,9 @@ def __init__( # Store states self.to_create_loop = True self.rid_to_state: Dict[str, ReqState] = {} + self.dump_requests_folder = "" # By default do not dump + self.dump_requests_threshold = 1000 + self.dump_request_list: List[Tuple] = [] # The event to notify the weight sync is finished. 
self.model_update_lock = RWLock() @@ -225,7 +232,7 @@ async def generate_request( obj.normalize_batch_and_arguments() - if self.server_args.log_requests: + if self.log_requests: logger.info(f"Receive: obj={dataclass_to_string_truncated(obj)}") async with self.model_update_lock.reader_lock: @@ -346,7 +353,7 @@ async def _wait_one_response( state.out_list = [] if state.finished: - if self.server_args.log_requests: + if self.log_requests: msg = f"Finish: obj={dataclass_to_string_truncated(obj)}, out={dataclass_to_string_truncated(out)}" logger.info(msg) del self.rid_to_state[obj.rid] @@ -597,6 +604,15 @@ async def close_session( assert not self.to_create_loop, "close session should not be the first request" await self.send_to_scheduler.send_pyobj(obj) + def configure_logging(self, obj: ConfigureLoggingReq): + if obj.log_requests is not None: + self.log_requests = obj.log_requests + if obj.dump_requests_folder is not None: + self.dump_requests_folder = obj.dump_requests_folder + if obj.dump_requests_threshold is not None: + self.dump_requests_threshold = obj.dump_requests_threshold + logging.info(f"Config logging: {obj=}") + def create_abort_task(self, obj: GenerateReqInput): # Abort the request if the client is disconnected. async def abort_request(): @@ -708,6 +724,8 @@ async def handle_loop(self): if self.enable_metrics: self.collect_metrics(state, recv_obj, i) + if self.dump_requests_folder and state.finished: + self.dump_requests(state, out_dict) elif isinstance(recv_obj, OpenSessionReqOutput): self.session_futures[recv_obj.session_id].set_result( recv_obj.session_id if recv_obj.success else None @@ -850,6 +868,25 @@ def collect_metrics(self, state: ReqState, recv_obj: BatchStrOut, i: int): (time.time() - state.created_time) / completion_tokens ) + def dump_requests(self, state: ReqState, out_dict: dict): + self.dump_request_list.append( + (state.obj, out_dict, state.created_time, time.time()) + ) + + if len(self.dump_request_list) >= self.dump_requests_threshold: + to_dump = self.dump_request_list + self.dump_request_list = [] + + def background_task(): + os.makedirs(self.dump_requests_folder, exist_ok=True) + current_time = datetime.now() + filename = current_time.strftime("%Y-%m-%d_%H-%M-%S") + ".pkl" + with open(os.path.join(self.dump_requests_folder, filename), "wb") as f: + pickle.dump(to_dump, f) + + # Schedule the task to run in the background without awaiting it + asyncio.create_task(asyncio.to_thread(background_task)) + class SignalHandler: def __init__(self, tokenizer_manager): diff --git a/python/sglang/srt/mem_cache/memory_pool.py b/python/sglang/srt/mem_cache/memory_pool.py index 0761169e40e5..ab27e81b7430 100644 --- a/python/sglang/srt/mem_cache/memory_pool.py +++ b/python/sglang/srt/mem_cache/memory_pool.py @@ -13,7 +13,7 @@ limitations under the License. """ -from sglang.torch_memory_saver_adapter import TorchMemorySaverAdapter +from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter """ Memory pool. 
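For reference, the `dump_requests` helper added to `tokenizer_manager.py` above writes each batch of finished requests as a pickle file named `YYYY-MM-DD_HH-MM-SS.pkl` inside `dump_requests_folder`, holding a list of `(request, output, created_time, finished_time)` tuples. A minimal sketch of reading such a dump back for offline inspection (the folder path is the default from `configure_logging.py`; `sglang` must be importable so the pickled request dataclasses can be loaded — adjust to your own setup):

```python
import glob
import os
import pickle

dump_folder = "/tmp/sglang_request_dump"  # default used by configure_logging.py

for path in sorted(glob.glob(os.path.join(dump_folder, "*.pkl"))):
    with open(path, "rb") as f:
        # Each file holds a list of (obj, out_dict, created_time, finished_time) tuples.
        records = pickle.load(f)
    for obj, out, created, finished in records:
        print(f"{os.path.basename(path)}: rid={getattr(obj, 'rid', None)} "
              f"latency={finished - created:.3f}s")
```
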
diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index 190427649312..238f8603ac95 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -50,6 +50,7 @@ from sglang.srt.model_loader import get_model from sglang.srt.server_args import ServerArgs from sglang.srt.speculative.spec_info import SpeculativeAlgorithm +from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter from sglang.srt.utils import ( enable_show_time_cost, get_available_gpu_memory, @@ -60,7 +61,6 @@ monkey_patch_vllm_p2p_access_check, set_cpu_offload_max_bytes, ) -from sglang.torch_memory_saver_adapter import TorchMemorySaverAdapter logger = logging.getLogger(__name__) diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py index 4e837e5389ba..93fe1304caff 100644 --- a/python/sglang/srt/server.py +++ b/python/sglang/srt/server.py @@ -31,7 +31,7 @@ import torch -from sglang.torch_memory_saver_adapter import TorchMemorySaverAdapter +from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter # Fix a bug of Python threading setattr(threading, "_register_atexit", lambda *args, **kwargs: None) @@ -54,6 +54,7 @@ from sglang.srt.managers.detokenizer_manager import run_detokenizer_process from sglang.srt.managers.io_struct import ( CloseSessionReqInput, + ConfigureLoggingReq, EmbeddingReqInput, GenerateReqInput, GetWeightsByNameReqInput, @@ -161,12 +162,68 @@ async def get_model_info(): @app.get("/get_server_info") async def get_server_info(): return { - **dataclasses.asdict(tokenizer_manager.server_args), # server args + **dataclasses.asdict(tokenizer_manager.server_args), **scheduler_info, "version": __version__, } +# fastapi implicitly converts json in the request to obj (dataclass) +@app.api_route("/generate", methods=["POST", "PUT"]) +@time_func_latency +async def generate_request(obj: GenerateReqInput, request: Request): + """Handle a generate request.""" + if obj.stream: + + async def stream_results() -> AsyncIterator[bytes]: + try: + async for out in tokenizer_manager.generate_request(obj, request): + yield b"data: " + orjson.dumps( + out, option=orjson.OPT_NON_STR_KEYS + ) + b"\n\n" + except ValueError as e: + out = {"error": {"message": str(e)}} + yield b"data: " + orjson.dumps( + out, option=orjson.OPT_NON_STR_KEYS + ) + b"\n\n" + yield b"data: [DONE]\n\n" + + return StreamingResponse( + stream_results(), + media_type="text/event-stream", + background=tokenizer_manager.create_abort_task(obj), + ) + else: + try: + ret = await tokenizer_manager.generate_request(obj, request).__anext__() + return ret + except ValueError as e: + logger.error(f"Error: {e}") + return _create_error_response(e) + + +@app.api_route("/encode", methods=["POST", "PUT"]) +@time_func_latency +async def encode_request(obj: EmbeddingReqInput, request: Request): + """Handle an embedding request.""" + try: + ret = await tokenizer_manager.generate_request(obj, request).__anext__() + return ret + except ValueError as e: + return _create_error_response(e) + + +@app.api_route("/classify", methods=["POST", "PUT"]) +@time_func_latency +async def classify_request(obj: EmbeddingReqInput, request: Request): + """Handle a reward model request. 
Now the arguments and return values are the same as embedding models.""" + try: + ret = await tokenizer_manager.generate_request(obj, request).__anext__() + return ret + except ValueError as e: + return _create_error_response(e) + + @app.post("/flush_cache") async def flush_cache(): """Flush the radix cache.""" @@ -178,8 +235,7 @@ async def flush_cache(): ) -@app.get("/start_profile") -@app.post("/start_profile") +@app.api_route("/start_profile", methods=["GET", "POST"]) async def start_profile_async(): """Start profiling.""" tokenizer_manager.start_profile() @@ -189,8 +245,7 @@ async def start_profile_async(): ) -@app.get("/stop_profile") -@app.post("/stop_profile") +@app.api_route("/stop_profile", methods=["GET", "POST"]) async def stop_profile_async(): """Stop profiling.""" tokenizer_manager.stop_profile() @@ -305,60 +360,11 @@ async def close_session(obj: CloseSessionReqInput, request: Request): return _create_error_response(e) -# fastapi implicitly converts json in the request to obj (dataclass) -@app.api_route("/generate", methods=["POST", "PUT"]) -@time_func_latency -async def generate_request(obj: GenerateReqInput, request: Request): - """Handle a generate request.""" - if obj.stream: - - async def stream_results() -> AsyncIterator[bytes]: - try: - async for out in tokenizer_manager.generate_request(obj, request): - yield b"data: " + orjson.dumps( - out, option=orjson.OPT_NON_STR_KEYS - ) + b"\n\n" - except ValueError as e: - out = {"error": {"message": str(e)}} - yield b"data: " + orjson.dumps( - out, option=orjson.OPT_NON_STR_KEYS - ) + b"\n\n" - yield b"data: [DONE]\n\n" - - return StreamingResponse( - stream_results(), - media_type="text/event-stream", - background=tokenizer_manager.create_abort_task(obj), - ) - else: - try: - ret = await tokenizer_manager.generate_request(obj, request).__anext__() - return ret - except ValueError as e: - logger.error(f"Error: {e}") - return _create_error_response(e) - - -@app.api_route("/encode", methods=["POST", "PUT"]) -@time_func_latency -async def encode_request(obj: EmbeddingReqInput, request: Request): - """Handle an embedding request.""" - try: - ret = await tokenizer_manager.generate_request(obj, request).__anext__() - return ret - except ValueError as e: - return _create_error_response(e) - - -@app.api_route("/classify", methods=["POST", "PUT"]) -@time_func_latency -async def classify_request(obj: EmbeddingReqInput, request: Request): - """Handle a reward model request. 
Now the arguments and return values are the same as embedding models.""" - try: - ret = await tokenizer_manager.generate_request(obj, request).__anext__() - return ret - except ValueError as e: - return _create_error_response(e) +@app.api_route("/configure_logging", methods=["GET", "POST"]) +async def configure_logging(obj: ConfigureLoggingReq, request: Request): + """Close the session""" + tokenizer_manager.configure_logging(obj) + return Response(status_code=200) ##### OpenAI-compatible API endpoints ##### diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index 4f44d5c877dc..57a82c18a331 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -91,7 +91,7 @@ class ServerArgs: # API related api_key: Optional[str] = None - file_storage_pth: str = "SGLang_storage" + file_storage_pth: str = "sglang_storage" enable_cache_report: bool = False # Data parallelism @@ -554,7 +554,7 @@ def add_cli_args(parser: argparse.ArgumentParser): "--decode-log-interval", type=int, default=ServerArgs.decode_log_interval, - help="The log interval of decode batch", + help="The log interval of decode batch.", ) # API related diff --git a/python/sglang/torch_memory_saver_adapter.py b/python/sglang/srt/torch_memory_saver_adapter.py similarity index 100% rename from python/sglang/torch_memory_saver_adapter.py rename to python/sglang/srt/torch_memory_saver_adapter.py From 80002562a8158b5c531f2ab81155da313a2a5cd6 Mon Sep 17 00:00:00 2001 From: Yineng Zhang Date: Tue, 14 Jan 2025 12:48:17 +0800 Subject: [PATCH 049/248] docs: update README (#2878) --- .github/workflows/release-docs.yml | 2 +- benchmark/deepseek_v3/README.md | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/release-docs.yml b/.github/workflows/release-docs.yml index c200f5313e65..84138f7430e0 100644 --- a/.github/workflows/release-docs.yml +++ b/.github/workflows/release-docs.yml @@ -49,7 +49,7 @@ jobs: cd _build/html git clone https://$GITHUB_TOKEN@github.com/sgl-project/sgl-project.github.io.git ../sgl-project.github.io --depth 1 - find ../sgl-project.github.io/ -mindepth 1 -not -path "../sgl-project.github.io/.git*" -not -name CNAME -delete + find ../sgl-project.github.io/ -mindepth 1 -not -path "../sgl-project.github.io/.git*" -not -name CNAME -not -name ".jekyll" -not -name ".nojekyll" -delete cp -r * ../sgl-project.github.io cp ../../README.md ../sgl-project.github.io/README.md cd ../sgl-project.github.io diff --git a/benchmark/deepseek_v3/README.md b/benchmark/deepseek_v3/README.md index e7ad8d33609c..ea972831a368 100644 --- a/benchmark/deepseek_v3/README.md +++ b/benchmark/deepseek_v3/README.md @@ -60,7 +60,9 @@ print(response) ``` ### Example: Serving with two H20*8 nodes -For example, there are two H20 nodes, each with 8 GPUs. The first node's IP is `10.0.0.1`, and the second node's IP is `10.0.0.2`. +For example, there are two H20 nodes, each with 8 GPUs. The first node's IP is `10.0.0.1`, and the second node's IP is `10.0.0.2`. Please **use the first node's IP** for both commands. + +If the command fails, try setting the `GLOO_SOCKET_IFNAME` parameter. For more information, see [Common Environment Variables](https://pytorch.org/docs/stable/distributed.html#common-environment-variables). 
```bash # node 1 From c19d84829c7de194d3965cb0edd414de24c145d8 Mon Sep 17 00:00:00 2001 From: Ke Bao Date: Tue, 14 Jan 2025 13:34:22 +0800 Subject: [PATCH 050/248] Adjust flashinfer workspace size for Qwen2 models (#2879) --- python/sglang/srt/layers/attention/flashinfer_backend.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/sglang/srt/layers/attention/flashinfer_backend.py b/python/sglang/srt/layers/attention/flashinfer_backend.py index f038394628fd..6a4636128103 100644 --- a/python/sglang/srt/layers/attention/flashinfer_backend.py +++ b/python/sglang/srt/layers/attention/flashinfer_backend.py @@ -84,6 +84,10 @@ def __init__(self, model_runner: ModelRunner): self.num_wrappers = 1 self.dispatch_reason = None + # Qwen2 models require higher flashinfer workspace size + if "Qwen2ForCausalLM" in model_runner.model_config.hf_config.architectures: + global_config.flashinfer_workspace_size = 512 * 1024 * 1024 + # Allocate buffers self.workspace_buffer = torch.empty( global_config.flashinfer_workspace_size, From b8cd09f27aaee18f90424f8baf74e936269428a0 Mon Sep 17 00:00:00 2001 From: kk <43161300+kkHuang-amd@users.noreply.github.com> Date: Tue, 14 Jan 2025 16:59:43 +0800 Subject: [PATCH 051/248] update ROCm docker for layernorm kernel optimization (#2885) Co-authored-by: wunhuang --- docker/Dockerfile.rocm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index 2ad62d2d493d..e71cd1694029 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -2,7 +2,7 @@ # docker build --build-arg SGL_BRANCH=v0.4.1.post5 -t v0.4.1.post5-rocm620 -f Dockerfile.rocm . # default base image -ARG BASE_IMAGE="rocmshared/vllm-rocm:20250113-tuned-elementwise" +ARG BASE_IMAGE="rocmshared/vllm-rocm:20250114-tuned-elementwise-layernorm" FROM $BASE_IMAGE AS base USER root From cc0485bef29831f2fcf707ecc1a371be0c7bc816 Mon Sep 17 00:00:00 2001 From: Ke Bao Date: Tue, 14 Jan 2025 17:07:49 +0800 Subject: [PATCH 052/248] Support w8a8 int8 quantization config (#2881) --- python/sglang/srt/configs/model_config.py | 21 +++- .../srt/layers/quantization/__init__.py | 2 + .../srt/layers/quantization/w8a8_int8.py | 117 ++++++++++++++++++ python/sglang/srt/server_args.py | 1 + 4 files changed, 135 insertions(+), 6 deletions(-) create mode 100644 python/sglang/srt/layers/quantization/w8a8_int8.py diff --git a/python/sglang/srt/configs/model_config.py b/python/sglang/srt/configs/model_config.py index 072c88b04a78..d087a2f2348c 100644 --- a/python/sglang/srt/configs/model_config.py +++ b/python/sglang/srt/configs/model_config.py @@ -223,7 +223,11 @@ def _verify_quantization(self) -> None: "compressed_tensors", "compressed-tensors", "experts_int8", + "w8a8_int8", ] + compatible_quantization_methods = { + "w8a8_int8": ["compressed-tensors", "compressed_tensors"] + } if self.quantization is not None: self.quantization = self.quantization.lower() @@ -247,12 +251,17 @@ def _verify_quantization(self) -> None: if self.quantization is None: self.quantization = quant_method elif self.quantization != quant_method: - raise ValueError( - "Quantization method specified in the model config " - f"({quant_method}) does not match the quantization " - f"method specified in the `quantization` argument " - f"({self.quantization})." 
- ) + if ( + self.quantization not in compatible_quantization_methods + or quant_method + not in compatible_quantization_methods[self.quantization] + ): + raise ValueError( + "Quantization method specified in the model config " + f"({quant_method}) does not match the quantization " + f"method specified in the `quantization` argument " + f"({self.quantization})." + ) if self.quantization is not None: if self.quantization not in supported_quantization: diff --git a/python/sglang/srt/layers/quantization/__init__.py b/python/sglang/srt/layers/quantization/__init__.py index 35b0c4d94edb..1a39e800633c 100644 --- a/python/sglang/srt/layers/quantization/__init__.py +++ b/python/sglang/srt/layers/quantization/__init__.py @@ -23,6 +23,7 @@ from sglang.srt.layers.quantization.base_config import QuantizationConfig from sglang.srt.layers.quantization.fp8 import Fp8Config from sglang.srt.layers.quantization.modelopt_quant import ModelOptFp8Config +from sglang.srt.layers.quantization.w8a8_int8 import W8A8Int8Config QUANTIZATION_METHODS: Dict[str, Type[QuantizationConfig]] = { "aqlm": AQLMConfig, @@ -42,6 +43,7 @@ "bitsandbytes": BitsAndBytesConfig, "qqq": QQQConfig, "experts_int8": ExpertsInt8Config, + "w8a8_int8": W8A8Int8Config, } diff --git a/python/sglang/srt/layers/quantization/w8a8_int8.py b/python/sglang/srt/layers/quantization/w8a8_int8.py new file mode 100644 index 000000000000..0c39393b70a9 --- /dev/null +++ b/python/sglang/srt/layers/quantization/w8a8_int8.py @@ -0,0 +1,117 @@ +from typing import Any, Dict, List, Optional + +import torch + +from sglang.srt.utils import is_cuda_available + +is_cuda = is_cuda_available() +if is_cuda: + from sgl_kernel import int8_scaled_mm + +from torch.nn.parameter import Parameter + +from sglang.srt.layers.linear import LinearMethodBase +from sglang.srt.layers.parameter import ChannelQuantScaleParameter, ModelWeightParameter +from sglang.srt.layers.quantization.base_config import ( + QuantizationConfig, + QuantizeMethodBase, +) +from sglang.srt.layers.quantization.int8_kernel import per_token_quant_int8 + + +class W8A8Int8Config(QuantizationConfig): + """Config class for W8A8 Int8 Quantization. 
+ + - Weight: static, per-channel, symmetric + - Activation: dynamic, per-token, symmetric + """ + + def __init__(self): + pass + + @classmethod + def get_supported_act_dtypes(cls) -> List[torch.dtype]: + return [torch.float16, torch.bfloat16] + + @classmethod + def get_min_capability(cls) -> int: + return 75 + + @classmethod + def get_name(self) -> str: + return "w8a8_int8" + + @classmethod + def get_config_filenames(cls) -> List[str]: + return [] + + @classmethod + def from_config(cls, config: Dict[str, Any]) -> "W8A8Int8Config": + return cls() + + def get_quant_method( + self, + layer: torch.nn.Module, + prefix: str, + ) -> Optional["QuantizeMethodBase"]: + from vllm.model_executor.layers.linear import LinearBase + + if isinstance(layer, LinearBase): + return W8A8Int8LinearMethod(self) + return None + + def get_scaled_act_names(self) -> List[str]: + return [] + + +class W8A8Int8LinearMethod(LinearMethodBase): + + def __init__(self, quantization_config: W8A8Int8Config): + self.quantization_config = quantization_config + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + layer.weight = Parameter(layer.weight.t(), requires_grad=False) + layer.weight_scale = Parameter(layer.weight_scale.data, requires_grad=False) + + def create_weights( + self, + layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: List[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs + ): + + weight_loader = extra_weight_attrs.get("weight_loader") + self.logical_widths = output_partition_sizes + + weight = ModelWeightParameter( + data=torch.empty( + sum(output_partition_sizes), input_size_per_partition, dtype=torch.int8 + ), + input_dim=1, + output_dim=0, + weight_loader=weight_loader, + ) + layer.register_parameter("weight", weight) + + weight_scale = ChannelQuantScaleParameter( + data=torch.empty((sum(output_partition_sizes), 1), dtype=torch.float32), + output_dim=0, + weight_loader=weight_loader, + ) + layer.register_parameter("weight_scale", weight_scale) + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None, + ): + x_q, x_scale = per_token_quant_int8(x) + + return int8_scaled_mm( + x_q, layer.weight, x_scale, layer.weight_scale, out_dtype=x.dtype, bias=bias + ) diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index 57a82c18a331..e445217b62fd 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -378,6 +378,7 @@ def add_cli_args(parser: argparse.ArgumentParser): "bitsandbytes", "gguf", "modelopt", + "w8a8_int8", ], help="The quantization method.", ) From f5c6c667940b53d9465f53c657508fc0316a5bad Mon Sep 17 00:00:00 2001 From: Yineng Zhang Date: Tue, 14 Jan 2025 19:23:26 +0800 Subject: [PATCH 053/248] feat: support internlm 3 dense (#2888) --- python/sglang/srt/models/llama.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/python/sglang/srt/models/llama.py b/python/sglang/srt/models/llama.py index d606e52f8b8d..4f09fd185b83 100644 --- a/python/sglang/srt/models/llama.py +++ b/python/sglang/srt/models/llama.py @@ -570,4 +570,8 @@ class Phi3ForCausalLM(LlamaForCausalLM): pass -EntryClass = [LlamaForCausalLM, Phi3ForCausalLM] +class InternLM3ForCausalLM(LlamaForCausalLM): + pass + + +EntryClass = [LlamaForCausalLM, Phi3ForCausalLM, InternLM3ForCausalLM] From f005758f2bcf367739a5a71a90b91d18b56aa4cd Mon Sep 17 00:00:00 2001 From: Xiaoyu Zhang <35585791+BBuf@users.noreply.github.com> 
Date: Tue, 14 Jan 2025 19:48:59 +0800 Subject: [PATCH 054/248] introduce CUB in sgl-kernel (#2887) --- .gitmodules | 3 +++ sgl-kernel/3rdparty/cub | 1 + sgl-kernel/CMakeLists.txt | 2 ++ 3 files changed, 6 insertions(+) create mode 160000 sgl-kernel/3rdparty/cub diff --git a/.gitmodules b/.gitmodules index 3a14f6297a3a..c588176e7c07 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,6 @@ [submodule "sgl-kernel/3rdparty/cutlass"] path = sgl-kernel/3rdparty/cutlass url = https://github.com/NVIDIA/cutlass.git +[submodule "sgl-kernel/3rdparty/cub"] + path = sgl-kernel/3rdparty/cub + url = https://github.com/NVIDIA/cub.git diff --git a/sgl-kernel/3rdparty/cub b/sgl-kernel/3rdparty/cub new file mode 160000 index 000000000000..0fc3c3701632 --- /dev/null +++ b/sgl-kernel/3rdparty/cub @@ -0,0 +1 @@ +Subproject commit 0fc3c3701632a4be906765b73be20a9ad0da603d diff --git a/sgl-kernel/CMakeLists.txt b/sgl-kernel/CMakeLists.txt index 15818d289eae..623984f2f3e7 100644 --- a/sgl-kernel/CMakeLists.txt +++ b/sgl-kernel/CMakeLists.txt @@ -9,6 +9,7 @@ set(CMAKE_CUDA_STANDARD 17) set(CMAKE_CUDA_STANDARD_REQUIRED ON) set(CUTLASS_DIR "3rdparty/cutlass") +set(CUB_DIR "3rdparty/cub") # Set CUDA architectures set(CMAKE_CUDA_ARCHITECTURES "75;80;86;89;90") @@ -43,6 +44,7 @@ target_include_directories(_kernels ${TORCH_INCLUDE_DIRS} ${CUTLASS_DIR}/include ${CUTLASS_DIR}/tools/util/include + ${CUB_DIR}/cub ) target_link_libraries(_kernels From 955a2fbf4e2b0140c6954a6344bf129fc07a7d27 Mon Sep 17 00:00:00 2001 From: yych0745 <1398089567@qq.com> Date: Tue, 7 Jan 2025 17:24:45 +0800 Subject: [PATCH 055/248] Add performance and accuracy test code for FP8 GEMM operations --- sgl-kernel/benchmark/bench_fp8_gemm.py | 71 ++++++++++++++++++++++++++ sgl-kernel/tests/test_fp8_gemm.py | 59 +++++++++++++++++++++ 2 files changed, 130 insertions(+) create mode 100644 sgl-kernel/benchmark/bench_fp8_gemm.py create mode 100644 sgl-kernel/tests/test_fp8_gemm.py diff --git a/sgl-kernel/benchmark/bench_fp8_gemm.py b/sgl-kernel/benchmark/bench_fp8_gemm.py new file mode 100644 index 000000000000..d4bc2fdb91a3 --- /dev/null +++ b/sgl-kernel/benchmark/bench_fp8_gemm.py @@ -0,0 +1,71 @@ +import torch +import torch.nn.functional as F +import triton + +from vllm._custom_ops import cutlass_scaled_mm as vllm_scaled_mm +from vllm._custom_ops import scaled_fp8_quant as vllm_scaled_fp8_quant + + +def to_int8(tensor: torch.Tensor) -> torch.Tensor: + return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8) + + +@triton.testing.perf_report( + triton.testing.Benchmark( + x_names=["batch_size"], + x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048], + x_log=False, + line_arg="provider", + line_vals=["vllm-fp8", "torch-fp8"], + line_names=["vllm-fp8", "torch-fp8"], + styles=[("green", "-"), ("blue", "-")], + ylabel="GB/s", + plot_name="int8 scaled matmul", + args={}, + ) +) +def benchmark(batch_size, provider): + M, N, K = batch_size, 8192, 21760 + a = torch.ones((M, K), device="cuda") * 5.0 + b = torch.ones((N, K), device="cuda") * 5.0 + scale_a = torch.randn((M,), device="cuda", dtype=torch.float32) + scale_b = torch.randn((N,), device="cuda", dtype=torch.float32) + a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a) + b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b) + b_fp8 = b_fp8.t() + quantiles = [0.5, 0.2, 0.8] + + if provider == "vllm-fp8": + ms, min_ms, max_ms = triton.testing.do_bench( + lambda: vllm_scaled_mm( + a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, torch.bfloat16 + ), + quantiles=quantiles, + ) + if provider == "torch-fp8": 
+ scale_a_2d = scale_a_fp8.float().unsqueeze(1) # [M, 1] + scale_b_2d = scale_b_fp8.float().unsqueeze(0) # [1, N] + try: + out = torch.empty( + (a_fp8.shape[0], b_fp8.shape[0]), device="cuda", dtype=torch.bfloat16 + ) + ms, min_ms, max_ms = triton.testing.do_bench( + lambda: torch._scaled_mm( + a_fp8, + b_fp8, + out=out, + out_dtype=torch.bfloat16, + scale_a=scale_a_2d, + scale_b=scale_b_2d, + use_fast_accum=True, + ), + quantiles=quantiles, + ) + except RuntimeError as e: + print("Error details:", e) + raise + gbps = lambda ms: (2 * M * N * K + M * N) * a.element_size() * 1e-9 / (ms * 1e-3) + return gbps(ms), gbps(max_ms), gbps(min_ms) + + +benchmark.run(print_data=True, show_plots=True, save_path="bench_int8_res") \ No newline at end of file diff --git a/sgl-kernel/tests/test_fp8_gemm.py b/sgl-kernel/tests/test_fp8_gemm.py new file mode 100644 index 000000000000..a233b3b435ab --- /dev/null +++ b/sgl-kernel/tests/test_fp8_gemm.py @@ -0,0 +1,59 @@ +import unittest + +import torch +from vllm._custom_ops import cutlass_scaled_mm as vllm_scaled_mm +from vllm._custom_ops import scaled_fp8_quant as vllm_scaled_fp8_quant + + +def torch_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias): + o = torch.matmul(a.to(torch.float32), b.to(torch.float32)) + + o = o.to(torch.float32) + temp1 = o * scale_a.view(-1, 1) + temp2 = temp1 * scale_b.view(1, -1) + final = temp2.to(out_dtype) + + return final + + +class TestInt8Gemm(unittest.TestCase): + def _test_accuracy_once(self, M, N, K, with_bias, out_dtype, device): + a = torch.randn((M, K), device=device) * 5 + b = torch.randn((N, K), device=device) * 5 + + scale_a = torch.randn((M,), device="cuda", dtype=torch.float32) + scale_b = torch.randn((N,), device="cuda", dtype=torch.float32) + if with_bias: + bias = torch.ones((N,), device="cuda", dtype=out_dtype) * 10 + else: + bias = None + o1 = torch.empty((a.shape[0], b.shape[1]), device="cuda", dtype=torch.bfloat16) + b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b) + b_fp8 = b_fp8.t() + a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a) + o = torch_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, out_dtype, bias) + o1 = vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, out_dtype, bias) + max_val = max(o.abs().max().item(), o1.abs().max().item()) + rtol = 2e-2 + atol = max_val * rtol + torch.testing.assert_close(o, o1, rtol=rtol, atol=atol) + print(f"M={M}, N={N}, K={K}, with_bias={with_bias}, out_dtype={out_dtype}: OK") + + def test_accuracy(self): + Ms = [1, 128, 512, 1024, 4096] + Ns = [16, 128, 512, 1024, 4096] + Ks = [512, 1024, 4096, 8192, 16384] + bias_opts = [True, False] + out_dtypes = [torch.bfloat16] + for M in Ms: + for N in Ns: + for K in Ks: + for with_bias in bias_opts: + for out_dtype in out_dtypes: + self._test_accuracy_once( + M, N, K, with_bias, out_dtype, "cuda" + ) + + +if __name__ == "__main__": + unittest.main() From 30bdf20c81cdddf9eab4a9daba47742ab1e7fe17 Mon Sep 17 00:00:00 2001 From: HandH1998 <1335248067@qq.com> Date: Wed, 8 Jan 2025 19:25:23 +0800 Subject: [PATCH 056/248] support w8a8 fp8 --- sgl-kernel/CMakeLists.txt | 1 + sgl-kernel/setup.py | 1 + sgl-kernel/src/sgl-kernel/__init__.py | 2 + .../src/sgl-kernel/csrc/fp8_gemm_kernel.cu | 571 ++++++++++++++++++ .../src/sgl-kernel/csrc/sgl_kernel_ops.cu | 6 + sgl-kernel/src/sgl-kernel/csrc/utils.hpp | 5 + sgl-kernel/src/sgl-kernel/ops/__init__.py | 11 + sgl-kernel/tests/test_fp8_gemm.py | 28 +- 8 files changed, 615 insertions(+), 10 deletions(-) create mode 100644 sgl-kernel/src/sgl-kernel/csrc/fp8_gemm_kernel.cu 
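As context for the FP8 GEMM support added below: it appears to follow the same dequantize-and-rescale contract as the `torch_scaled_mm` reference in `test_fp8_gemm.py` above — A and B are quantized to FP8 (e4m3) with per-row/per-column scales, multiplied, and rescaled back. A self-contained sketch of that reference math in plain PyTorch (illustrative only; assumes a recent PyTorch with `torch.float8_e4m3fn`, a CUDA device, and made-up shapes):

```python
import torch

M, N, K = 4, 8, 16
a = torch.randn(M, K, device="cuda")
b = torch.randn(N, K, device="cuda")

# Per-row scales for A and B (B is stored as [N, K] and used transposed).
# 448.0 is the largest finite value representable in float8_e4m3fn.
scale_a = a.abs().amax(dim=1, keepdim=True) / 448.0
scale_b = b.abs().amax(dim=1, keepdim=True) / 448.0

a_fp8 = (a / scale_a).to(torch.float8_e4m3fn)
b_fp8 = (b / scale_b).to(torch.float8_e4m3fn)

# Dequantize-and-matmul reference: C = (A_q * s_a) @ (B_q * s_b)^T, cast to bf16.
ref = ((a_fp8.float() * scale_a) @ (b_fp8.float() * scale_b).t()).to(torch.bfloat16)
print(ref.shape)  # torch.Size([4, 8])
```
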
diff --git a/sgl-kernel/CMakeLists.txt b/sgl-kernel/CMakeLists.txt index 3c267a4de504..c2bfd356c3db 100644 --- a/sgl-kernel/CMakeLists.txt +++ b/sgl-kernel/CMakeLists.txt @@ -32,6 +32,7 @@ add_library(_kernels SHARED src/sgl-kernel/csrc/trt_reduce_kernel.cu src/sgl-kernel/csrc/moe_align_kernel.cu src/sgl-kernel/csrc/int8_gemm_kernel.cu + src/sgl-kernel/csrc/fp8_gemm_kernel.cu src/sgl-kernel/csrc/sgl_kernel_ops.cu ) diff --git a/sgl-kernel/setup.py b/sgl-kernel/setup.py index c93e87f6bad3..3a60f6ba0a6b 100644 --- a/sgl-kernel/setup.py +++ b/sgl-kernel/setup.py @@ -50,6 +50,7 @@ def update_wheel_platform_tag(): "src/sgl-kernel/csrc/trt_reduce_kernel.cu", "src/sgl-kernel/csrc/moe_align_kernel.cu", "src/sgl-kernel/csrc/int8_gemm_kernel.cu", + "src/sgl-kernel/csrc/fp8_gemm_kernel.cu", "src/sgl-kernel/csrc/sgl_kernel_ops.cu", ], include_dirs=include_dirs, diff --git a/sgl-kernel/src/sgl-kernel/__init__.py b/sgl-kernel/src/sgl-kernel/__init__.py index 892808f1ee15..2a4a2bd51771 100644 --- a/sgl-kernel/src/sgl-kernel/__init__.py +++ b/sgl-kernel/src/sgl-kernel/__init__.py @@ -3,6 +3,7 @@ custom_reduce, init_custom_reduce, int8_scaled_mm, + fp8_scaled_mm, moe_align_block_size, ) @@ -12,4 +13,5 @@ "custom_dispose", "custom_reduce", "int8_scaled_mm", + "fp8_scaled_mm", ] diff --git a/sgl-kernel/src/sgl-kernel/csrc/fp8_gemm_kernel.cu b/sgl-kernel/src/sgl-kernel/csrc/fp8_gemm_kernel.cu new file mode 100644 index 000000000000..795328930634 --- /dev/null +++ b/sgl-kernel/src/sgl-kernel/csrc/fp8_gemm_kernel.cu @@ -0,0 +1,571 @@ +/* + * Copyright (c) 2022-2024, Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + * + * Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#ifdef __GNUC__ // Check if the compiler is GCC or Clang +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#endif // __GNUC__ + +#include +#include + +#include "cute/tensor.hpp" +#include "cutlass/conv/convolution.h" +// Order matters here, packed_stride.hpp is missing cute and convolution includes +#include "cutlass/util/packed_stride.hpp" + +#ifdef __GNUC__ // Check if the compiler is GCC or Clang +#pragma GCC diagnostic pop +#endif // __GNUC__ + +// #include "fp8_rowwise_gemm_kernel_template_sm89.h" +// #include "fp8_rowwise_gemm_kernel_template_sm90.h" + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/kernel/default_gemm_universal_with_visitor.h" +#include "cutlass/epilogue/threadblock/fusion/visitors.hpp" + +#include "cutlass/epilogue/collective/default_epilogue.hpp" +#include "cutlass/epilogue/thread/linear_combination.h" +#include "cutlass/gemm/collective/collective_builder.hpp" +#include "cutlass/gemm/dispatch_policy.hpp" + +#include "cutlass/epilogue/thread/activation.h" +#include "cutlass/gemm/kernel/gemm_universal.hpp" + +#include "cutlass/epilogue/collective/collective_builder.hpp" +#include "cutlass/gemm/device/gemm_universal_adapter.h" + + +#include "utils.hpp" +using namespace cute; + +template +struct DeviceGemmFp8RowwiseSm90 +{ + static_assert(std::is_same_v, "ElementType must be FP8(e4m3)"); + + // A matrix configuration + using ElementA = ElementType; // Element type for A matrix operand + using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand + static constexpr int AlignmentA + = 128 / cutlass::sizeof_bits::value; // Memory access granularity/alignment of A + // matrix in units of elements (up to 16 bytes) + + // B matrix configuration + using ElementB = ElementType; // Element type for B matrix operand + using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand + static constexpr int AlignmentB + = 128 / cutlass::sizeof_bits::value; // Memory access granularity/alignment of B + // matrix in units of elements (up to 16 bytes) + + // C/D matrix configuration + using ElementC = void; // Element type for C matrix operands + using LayoutC = cutlass::layout::RowMajor; // Layout type for C matrix operands + static constexpr int AlignmentC + = 128 / cutlass::sizeof_bits::value; // Memory access granularity/alignment of C matrices in + // units of elements (up to 16 bytes) + + // Output matrix configuration + using ElementOutput = OutElementType; // Element type for output matrix operands + using LayoutOutput = cutlass::layout::RowMajor; // Layout type for output matrix operands + static constexpr int AlignmentOutput = 128 / cutlass::sizeof_bits::value; + + // Auxiliary matrix configuration and other fusion types + using ElementBias = float; + + // Multiply-accumulate blocking/pipelining details + using ElementAccumulator = AccumElementType; // Element type for internal accumulation + using ElementCompute = float; // Element type for compute + using ElementComputeEpilogue = float; + using ArchTag = cutlass::arch::Sm90; // Tag indicating the minimum SM that supports the intended feature + using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag + using TileShape = CTAShape; // Threadblock-level tile size + using TileScheduler = TileSchedulerType; + + static constexpr bool PONG = false; + static constexpr bool FAST_ACCUM = true; + static constexpr bool 
USE_BIAS = false; + + using StageCountType = cutlass::gemm::collective::StageCountAuto; // Stage count maximized + // based on the tile size + using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto; // Kernel to launch based on the default + // setting in the Collective Builder + // Implement rowwise scaling epilogue. + using XScale = cutlass::epilogue::fusion::Sm90ColBroadcast<0, TileShape, ElementComputeEpilogue, ElementComputeEpilogue, + cute::Stride, cute::Int<0>, cute::Int<0>>>; + + using WScale = cutlass::epilogue::fusion::Sm90RowBroadcast<0, TileShape, ElementComputeEpilogue, ElementComputeEpilogue, + cute::Stride, cute::Int<1>, cute::Int<0>>>; + + using Bias = cutlass::epilogue::fusion::Sm90RowBroadcast<0, TileShape, ElementBias, ElementBias, + cute::Stride, cute::Int<1>, cute::Int<0>>>; + + using Accum = cutlass::epilogue::fusion::Sm90AccFetch; + + using Compute0 = cutlass::epilogue::fusion::Sm90Compute; + + using EVTCompute0 = cutlass::epilogue::fusion::Sm90EVT; + + using Compute1 = cutlass::epilogue::fusion::Sm90Compute; + + using EVTCompute1 = cutlass::epilogue::fusion::Sm90EVT; + + using ComputeBias = cutlass::epilogue::fusion::Sm90Compute; + + using EVTComputeBias = cutlass::epilogue::fusion::Sm90EVT; + + using EpilogueEVT = EVTCompute1; + + using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder::CollectiveOp; + + using DefaultSchedule = cutlass::gemm::KernelTmaWarpSpecialized; + using PongSchedule = cutlass::gemm::KernelTmaWarpSpecializedPingpong; + using FastDefaultSchedule = cutlass::gemm::KernelTmaWarpSpecializedFP8FastAccum; + using FastPongSchedule = cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum; + + using SlowAccum = DefaultSchedule; + using FastAccum = FastDefaultSchedule; + using MainLoopSchedule = cute::conditional_t; + + using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder( + sizeof(typename CollectiveEpilogue::SharedStorage))>, + MainLoopSchedule>::CollectiveOp; + + using GemmKernel = cutlass::gemm::kernel::GemmUniversal, // Indicates ProblemShape + CollectiveMainloop, CollectiveEpilogue, TileScheduler>; + + using Gemm = cutlass::gemm::device::GemmUniversalAdapter; +}; + +template +// template +struct DeviceGemmFp8RowwiseSm89 +{ + static_assert(std::is_same_v, "ElementType must be FP8(e4m3)"); + + using ElementA = ElementType; + using LayoutA = cutlass::layout::RowMajor; + static constexpr int AlignmentA = 128 / cutlass::sizeof_bits::value; + + using ElementB = ElementType; + using LayoutB = cutlass::layout::ColumnMajor; + static constexpr int AlignmentB = 128 / cutlass::sizeof_bits::value; + + using ElementC = OutElementType; + using LayoutC = cutlass::layout::RowMajor; + static constexpr int AlignmentC = 128 / cutlass::sizeof_bits::value; + + using ElementOutput = OutElementType; + using LayoutOutput = cutlass::layout::RowMajor; + static constexpr int AlignmentOutput = 128 / cutlass::sizeof_bits::value; + + using ElementAccumulator = AccumElementType; + using ElementComputeEpilogue = float; + using ArchTag = cutlass::arch::Sm89; + using OperatorClass = cutlass::arch::OpClassTensorOp; + + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>; + // Number of epilogue stages in EVT + static constexpr int EVTEpilogueStages = 1; + + using OutputTileThreadMap = cutlass::epilogue::threadblock::OutputTileThreadLayout; + + // Definition of EVT + using accSrc = cutlass::epilogue::threadblock::VisitorAccFetch; + + using ComputeBScale = 
cutlass::epilogue::threadblock::VisitorCompute; + using bScaleSrc = cutlass::epilogue::threadblock::VisitorRowBroadcast>; + using EpilogueBScale = cutlass::epilogue::threadblock::Sm80EVT; + + using ComputeAScale = cutlass::epilogue::threadblock::VisitorCompute; + using aScaleSrc = cutlass::epilogue::threadblock::VisitorColBroadcast>; + using EpilogueAScale = cutlass::epilogue::threadblock::Sm80EVT; + + // // With bias + // using biasSrc = cutlass::epilogue::threadblock::VisitorRowBroadcast>; + // using ComputeAScaleWithBias = cutlass::epilogue::threadblock::VisitorCompute; + // using EpilogueAScaleWithBias = cutlass::epilogue::threadblock::Sm80EVT; + + + using dTar = cutlass::epilogue::threadblock::VisitorAuxStore>; + using EpilogueStore = cutlass::epilogue::threadblock::Sm80EVT; + // using EpilogueStore = cutlass::platform::conditional, + // cutlass::epilogue::threadblock::Sm80EVT>::type; + + + using EpilogueOp = EpilogueStore; + + using GemmKernel = typename cutlass::gemm::kernel::DefaultGemmWithVisitor::GemmKernel; + + using Gemm = cutlass::gemm::device::GemmUniversalAdapter; +}; + + +template +typename Gemm::Arguments prepare_sm89_fp8_args(torch::Tensor& out, const torch::Tensor& a, const torch::Tensor& b, const torch::Tensor& scales_a, + const torch::Tensor& scales_b, + const c10::optional& bias) +{ + using ElementT = typename Gemm::ElementA; + using ElementOutput = typename Gemm::ElementD; + using ElementComputeEpilogue = float; + + // int const lda = k; + // int const ldb = k; + // int const ldc = n; + int32_t m = a.size(0); + int32_t n = b.size(1); + int32_t k = a.size(1); + + int64_t lda = a.stride(0); + int64_t ldb = b.stride(1); + int64_t ldc = out.stride(0); + + ElementT const* ptr_a = reinterpret_cast(a.data_ptr()); + ElementT const* ptr_b = reinterpret_cast(b.data_ptr()); + ElementOutput* ptr_d = reinterpret_cast(out.data_ptr()); + ElementComputeEpilogue const* ptr_scales_a = reinterpret_cast(scales_a.data_ptr()); + ElementComputeEpilogue const* ptr_scales_b = reinterpret_cast(scales_b.data_ptr()); + + typename Gemm::Arguments args(cutlass::gemm::GemmUniversalMode::kGemm, // Mode + {m, n, k}, // Problem size + 1, // Split-k factor + {}, // Epilogue args + ptr_a, // a pointer + ptr_b, // b pointer + nullptr, // c pointer (unused) + nullptr, // d pointer (unused) + m * k, // batch stride a (unused) + n * k, // batch stride b (unused) + m * n, // batch stride c (unused) + m * n, // batch stride d (unused) + lda, // stride a + ldb, // stride b + ldc, // stride c (unused) + ldc); // stride d (unused) + + args.epilogue = { + { + { + {}, // Accumulator + {ptr_scales_b, ElementComputeEpilogue(0), + {_0{}, _1{}, _0{}}}, + {} // Multiplies + }, + {ptr_scales_a, ElementComputeEpilogue(0), {_1{}, _0{}, _0{}}}, + {} // Multiplies + }, + {ptr_d, {n, _1{}, _0{}}}}; + return args; +} + +template +void launch_sm89_fp8_scaled_mm(torch::Tensor& out, const torch::Tensor& a, const torch::Tensor& b, const torch::Tensor& scales_a, + const torch::Tensor& scales_b, + const c10::optional& bias) +{ + using ElementInput = cutlass::float_e4m3_t; + using ElementOutput = OutType; + using AccumElementType = float; + + using Gemm = typename DeviceGemmFp8RowwiseSm89::Gemm; + + auto args = prepare_sm89_fp8_args(out, a, b, scales_a, scales_b, bias); + Gemm gemm_op; + // CUTLASS_CHECK(gemm_op.can_implement(args)); + + size_t workspace_size = gemm_op.get_workspace_size(args); + auto const workspace_options = + torch::TensorOptions().dtype(torch::kUInt8).device(a.device()); + auto workspace = 
torch::empty(workspace_size, workspace_options); + + auto stream = at::cuda::getCurrentCUDAStream(a.get_device()); + + auto can_implement = gemm_op.can_implement(args); + TORCH_CHECK(can_implement == cutlass::Status::kSuccess) + + // auto status = gemm_op.run(args, workspace.data_ptr(), stream); + auto status = gemm_op(args, workspace.data_ptr(), stream); + TORCH_CHECK(status == cutlass::Status::kSuccess) + // return typedFp8RowwiseGemmKernelLauncher( + // Gemm{}, args, D, A, B, C_bias, workspace, workspaceBytes, stream, occupancy); +} + + +template +void s89_dispatch_shape(torch::Tensor& out, const torch::Tensor& a, const torch::Tensor& b, const torch::Tensor& scales_a, + const torch::Tensor& scales_b, + const c10::optional& bias) { + uint32_t const m = a.size(0); + uint32_t const mp2 = + std::max(static_cast(32), next_pow_2(m)); // next power of 2 + + uint32_t const n = out.size(1); + uint32_t const np2 = next_pow_2(n); + + if (mp2 <= 16) { + // M in [1, 16] + if (np2 <= 8192) { + return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<16, 64, 64>, 5>(out, a, b, scales_a, scales_b, bias); + } else if (np2 <= 24576) { + return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<16, 64, 64>, 5>(out, a, b, scales_a, scales_b, bias); + } else { + return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<16, 64, 64>, 5>(out, a, b, scales_a, scales_b, bias); + } + } else if (mp2 <= 32) { + // M in (16, 32] + if (np2 <= 8192) { + return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<16, 64, 64>, 5>(out, a, b, scales_a, scales_b, bias); + } else if (np2 <= 16384) { + return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<32, 64, 64>, 4>(out, a, b, scales_a, scales_b, bias); + } else { + return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<16, 64, 64>, 5>(out, a, b, scales_a, scales_b, bias); + } + } else if (mp2 <= 64) { + // M in (32, 64] + if (np2 <= 8192) { + return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<32, 64, 64>, 5>(out, a, b, scales_a, scales_b, bias); + } else if (np2 <= 16384) { + return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<64, 64, 64>, 3>(out, a, b, scales_a, scales_b, bias); + } else { + return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<32, 64, 64>, 5>(out, a, b, scales_a, scales_b, bias); + } + } else if (mp2 <= 128) { + // M in (64, 128] + if (np2 <= 8192) { + return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<64, 64, 64>, 3>(out, a, b, scales_a, scales_b, bias); + } else if (np2 <= 16384) { + return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<64, 64, 64>, 5>(out, a, b, scales_a, scales_b, bias); + } else { + return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<64, 64, 64>, 3>(out, a, b, scales_a, scales_b, bias); + } + } else if (mp2 <= 256) { + // M in (128, 256] + if (np2 <= 4096) { + return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<64, 64, 64>, 3>(out, a, b, scales_a, scales_b, bias); + } else { + return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<64, 64, 64>, 5>(out, a, b, scales_a, scales_b, bias); + } + } else { + // M in (256, inf) + if (np2 <= 4096) { + return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<64, 64, 64>, 5>(out, a, b, scales_a, scales_b, bias); + } else if (np2 <= 8192) { + return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<64, 64, 64>, 3>(out, a, b, scales_a, scales_b, bias); + } else { + return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<64, 64, 64>, 5>(out, a, b, scales_a, scales_b, bias); + } + } +} + +template +typename Gemm::Arguments 
prepare_sm90_fp8_args(torch::Tensor& out, const torch::Tensor& a, const torch::Tensor& b, const torch::Tensor& scales_a, + const torch::Tensor& scales_b, + const c10::optional& bias) +{ + using ElementT = typename Gemm::ElementA; + using ElementOutput = typename Gemm::ElementD; + using ElementComputeEpilogue = float; + using StrideA = typename Gemm::GemmKernel::StrideA; + using StrideB = typename Gemm::GemmKernel::StrideB; + using StrideC = typename Gemm::GemmKernel::StrideC; + using StrideD = typename Gemm::GemmKernel::StrideD; + + int32_t m = a.size(0); + int32_t n = b.size(1); + int32_t k = a.size(1); + ElementT const* ptr_a = reinterpret_cast(a.data_ptr()); + ElementT const* ptr_b = reinterpret_cast(b.data_ptr()); + ElementOutput* ptr_d = reinterpret_cast(out.data_ptr()); + ElementComputeEpilogue const* ptr_scales_a = reinterpret_cast(scales_a.data_ptr()); + ElementComputeEpilogue const* ptr_scales_b = reinterpret_cast(scales_b.data_ptr()); + + // TODO: confirm correctess + StrideA stride_a = cutlass::make_cute_packed_stride(StrideA{}, make_shape(m, k, 1)); + StrideB stride_b = cutlass::make_cute_packed_stride(StrideB{}, make_shape(n, k, 1)); + StrideC stride_c; + StrideD stride_d = cutlass::make_cute_packed_stride(StrideD{}, make_shape(m, n, 1)); + typename Gemm::Arguments args + = {cutlass::gemm::GemmUniversalMode::kGemm, {m, n, k, 1}, {ptr_a, stride_a, ptr_b, stride_b}, + {{}, // epilogue.thread + nullptr, stride_c, ptr_d, stride_d}}; + args.epilogue.thread = { + {ptr_scales_a}, + { + {ptr_scales_b}, {}, // Accumulator + {} // Multiplies + }, + {}, // Multiplies + }; + return args; +} + +template +void launch_sm90_fp8_scaled_mm(torch::Tensor& out, const torch::Tensor& a, const torch::Tensor& b, const torch::Tensor& scales_a, + const torch::Tensor& scales_b, + const c10::optional& bias) +{ + using ElementInput = cutlass::float_e4m3_t; + using ElementOutput = OutType; + using AccumElementType = float; + using MainloopScheduleType = cutlass::gemm::KernelTmaWarpSpecializedFP8FastAccum; + using EpilogueScheduleType = cutlass::epilogue::TmaWarpSpecialized; + using TileSchedulerType = void; + using Gemm = typename DeviceGemmFp8RowwiseSm90::Gemm; + auto args = prepare_sm90_fp8_args(out, a, b, scales_a, scales_b, bias); + + // Launch the CUTLASS GEMM kernel. 
+ Gemm gemm_op; + // CUTLASS_CHECK(gemm_op.can_implement(args)); + + size_t workspace_size = gemm_op.get_workspace_size(args); + auto const workspace_options = + torch::TensorOptions().dtype(torch::kUInt8).device(a.device()); + auto workspace = torch::empty(workspace_size, workspace_options); + + auto stream = at::cuda::getCurrentCUDAStream(a.get_device()); + + auto can_implement = gemm_op.can_implement(args); + TORCH_CHECK(can_implement == cutlass::Status::kSuccess) + + auto status = gemm_op.run(args, workspace.data_ptr(), stream); + TORCH_CHECK(status == cutlass::Status::kSuccess) + // cutlass::Status status = gemm_op.run(args, workspace.data_ptr(), stream); + // CUTLASS_CHECK(status); +// return typedFp8RowwiseGemmKernelLauncher( +// Gemm{}, args, D, A, B, C_bias, workspace, workspaceBytes, stream, occupancy); +// #else // COMPILE_HOPPER_TMA_GEMMS +// throw std::runtime_error( +// "[TensorRT-LLm Error][Fp8RowwiseGemmKernelLauncherSm90] Please recompile with support for hopper by passing " +// "90-real as an arch to build_wheel.py."); +// #endif // COMPILE_HOPPER_TMA_GEMMS +} + +template +void s90_dispatch_shape(torch::Tensor& out, const torch::Tensor& a, const torch::Tensor& b, const torch::Tensor& scales_a, + const torch::Tensor& scales_b, + const c10::optional& bias) { + uint32_t const m = a.size(0); + uint32_t const mp2 = + std::max(static_cast(64), next_pow_2(m)); // next power of 2 + + if (mp2 <= 64) { + // m in [1, 64] + return launch_sm90_fp8_scaled_mm, Shape<_1, _8, _1>>(out, a, b, scales_a, scales_b, bias); + } else if (mp2 <= 128) { + // m in (64, 128] + return launch_sm90_fp8_scaled_mm, Shape<_2, _1, _1>>(out, a, b, scales_a, scales_b, bias); + } else { + // m in (128, inf) + return launch_sm90_fp8_scaled_mm, Shape<_2, _1, _1>>(out, a, b, scales_a, scales_b, bias); + } +} + +torch::Tensor fp8_scaled_mm(const torch::Tensor& mat_a, const torch::Tensor& mat_b, const torch::Tensor& scales_a, + const torch::Tensor& scales_b, const torch::Dtype& out_dtype, + const c10::optional& bias) { + TORCH_CHECK(mat_a.is_cuda(), "mat_a must be a CUDA tensor"); + TORCH_CHECK(mat_b.is_cuda(), "mat_b must be a CUDA tensor"); + TORCH_CHECK(mat_a.dim() == 2, "mat_a must be a 2D tensor"); + TORCH_CHECK(mat_b.dim() == 2, "mat_b must be a 2D tensor"); + TORCH_CHECK(mat_a.stride(1) == 1, "mat_a must be a row major tensor"); + TORCH_CHECK(mat_b.stride(0) == 1, "mat_a must be a column major tensor"); + TORCH_CHECK(mat_a.size(1) == mat_b.size(0), "mat_a and mat_b shapes cannot be multiplied"); + + TORCH_CHECK(mat_a.size(1) % 16 == 0, "mat_a.size(1) must be multiple of 16 for memory alignment"); +// TORCH_CHECK(mat_b.size(0) % 16 == 0, "mat_b.size(0) must be multiple of 16 for memory alignment"); +//TODO: % 8 + TORCH_CHECK(mat_b.size(1) % 16 == 0, "mat_b.size(1) must be multiple of 16 for memory alignment"); // out.stride(0) + TORCH_CHECK(mat_a.scalar_type() == torch::kFloat8_e4m3fn, "mat_a must be Float8_e4m3fn"); + TORCH_CHECK(mat_b.scalar_type() == torch::kFloat8_e4m3fn, "mat_b must be Float8_e4m3fn"); + TORCH_CHECK(out_dtype == torch::kHalf || out_dtype == torch::kBFloat16, "out_dtype must be Half or BFloat16"); + + TORCH_CHECK(scales_a.numel() == mat_a.size(0), "size of scales_a is not matched"); + TORCH_CHECK(scales_b.numel() == mat_b.size(1), "size of scales_b is not matched"); + TORCH_CHECK(scales_a.is_contiguous(), "scales_a must be contiguous"); + TORCH_CHECK(scales_b.is_contiguous(), "scales_b msut be contiguous"); + TORCH_CHECK(scales_a.scalar_type() == torch::kFloat32, "scales_a must be 
Float32"); + TORCH_CHECK(scales_b.scalar_type() == torch::kFloat32, "scales_b must be Float32"); + + if (bias) { + TORCH_CHECK(bias->numel() == mat_b.size(1), "size of bias is not matched"); + TORCH_CHECK(bias->is_contiguous(), "bias must be contiguous"); + TORCH_CHECK(bias->dtype() == out_dtype, "bias dtype must match output dtype"); + } + + torch::Tensor out = torch::empty({mat_a.size(0), mat_b.size(1)}, mat_a.options().dtype(out_dtype)); + + auto sm_version = getSMVersion(); + + if (sm_version >= 90) { + if (out_dtype == torch::kBFloat16) { + s90_dispatch_shape(out, mat_a, mat_b, scales_a, scales_b, bias); + } else { + s90_dispatch_shape(out, mat_a, mat_b, scales_a, scales_b, bias); + } + } else if (sm_version == 89) { + if (out_dtype == torch::kBFloat16) { + s89_dispatch_shape(out, mat_a, mat_b, scales_a, scales_b, bias); + } else { + s89_dispatch_shape(out, mat_a, mat_b, scales_a, scales_b, bias); + } + } else { + TORCH_CHECK_NOT_IMPLEMENTED(false, "No implemented int8_scaled_mm for current compute capability: ", sm_version); + } + + return out; +} diff --git a/sgl-kernel/src/sgl-kernel/csrc/sgl_kernel_ops.cu b/sgl-kernel/src/sgl-kernel/csrc/sgl_kernel_ops.cu index 6ed543e6c542..b12d324cc62b 100644 --- a/sgl-kernel/src/sgl-kernel/csrc/sgl_kernel_ops.cu +++ b/sgl-kernel/src/sgl-kernel/csrc/sgl_kernel_ops.cu @@ -17,6 +17,10 @@ torch::Tensor int8_scaled_mm(const torch::Tensor& mat_a, const torch::Tensor& ma const torch::Tensor& scales_b, const torch::Dtype& out_dtype, const c10::optional& bias); +torch::Tensor fp8_scaled_mm(const torch::Tensor& mat_a, const torch::Tensor& mat_b, const torch::Tensor& scales_a, + const torch::Tensor& scales_b, const torch::Dtype& out_dtype, + const c10::optional& bias); + PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { // trt_reduce m.def("init_custom_ar", &init_custom_ar, "init custom allreduce meta (CUDA)"); @@ -26,4 +30,6 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("moe_align_block_size", &moe_align_block_size, "MOE Align Block Size (CUDA)"); // int8_scaled_mm m.def("int8_scaled_mm", &int8_scaled_mm, "INT8 scaled matmul (CUDA)"); + // fp8_scaled_mm + m.def("fp8_scaled_mm", &fp8_scaled_mm, "FP8 scaled matmul (CUDA)"); } diff --git a/sgl-kernel/src/sgl-kernel/csrc/utils.hpp b/sgl-kernel/src/sgl-kernel/csrc/utils.hpp index 2fed2d60c039..5820b1350ab5 100644 --- a/sgl-kernel/src/sgl-kernel/csrc/utils.hpp +++ b/sgl-kernel/src/sgl-kernel/csrc/utils.hpp @@ -44,3 +44,8 @@ inline int getSMVersion() { CHECK_CUDA_SUCCESS(cudaDeviceGetAttribute(&sm_minor, cudaDevAttrComputeCapabilityMinor, device)); return sm_major * 10 + sm_minor; } + +inline uint32_t next_pow_2(uint32_t const num) { + if (num <= 1) return num; + return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1)); +} diff --git a/sgl-kernel/src/sgl-kernel/ops/__init__.py b/sgl-kernel/src/sgl-kernel/ops/__init__.py index e388ae35653b..f339997b027f 100644 --- a/sgl-kernel/src/sgl-kernel/ops/__init__.py +++ b/sgl-kernel/src/sgl-kernel/ops/__init__.py @@ -2,6 +2,7 @@ from sgl_kernel.ops._kernels import dispose as _dispose from sgl_kernel.ops._kernels import init_custom_ar as _init_custom_ar from sgl_kernel.ops._kernels import int8_scaled_mm as _int8_scaled_mm +from sgl_kernel.ops._kernels import fp8_scaled_mm as _fp8_scaled_mm from sgl_kernel.ops._kernels import moe_align_block_size as _moe_align_block_size @@ -48,3 +49,13 @@ def int8_scaled_mm(mat_a, mat_b, scales_a, scales_b, out_dtype, bias=None): out_dtype, bias, ) + +def fp8_scaled_mm(mat_a, mat_b, scales_a, scales_b, out_dtype, bias=None): + return 
_fp8_scaled_mm( + mat_a, + mat_b, + scales_a, + scales_b, + out_dtype, + bias, + ) diff --git a/sgl-kernel/tests/test_fp8_gemm.py b/sgl-kernel/tests/test_fp8_gemm.py index a233b3b435ab..c303bef1d1ed 100644 --- a/sgl-kernel/tests/test_fp8_gemm.py +++ b/sgl-kernel/tests/test_fp8_gemm.py @@ -3,6 +3,7 @@ import torch from vllm._custom_ops import cutlass_scaled_mm as vllm_scaled_mm from vllm._custom_ops import scaled_fp8_quant as vllm_scaled_fp8_quant +from sgl_kernel import fp8_scaled_mm def torch_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias): @@ -16,15 +17,16 @@ def torch_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias): return final -class TestInt8Gemm(unittest.TestCase): +class TestFp8Gemm(unittest.TestCase): def _test_accuracy_once(self, M, N, K, with_bias, out_dtype, device): - a = torch.randn((M, K), device=device) * 5 - b = torch.randn((N, K), device=device) * 5 + a = torch.randn((M, K), device=device) + b = torch.randn((N, K), device=device) - scale_a = torch.randn((M,), device="cuda", dtype=torch.float32) - scale_b = torch.randn((N,), device="cuda", dtype=torch.float32) + scale_a = torch.randn((M,), device="cuda", dtype=torch.float32) * 0.01 + scale_b = torch.randn((N,), device="cuda", dtype=torch.float32) * 0.01 if with_bias: - bias = torch.ones((N,), device="cuda", dtype=out_dtype) * 10 + # bias = torch.ones((N,), device="cuda", dtype=out_dtype) * 10 + bias = torch.randn((N,), device="cuda", dtype=out_dtype) else: bias = None o1 = torch.empty((a.shape[0], b.shape[1]), device="cuda", dtype=torch.bfloat16) @@ -32,9 +34,10 @@ def _test_accuracy_once(self, M, N, K, with_bias, out_dtype, device): b_fp8 = b_fp8.t() a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a) o = torch_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, out_dtype, bias) - o1 = vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, out_dtype, bias) + # o1 = vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, out_dtype, bias) + o1 = fp8_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, out_dtype, bias) max_val = max(o.abs().max().item(), o1.abs().max().item()) - rtol = 2e-2 + rtol = 4e-3 atol = max_val * rtol torch.testing.assert_close(o, o1, rtol=rtol, atol=atol) print(f"M={M}, N={N}, K={K}, with_bias={with_bias}, out_dtype={out_dtype}: OK") @@ -43,8 +46,13 @@ def test_accuracy(self): Ms = [1, 128, 512, 1024, 4096] Ns = [16, 128, 512, 1024, 4096] Ks = [512, 1024, 4096, 8192, 16384] - bias_opts = [True, False] - out_dtypes = [torch.bfloat16] + # Ms = [128] + # Ns = [512] + # Ks = [4096] + # bias_opts = [True, False] + bias_opts = [False] + out_dtypes = [torch.bfloat16, torch.float16] + # out_dtypes = [torch.float16] for M in Ms: for N in Ns: for K in Ks: From 4cac9fb925f58e3e90fa5c7053ad10d42afa099b Mon Sep 17 00:00:00 2001 From: HandH1998 <1335248067@qq.com> Date: Thu, 9 Jan 2025 17:41:46 +0800 Subject: [PATCH 057/248] support bias --- .../src/sgl-kernel/csrc/fp8_gemm_kernel.cu | 505 +++++++++--------- sgl-kernel/tests/test_fp8_gemm.py | 21 +- 2 files changed, 262 insertions(+), 264 deletions(-) diff --git a/sgl-kernel/src/sgl-kernel/csrc/fp8_gemm_kernel.cu b/sgl-kernel/src/sgl-kernel/csrc/fp8_gemm_kernel.cu index 795328930634..ef88110e9258 100644 --- a/sgl-kernel/src/sgl-kernel/csrc/fp8_gemm_kernel.cu +++ b/sgl-kernel/src/sgl-kernel/csrc/fp8_gemm_kernel.cu @@ -1,32 +1,9 @@ -/* - * Copyright (c) 2022-2024, Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- * - * Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ +// Adapted from https://github.com/NVIDIA/TensorRT-LLM/blob/v0.16.0/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm_template.h +// https://github.com/NVIDIA/TensorRT-LLM/blob/v0.16.0/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm_kernel_template_sm89.h +// https://github.com/NVIDIA/TensorRT-LLM/blob/v0.16.0/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm_kernel_template_sm90.h #pragma once -#ifdef __GNUC__ // Check if the compiler is GCC or Clang -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#endif // __GNUC__ - #include #include @@ -35,13 +12,6 @@ // Order matters here, packed_stride.hpp is missing cute and convolution includes #include "cutlass/util/packed_stride.hpp" -#ifdef __GNUC__ // Check if the compiler is GCC or Clang -#pragma GCC diagnostic pop -#endif // __GNUC__ - -// #include "fp8_rowwise_gemm_kernel_template_sm89.h" -// #include "fp8_rowwise_gemm_kernel_template_sm90.h" - #include "cutlass/cutlass.h" #include "cutlass/gemm/device/gemm.h" #include "cutlass/gemm/device/gemm_universal_adapter.h" @@ -59,127 +29,11 @@ #include "cutlass/epilogue/collective/collective_builder.hpp" #include "cutlass/gemm/device/gemm_universal_adapter.h" - #include "utils.hpp" using namespace cute; -template -struct DeviceGemmFp8RowwiseSm90 -{ - static_assert(std::is_same_v, "ElementType must be FP8(e4m3)"); - - // A matrix configuration - using ElementA = ElementType; // Element type for A matrix operand - using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand - static constexpr int AlignmentA - = 128 / cutlass::sizeof_bits::value; // Memory access granularity/alignment of A - // matrix in units of elements (up to 16 bytes) - - // B matrix configuration - using ElementB = ElementType; // Element type for B matrix operand - using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand - static constexpr int AlignmentB - = 128 / cutlass::sizeof_bits::value; // Memory access granularity/alignment of B - // matrix in units of elements (up to 16 bytes) - - // C/D matrix configuration - using ElementC = void; // Element type for C matrix operands - using LayoutC = cutlass::layout::RowMajor; // Layout type for C matrix operands - static constexpr int AlignmentC - = 128 / cutlass::sizeof_bits::value; // Memory access granularity/alignment of C matrices in - // units of elements (up to 16 bytes) - - // Output matrix configuration - using ElementOutput = OutElementType; // Element type for output matrix operands - using LayoutOutput = cutlass::layout::RowMajor; // Layout type for output matrix operands - static constexpr int AlignmentOutput = 128 / cutlass::sizeof_bits::value; - - // Auxiliary matrix configuration and other fusion types - using ElementBias = float; - - // Multiply-accumulate blocking/pipelining 
details - using ElementAccumulator = AccumElementType; // Element type for internal accumulation - using ElementCompute = float; // Element type for compute - using ElementComputeEpilogue = float; - using ArchTag = cutlass::arch::Sm90; // Tag indicating the minimum SM that supports the intended feature - using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag - using TileShape = CTAShape; // Threadblock-level tile size - using TileScheduler = TileSchedulerType; - - static constexpr bool PONG = false; - static constexpr bool FAST_ACCUM = true; - static constexpr bool USE_BIAS = false; - - using StageCountType = cutlass::gemm::collective::StageCountAuto; // Stage count maximized - // based on the tile size - using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto; // Kernel to launch based on the default - // setting in the Collective Builder - // Implement rowwise scaling epilogue. - using XScale = cutlass::epilogue::fusion::Sm90ColBroadcast<0, TileShape, ElementComputeEpilogue, ElementComputeEpilogue, - cute::Stride, cute::Int<0>, cute::Int<0>>>; - - using WScale = cutlass::epilogue::fusion::Sm90RowBroadcast<0, TileShape, ElementComputeEpilogue, ElementComputeEpilogue, - cute::Stride, cute::Int<1>, cute::Int<0>>>; - - using Bias = cutlass::epilogue::fusion::Sm90RowBroadcast<0, TileShape, ElementBias, ElementBias, - cute::Stride, cute::Int<1>, cute::Int<0>>>; - - using Accum = cutlass::epilogue::fusion::Sm90AccFetch; - - using Compute0 = cutlass::epilogue::fusion::Sm90Compute; - - using EVTCompute0 = cutlass::epilogue::fusion::Sm90EVT; - - using Compute1 = cutlass::epilogue::fusion::Sm90Compute; - - using EVTCompute1 = cutlass::epilogue::fusion::Sm90EVT; - - using ComputeBias = cutlass::epilogue::fusion::Sm90Compute; - - using EVTComputeBias = cutlass::epilogue::fusion::Sm90EVT; - - using EpilogueEVT = EVTCompute1; - - using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder::CollectiveOp; - - using DefaultSchedule = cutlass::gemm::KernelTmaWarpSpecialized; - using PongSchedule = cutlass::gemm::KernelTmaWarpSpecializedPingpong; - using FastDefaultSchedule = cutlass::gemm::KernelTmaWarpSpecializedFP8FastAccum; - using FastPongSchedule = cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum; - - using SlowAccum = DefaultSchedule; - using FastAccum = FastDefaultSchedule; - using MainLoopSchedule = cute::conditional_t; - - using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder( - sizeof(typename CollectiveEpilogue::SharedStorage))>, - MainLoopSchedule>::CollectiveOp; - - using GemmKernel = cutlass::gemm::kernel::GemmUniversal, // Indicates ProblemShape - CollectiveMainloop, CollectiveEpilogue, TileScheduler>; - - using Gemm = cutlass::gemm::device::GemmUniversalAdapter; -}; - template -// template + typename WarpShape, int Stages, bool WithBias> struct DeviceGemmFp8RowwiseSm89 { static_assert(std::is_same_v, "ElementType must be FP8(e4m3)"); @@ -227,20 +81,17 @@ struct DeviceGemmFp8RowwiseSm89 Stride<_1, _0, _0>>; using EpilogueAScale = cutlass::epilogue::threadblock::Sm80EVT; - // // With bias - // using biasSrc = cutlass::epilogue::threadblock::VisitorRowBroadcast>; - // using ComputeAScaleWithBias = cutlass::epilogue::threadblock::VisitorCompute; - // using EpilogueAScaleWithBias = cutlass::epilogue::threadblock::Sm80EVT; - + // With bias + using biasSrc = cutlass::epilogue::threadblock::VisitorRowBroadcast>; + using ComputeAScaleWithBias = cutlass::epilogue::threadblock::VisitorCompute; + using 
EpilogueAScaleWithBias = cutlass::epilogue::threadblock::Sm80EVT; using dTar = cutlass::epilogue::threadblock::VisitorAuxStore>; - using EpilogueStore = cutlass::epilogue::threadblock::Sm80EVT; - // using EpilogueStore = cutlass::platform::conditional, - // cutlass::epilogue::threadblock::Sm80EVT>::type; + using EpilogueStore = typename cutlass::platform::conditional, + cutlass::epilogue::threadblock::Sm80EVT>::type; - using EpilogueOp = EpilogueStore; using GemmKernel = typename cutlass::gemm::kernel::DefaultGemmWithVisitor +template typename Gemm::Arguments prepare_sm89_fp8_args(torch::Tensor& out, const torch::Tensor& a, const torch::Tensor& b, const torch::Tensor& scales_a, const torch::Tensor& scales_b, const c10::optional& bias) @@ -262,9 +113,6 @@ typename Gemm::Arguments prepare_sm89_fp8_args(torch::Tensor& out, const torch:: using ElementOutput = typename Gemm::ElementD; using ElementComputeEpilogue = float; - // int const lda = k; - // int const ldb = k; - // int const ldc = n; int32_t m = a.size(0); int32_t n = b.size(1); int32_t k = a.size(1); @@ -275,16 +123,22 @@ typename Gemm::Arguments prepare_sm89_fp8_args(torch::Tensor& out, const torch:: ElementT const* ptr_a = reinterpret_cast(a.data_ptr()); ElementT const* ptr_b = reinterpret_cast(b.data_ptr()); + ElementOutput const* ptr_bias = nullptr; + if constexpr (WithBias) { + TORCH_CHECK(bias.has_value()) + ptr_bias = reinterpret_cast(bias.value().data_ptr()); + } ElementOutput* ptr_d = reinterpret_cast(out.data_ptr()); ElementComputeEpilogue const* ptr_scales_a = reinterpret_cast(scales_a.data_ptr()); ElementComputeEpilogue const* ptr_scales_b = reinterpret_cast(scales_b.data_ptr()); + typename Gemm::Arguments args(cutlass::gemm::GemmUniversalMode::kGemm, // Mode {m, n, k}, // Problem size 1, // Split-k factor {}, // Epilogue args - ptr_a, // a pointer - ptr_b, // b pointer + ptr_a, // a pointer + ptr_b, // b pointer nullptr, // c pointer (unused) nullptr, // d pointer (unused) m * k, // batch stride a (unused) @@ -295,8 +149,22 @@ typename Gemm::Arguments prepare_sm89_fp8_args(torch::Tensor& out, const torch:: ldb, // stride b ldc, // stride c (unused) ldc); // stride d (unused) - - args.epilogue = { + if constexpr (WithBias) { + args.epilogue = { + { + { + {}, // Accumulator + {ptr_scales_b, ElementComputeEpilogue(0), + {_0{}, _1{}, _0{}}}, + {} // Multiplies + }, + {ptr_scales_a, ElementComputeEpilogue(0), {_1{}, _0{}, _0{}}}, + {ptr_bias, ElementOutput(0), {_0{}, _1{}, _0{}}}, + {} // Multiplies + }, + {ptr_d, {n, _1{}, _0{}}}}; + } else { + args.epilogue = { { { {}, // Accumulator @@ -308,45 +176,53 @@ typename Gemm::Arguments prepare_sm89_fp8_args(torch::Tensor& out, const torch:: {} // Multiplies }, {ptr_d, {n, _1{}, _0{}}}}; + } + return args; } -template +template void launch_sm89_fp8_scaled_mm(torch::Tensor& out, const torch::Tensor& a, const torch::Tensor& b, const torch::Tensor& scales_a, const torch::Tensor& scales_b, const c10::optional& bias) { - using ElementInput = cutlass::float_e4m3_t; - using ElementOutput = OutType; - using AccumElementType = float; - - using Gemm = typename DeviceGemmFp8RowwiseSm89::Gemm; - - auto args = prepare_sm89_fp8_args(out, a, b, scales_a, scales_b, bias); + auto args = prepare_sm89_fp8_args(out, a, b, scales_a, scales_b, bias); Gemm gemm_op; - // CUTLASS_CHECK(gemm_op.can_implement(args)); size_t workspace_size = gemm_op.get_workspace_size(args); auto const workspace_options = torch::TensorOptions().dtype(torch::kUInt8).device(a.device()); auto workspace = 
torch::empty(workspace_size, workspace_options); - auto stream = at::cuda::getCurrentCUDAStream(a.get_device()); auto can_implement = gemm_op.can_implement(args); TORCH_CHECK(can_implement == cutlass::Status::kSuccess) - // auto status = gemm_op.run(args, workspace.data_ptr(), stream); auto status = gemm_op(args, workspace.data_ptr(), stream); TORCH_CHECK(status == cutlass::Status::kSuccess) - // return typedFp8RowwiseGemmKernelLauncher( - // Gemm{}, args, D, A, B, C_bias, workspace, workspaceBytes, stream, occupancy); +} + +template +void sm89_dispatch_bias(torch::Tensor& out, const torch::Tensor& a, const torch::Tensor& b, const torch::Tensor& scales_a, + const torch::Tensor& scales_b, + const c10::optional& bias) { + using ElementInput = cutlass::float_e4m3_t; + using ElementOutput = OutType; + using AccumElementType = float; + if (bias) { + using Gemm = typename DeviceGemmFp8RowwiseSm89::Gemm; + return launch_sm89_fp8_scaled_mm(out, a, b, scales_a, scales_b, bias); + } else { + using Gemm = typename DeviceGemmFp8RowwiseSm89::Gemm; + return launch_sm89_fp8_scaled_mm(out, a, b, scales_a, scales_b, bias); + } } template -void s89_dispatch_shape(torch::Tensor& out, const torch::Tensor& a, const torch::Tensor& b, const torch::Tensor& scales_a, +void sm89_dispatch_shape(torch::Tensor& out, const torch::Tensor& a, const torch::Tensor& b, const torch::Tensor& scales_a, const torch::Tensor& scales_b, const c10::optional& bias) { uint32_t const m = a.size(0); @@ -359,59 +235,170 @@ void s89_dispatch_shape(torch::Tensor& out, const torch::Tensor& a, const torch: if (mp2 <= 16) { // M in [1, 16] if (np2 <= 8192) { - return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<16, 64, 64>, 5>(out, a, b, scales_a, scales_b, bias); + return sm89_dispatch_bias, cutlass::gemm::GemmShape<16, 64, 64>, 5>(out, a, b, scales_a, scales_b, bias); } else if (np2 <= 24576) { - return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<16, 64, 64>, 5>(out, a, b, scales_a, scales_b, bias); + return sm89_dispatch_bias, cutlass::gemm::GemmShape<16, 64, 64>, 5>(out, a, b, scales_a, scales_b, bias); } else { - return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<16, 64, 64>, 5>(out, a, b, scales_a, scales_b, bias); + return sm89_dispatch_bias, cutlass::gemm::GemmShape<16, 64, 64>, 5>(out, a, b, scales_a, scales_b, bias); } } else if (mp2 <= 32) { // M in (16, 32] if (np2 <= 8192) { - return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<16, 64, 64>, 5>(out, a, b, scales_a, scales_b, bias); + return sm89_dispatch_bias, cutlass::gemm::GemmShape<16, 64, 64>, 5>(out, a, b, scales_a, scales_b, bias); } else if (np2 <= 16384) { - return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<32, 64, 64>, 4>(out, a, b, scales_a, scales_b, bias); + return sm89_dispatch_bias, cutlass::gemm::GemmShape<32, 64, 64>, 4>(out, a, b, scales_a, scales_b, bias); } else { - return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<16, 64, 64>, 5>(out, a, b, scales_a, scales_b, bias); + return sm89_dispatch_bias, cutlass::gemm::GemmShape<16, 64, 64>, 5>(out, a, b, scales_a, scales_b, bias); } } else if (mp2 <= 64) { // M in (32, 64] if (np2 <= 8192) { - return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<32, 64, 64>, 5>(out, a, b, scales_a, scales_b, bias); + return sm89_dispatch_bias, cutlass::gemm::GemmShape<32, 64, 64>, 5>(out, a, b, scales_a, scales_b, bias); } else if (np2 <= 16384) { - return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<64, 64, 64>, 3>(out, a, b, scales_a, scales_b, bias); + return 
sm89_dispatch_bias, cutlass::gemm::GemmShape<64, 64, 64>, 3>(out, a, b, scales_a, scales_b, bias); } else { - return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<32, 64, 64>, 5>(out, a, b, scales_a, scales_b, bias); + return sm89_dispatch_bias, cutlass::gemm::GemmShape<32, 64, 64>, 5>(out, a, b, scales_a, scales_b, bias); } } else if (mp2 <= 128) { // M in (64, 128] if (np2 <= 8192) { - return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<64, 64, 64>, 3>(out, a, b, scales_a, scales_b, bias); + return sm89_dispatch_bias, cutlass::gemm::GemmShape<64, 64, 64>, 3>(out, a, b, scales_a, scales_b, bias); } else if (np2 <= 16384) { - return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<64, 64, 64>, 5>(out, a, b, scales_a, scales_b, bias); + return sm89_dispatch_bias, cutlass::gemm::GemmShape<64, 64, 64>, 5>(out, a, b, scales_a, scales_b, bias); } else { - return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<64, 64, 64>, 3>(out, a, b, scales_a, scales_b, bias); + return sm89_dispatch_bias, cutlass::gemm::GemmShape<64, 64, 64>, 3>(out, a, b, scales_a, scales_b, bias); } } else if (mp2 <= 256) { // M in (128, 256] if (np2 <= 4096) { - return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<64, 64, 64>, 3>(out, a, b, scales_a, scales_b, bias); + return sm89_dispatch_bias, cutlass::gemm::GemmShape<64, 64, 64>, 3>(out, a, b, scales_a, scales_b, bias); } else { - return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<64, 64, 64>, 5>(out, a, b, scales_a, scales_b, bias); + return sm89_dispatch_bias, cutlass::gemm::GemmShape<64, 64, 64>, 5>(out, a, b, scales_a, scales_b, bias); } } else { // M in (256, inf) if (np2 <= 4096) { - return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<64, 64, 64>, 5>(out, a, b, scales_a, scales_b, bias); + return sm89_dispatch_bias, cutlass::gemm::GemmShape<64, 64, 64>, 5>(out, a, b, scales_a, scales_b, bias); } else if (np2 <= 8192) { - return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<64, 64, 64>, 3>(out, a, b, scales_a, scales_b, bias); + return sm89_dispatch_bias, cutlass::gemm::GemmShape<64, 64, 64>, 3>(out, a, b, scales_a, scales_b, bias); } else { - return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<64, 64, 64>, 5>(out, a, b, scales_a, scales_b, bias); + return sm89_dispatch_bias, cutlass::gemm::GemmShape<64, 64, 64>, 5>(out, a, b, scales_a, scales_b, bias); } } } -template +template +struct DeviceGemmFp8RowwiseSm90 +{ + static_assert(std::is_same_v, "ElementType must be FP8(e4m3)"); + + // A matrix configuration + using ElementA = ElementType; // Element type for A matrix operand + using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand + static constexpr int AlignmentA + = 128 / cutlass::sizeof_bits::value; // Memory access granularity/alignment of A + // matrix in units of elements (up to 16 bytes) + + // B matrix configuration + using ElementB = ElementType; // Element type for B matrix operand + using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand + static constexpr int AlignmentB + = 128 / cutlass::sizeof_bits::value; // Memory access granularity/alignment of B + // matrix in units of elements (up to 16 bytes) + + // C/D matrix configuration + using ElementC = void; // Element type for C matrix operands + using LayoutC = cutlass::layout::RowMajor; // Layout type for C matrix operands + static constexpr int AlignmentC + = 128 / cutlass::sizeof_bits::value; // Memory access granularity/alignment of C matrices in + // units of elements (up to 16 bytes) + + // 
Output matrix configuration + using ElementOutput = OutElementType; // Element type for output matrix operands + using LayoutOutput = cutlass::layout::RowMajor; // Layout type for output matrix operands + static constexpr int AlignmentOutput = 128 / cutlass::sizeof_bits::value; + + // // Auxiliary matrix configuration and other fusion types + // using ElementBias = float; + + // Multiply-accumulate blocking/pipelining details + using ElementAccumulator = AccumElementType; // Element type for internal accumulation + using ElementCompute = float; // Element type for compute + using ElementComputeEpilogue = float; + using ArchTag = cutlass::arch::Sm90; // Tag indicating the minimum SM that supports the intended feature + using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag + using TileShape = CTAShape; // Threadblock-level tile size + using TileScheduler = TileSchedulerType; + + static constexpr bool PONG = false; + static constexpr bool FAST_ACCUM = true; + static constexpr bool USE_BIAS = false; + + using StageCountType = cutlass::gemm::collective::StageCountAuto; // Stage count maximized + // based on the tile size + using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto; // Kernel to launch based on the default + // setting in the Collective Builder + // Implement rowwise scaling epilogue. + using XScale = cutlass::epilogue::fusion::Sm90ColBroadcast<0, TileShape, ElementComputeEpilogue, ElementComputeEpilogue, + cute::Stride, cute::Int<0>, cute::Int<0>>>; + + using WScale = cutlass::epilogue::fusion::Sm90RowBroadcast<0, TileShape, ElementComputeEpilogue, ElementComputeEpilogue, + cute::Stride, cute::Int<1>, cute::Int<0>>>; + + using Bias = cutlass::epilogue::fusion::Sm90RowBroadcast<0, TileShape, ElementOutput, ElementOutput, + cute::Stride, cute::Int<1>, cute::Int<0>>>; + + using Accum = cutlass::epilogue::fusion::Sm90AccFetch; + + using Compute0 = cutlass::epilogue::fusion::Sm90Compute; + + using EVTCompute0 = cutlass::epilogue::fusion::Sm90EVT; + + using Compute1 = cutlass::epilogue::fusion::Sm90Compute; + + using EVTCompute1 = cutlass::epilogue::fusion::Sm90EVT; + + // With bias + using ComputeWithBias = cutlass::epilogue::fusion::Sm90Compute; + using EVTComputeWithBias = cutlass::epilogue::fusion::Sm90EVT; + + using EpilogueEVT = typename cutlass::platform::conditional::type; + + using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder::CollectiveOp; + + using DefaultSchedule = cutlass::gemm::KernelTmaWarpSpecialized; + using PongSchedule = cutlass::gemm::KernelTmaWarpSpecializedPingpong; + using FastDefaultSchedule = cutlass::gemm::KernelTmaWarpSpecializedFP8FastAccum; + using FastPongSchedule = cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum; + + using SlowAccum = DefaultSchedule; + using FastAccum = FastDefaultSchedule; + using MainLoopSchedule = cute::conditional_t; + + using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder( + sizeof(typename CollectiveEpilogue::SharedStorage))>, + MainLoopSchedule>::CollectiveOp; + + using GemmKernel = cutlass::gemm::kernel::GemmUniversal, // Indicates ProblemShape + CollectiveMainloop, CollectiveEpilogue, TileScheduler>; + + using Gemm = cutlass::gemm::device::GemmUniversalAdapter; +}; + +template typename Gemm::Arguments prepare_sm90_fp8_args(torch::Tensor& out, const torch::Tensor& a, const torch::Tensor& b, const torch::Tensor& scales_a, const torch::Tensor& scales_b, const c10::optional& bias) @@ -429,6 +416,11 @@ typename Gemm::Arguments 
prepare_sm90_fp8_args(torch::Tensor& out, const torch:: int32_t k = a.size(1); ElementT const* ptr_a = reinterpret_cast(a.data_ptr()); ElementT const* ptr_b = reinterpret_cast(b.data_ptr()); + ElementOutput const* ptr_bias = nullptr; + if constexpr (WithBias) { + TORCH_CHECK(bias.has_value()) + ptr_bias = reinterpret_cast(bias.value().data_ptr()); + } ElementOutput* ptr_d = reinterpret_cast(out.data_ptr()); ElementComputeEpilogue const* ptr_scales_a = reinterpret_cast(scales_a.data_ptr()); ElementComputeEpilogue const* ptr_scales_b = reinterpret_cast(scales_b.data_ptr()); @@ -442,41 +434,42 @@ typename Gemm::Arguments prepare_sm90_fp8_args(torch::Tensor& out, const torch:: = {cutlass::gemm::GemmUniversalMode::kGemm, {m, n, k, 1}, {ptr_a, stride_a, ptr_b, stride_b}, {{}, // epilogue.thread nullptr, stride_c, ptr_d, stride_d}}; - args.epilogue.thread = { - {ptr_scales_a}, - { - {ptr_scales_b}, {}, // Accumulator - {} // Multiplies - }, - {}, // Multiplies - }; + if constexpr (WithBias) { + args.epilogue.thread = { + {ptr_scales_a}, + { + {ptr_scales_b}, {}, // Accumulator + {} // Multiplies + }, + {ptr_bias}, + {}, // Multiplies + }; + } else { + args.epilogue.thread = { + {ptr_scales_a}, + { + {ptr_scales_b}, {}, // Accumulator + {} // Multiplies + }, + {}, // Multiplies + }; + } + return args; } -template +template void launch_sm90_fp8_scaled_mm(torch::Tensor& out, const torch::Tensor& a, const torch::Tensor& b, const torch::Tensor& scales_a, const torch::Tensor& scales_b, const c10::optional& bias) { - using ElementInput = cutlass::float_e4m3_t; - using ElementOutput = OutType; - using AccumElementType = float; - using MainloopScheduleType = cutlass::gemm::KernelTmaWarpSpecializedFP8FastAccum; - using EpilogueScheduleType = cutlass::epilogue::TmaWarpSpecialized; - using TileSchedulerType = void; - using Gemm = typename DeviceGemmFp8RowwiseSm90::Gemm; - auto args = prepare_sm90_fp8_args(out, a, b, scales_a, scales_b, bias); - - // Launch the CUTLASS GEMM kernel. 
+ auto args = prepare_sm90_fp8_args(out, a, b, scales_a, scales_b, bias); Gemm gemm_op; - // CUTLASS_CHECK(gemm_op.can_implement(args)); size_t workspace_size = gemm_op.get_workspace_size(args); auto const workspace_options = torch::TensorOptions().dtype(torch::kUInt8).device(a.device()); auto workspace = torch::empty(workspace_size, workspace_options); - auto stream = at::cuda::getCurrentCUDAStream(a.get_device()); auto can_implement = gemm_op.can_implement(args); @@ -484,19 +477,32 @@ void launch_sm90_fp8_scaled_mm(torch::Tensor& out, const torch::Tensor& a, const auto status = gemm_op.run(args, workspace.data_ptr(), stream); TORCH_CHECK(status == cutlass::Status::kSuccess) - // cutlass::Status status = gemm_op.run(args, workspace.data_ptr(), stream); - // CUTLASS_CHECK(status); -// return typedFp8RowwiseGemmKernelLauncher( -// Gemm{}, args, D, A, B, C_bias, workspace, workspaceBytes, stream, occupancy); -// #else // COMPILE_HOPPER_TMA_GEMMS -// throw std::runtime_error( -// "[TensorRT-LLm Error][Fp8RowwiseGemmKernelLauncherSm90] Please recompile with support for hopper by passing " -// "90-real as an arch to build_wheel.py."); -// #endif // COMPILE_HOPPER_TMA_GEMMS +} + + +template +void sm90_dispatch_bias(torch::Tensor& out, const torch::Tensor& a, const torch::Tensor& b, const torch::Tensor& scales_a, + const torch::Tensor& scales_b, + const c10::optional& bias) { + using ElementInput = cutlass::float_e4m3_t; + using ElementOutput = OutType; + using AccumElementType = float; + using MainloopScheduleType = cutlass::gemm::KernelTmaWarpSpecializedFP8FastAccum; + using EpilogueScheduleType = cutlass::epilogue::TmaWarpSpecialized; + using TileSchedulerType = void; + if (bias) { + using Gemm = typename DeviceGemmFp8RowwiseSm90::Gemm; + return launch_sm90_fp8_scaled_mm(out, a, b, scales_a, scales_b, bias); + } else { + using Gemm = typename DeviceGemmFp8RowwiseSm90::Gemm; + return launch_sm90_fp8_scaled_mm(out, a, b, scales_a, scales_b, bias); + } } template -void s90_dispatch_shape(torch::Tensor& out, const torch::Tensor& a, const torch::Tensor& b, const torch::Tensor& scales_a, +void sm90_dispatch_shape(torch::Tensor& out, const torch::Tensor& a, const torch::Tensor& b, const torch::Tensor& scales_a, const torch::Tensor& scales_b, const c10::optional& bias) { uint32_t const m = a.size(0); @@ -505,13 +511,13 @@ void s90_dispatch_shape(torch::Tensor& out, const torch::Tensor& a, const torch: if (mp2 <= 64) { // m in [1, 64] - return launch_sm90_fp8_scaled_mm, Shape<_1, _8, _1>>(out, a, b, scales_a, scales_b, bias); + return sm90_dispatch_bias, Shape<_1, _8, _1>>(out, a, b, scales_a, scales_b, bias); } else if (mp2 <= 128) { // m in (64, 128] - return launch_sm90_fp8_scaled_mm, Shape<_2, _1, _1>>(out, a, b, scales_a, scales_b, bias); + return sm90_dispatch_bias, Shape<_2, _1, _1>>(out, a, b, scales_a, scales_b, bias); } else { // m in (128, inf) - return launch_sm90_fp8_scaled_mm, Shape<_2, _1, _1>>(out, a, b, scales_a, scales_b, bias); + return sm90_dispatch_bias, Shape<_2, _1, _1>>(out, a, b, scales_a, scales_b, bias); } } @@ -526,10 +532,8 @@ torch::Tensor fp8_scaled_mm(const torch::Tensor& mat_a, const torch::Tensor& mat TORCH_CHECK(mat_b.stride(0) == 1, "mat_a must be a column major tensor"); TORCH_CHECK(mat_a.size(1) == mat_b.size(0), "mat_a and mat_b shapes cannot be multiplied"); - TORCH_CHECK(mat_a.size(1) % 16 == 0, "mat_a.size(1) must be multiple of 16 for memory alignment"); -// TORCH_CHECK(mat_b.size(0) % 16 == 0, "mat_b.size(0) must be multiple of 16 for memory alignment"); 
-//TODO: % 8 - TORCH_CHECK(mat_b.size(1) % 16 == 0, "mat_b.size(1) must be multiple of 16 for memory alignment"); // out.stride(0) + TORCH_CHECK((mat_a.size(1) * mat_a.element_size()) % 16 == 0, "mat_a must be multiple of 16 bytes for memory alignment"); + TORCH_CHECK((mat_b.size(0) * mat_b.element_size()) % 16 == 0, "mat_b must be multiple of 16 bytes for memory alignment"); TORCH_CHECK(mat_a.scalar_type() == torch::kFloat8_e4m3fn, "mat_a must be Float8_e4m3fn"); TORCH_CHECK(mat_b.scalar_type() == torch::kFloat8_e4m3fn, "mat_b must be Float8_e4m3fn"); TORCH_CHECK(out_dtype == torch::kHalf || out_dtype == torch::kBFloat16, "out_dtype must be Half or BFloat16"); @@ -548,21 +552,22 @@ torch::Tensor fp8_scaled_mm(const torch::Tensor& mat_a, const torch::Tensor& mat } torch::Tensor out = torch::empty({mat_a.size(0), mat_b.size(1)}, mat_a.options().dtype(out_dtype)); + TORCH_CHECK((out.size(1) * out.element_size()) % 16 == 0, "out must be multiple of 16 bytes for memory alignment"); auto sm_version = getSMVersion(); if (sm_version >= 90) { - if (out_dtype == torch::kBFloat16) { - s90_dispatch_shape(out, mat_a, mat_b, scales_a, scales_b, bias); - } else { - s90_dispatch_shape(out, mat_a, mat_b, scales_a, scales_b, bias); - } + if (out_dtype == torch::kBFloat16) { + sm90_dispatch_shape(out, mat_a, mat_b, scales_a, scales_b, bias); + } else { + sm90_dispatch_shape(out, mat_a, mat_b, scales_a, scales_b, bias); + } } else if (sm_version == 89) { - if (out_dtype == torch::kBFloat16) { - s89_dispatch_shape(out, mat_a, mat_b, scales_a, scales_b, bias); - } else { - s89_dispatch_shape(out, mat_a, mat_b, scales_a, scales_b, bias); - } + if (out_dtype == torch::kBFloat16) { + sm89_dispatch_shape(out, mat_a, mat_b, scales_a, scales_b, bias); + } else { + sm89_dispatch_shape(out, mat_a, mat_b, scales_a, scales_b, bias); + } } else { TORCH_CHECK_NOT_IMPLEMENTED(false, "No implemented int8_scaled_mm for current compute capability: ", sm_version); } diff --git a/sgl-kernel/tests/test_fp8_gemm.py b/sgl-kernel/tests/test_fp8_gemm.py index c303bef1d1ed..2a474d7ea17e 100644 --- a/sgl-kernel/tests/test_fp8_gemm.py +++ b/sgl-kernel/tests/test_fp8_gemm.py @@ -1,7 +1,6 @@ import unittest import torch -from vllm._custom_ops import cutlass_scaled_mm as vllm_scaled_mm from vllm._custom_ops import scaled_fp8_quant as vllm_scaled_fp8_quant from sgl_kernel import fp8_scaled_mm @@ -13,6 +12,8 @@ def torch_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias): temp1 = o * scale_a.view(-1, 1) temp2 = temp1 * scale_b.view(1, -1) final = temp2.to(out_dtype) + if bias is not None: + final = final + bias.view(1, -1) return final @@ -22,10 +23,9 @@ def _test_accuracy_once(self, M, N, K, with_bias, out_dtype, device): a = torch.randn((M, K), device=device) b = torch.randn((N, K), device=device) - scale_a = torch.randn((M,), device="cuda", dtype=torch.float32) * 0.01 - scale_b = torch.randn((N,), device="cuda", dtype=torch.float32) * 0.01 + scale_a = torch.randn((M,), device="cuda", dtype=torch.float32) * 0.001 + scale_b = torch.randn((N,), device="cuda", dtype=torch.float32) * 0.001 if with_bias: - # bias = torch.ones((N,), device="cuda", dtype=out_dtype) * 10 bias = torch.randn((N,), device="cuda", dtype=out_dtype) else: bias = None @@ -34,11 +34,9 @@ def _test_accuracy_once(self, M, N, K, with_bias, out_dtype, device): b_fp8 = b_fp8.t() a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a) o = torch_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, out_dtype, bias) - # o1 = vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, 
out_dtype, bias) o1 = fp8_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, out_dtype, bias) - max_val = max(o.abs().max().item(), o1.abs().max().item()) - rtol = 4e-3 - atol = max_val * rtol + rtol = 0.01 + atol = 0.1 torch.testing.assert_close(o, o1, rtol=rtol, atol=atol) print(f"M={M}, N={N}, K={K}, with_bias={with_bias}, out_dtype={out_dtype}: OK") @@ -46,13 +44,8 @@ def test_accuracy(self): Ms = [1, 128, 512, 1024, 4096] Ns = [16, 128, 512, 1024, 4096] Ks = [512, 1024, 4096, 8192, 16384] - # Ms = [128] - # Ns = [512] - # Ks = [4096] - # bias_opts = [True, False] - bias_opts = [False] + bias_opts = [True, False] out_dtypes = [torch.bfloat16, torch.float16] - # out_dtypes = [torch.float16] for M in Ms: for N in Ns: for K in Ks: From ecc90a484fb6a150d4a76b760baad4640b2ae064 Mon Sep 17 00:00:00 2001 From: yych0745 <1398089567@qq.com> Date: Fri, 10 Jan 2025 17:22:15 +0800 Subject: [PATCH 058/248] opitmize --- sgl-kernel/benchmark/bench_fp8_gemm.py | 89 +++++++++- .../benchmark/bench_int8_res/results.html | 3 + sgl-kernel/benchmark/best_fp8_configs.json | 42 +++++ sgl-kernel/setup.py | 38 ++++- sgl-kernel/src/sgl-kernel/__init__.py | 2 + .../src/sgl-kernel/csrc/fp8_gemm_kernel.cu | 159 ++++++++++++++++-- .../src/sgl-kernel/csrc/sgl_kernel_ops.cu | 15 +- sgl-kernel/src/sgl-kernel/ops/__init__.py | 22 ++- 8 files changed, 338 insertions(+), 32 deletions(-) create mode 100644 sgl-kernel/benchmark/bench_int8_res/results.html create mode 100644 sgl-kernel/benchmark/best_fp8_configs.json diff --git a/sgl-kernel/benchmark/bench_fp8_gemm.py b/sgl-kernel/benchmark/bench_fp8_gemm.py index d4bc2fdb91a3..65efce4417c0 100644 --- a/sgl-kernel/benchmark/bench_fp8_gemm.py +++ b/sgl-kernel/benchmark/bench_fp8_gemm.py @@ -4,8 +4,9 @@ from vllm._custom_ops import cutlass_scaled_mm as vllm_scaled_mm from vllm._custom_ops import scaled_fp8_quant as vllm_scaled_fp8_quant - - +from sgl_kernel import fp8_scaled_mm as sgl_scaled_mm +from sgl_kernel import fp8_scaled_mm_profile as sgl_scaled_mm_profile +import time def to_int8(tensor: torch.Tensor) -> torch.Tensor: return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8) @@ -16,16 +17,18 @@ def to_int8(tensor: torch.Tensor) -> torch.Tensor: x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048], x_log=False, line_arg="provider", - line_vals=["vllm-fp8", "torch-fp8"], - line_names=["vllm-fp8", "torch-fp8"], - styles=[("green", "-"), ("blue", "-")], + # line_vals=["vllm-fp8", "torch-fp8", "sglang-fp8"], + # line_names=["vllm-fp8", "torch-fp8", "sglang-fp8"], + line_vals=["vllm-fp8", "sglang-fp8", "sglang-fp8-profile"], + line_names=["vllm-fp8", "sglang-fp8", "sglang-fp8-profile"], + styles=[("green", "-"), ("blue", "-"), ("red", "-")], ylabel="GB/s", plot_name="int8 scaled matmul", args={}, ) ) def benchmark(batch_size, provider): - M, N, K = batch_size, 8192, 21760 + M, N, K = batch_size, 4096, 8192 a = torch.ones((M, K), device="cuda") * 5.0 b = torch.ones((N, K), device="cuda") * 5.0 scale_a = torch.randn((M,), device="cuda", dtype=torch.float32) @@ -64,6 +67,80 @@ def benchmark(batch_size, provider): except RuntimeError as e: print("Error details:", e) raise + if provider == "sglang-fp8": + ms, min_ms, max_ms = triton.testing.do_bench( + lambda: sgl_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, torch.bfloat16), + quantiles=quantiles, + ) + if provider == "sglang-fp8-profile": + best_configs = [] + times = [] + valid_configs = [] + best_config_info = {} # 新增:用于存储每个输入规模的最优配置信息 + + try: + sgl_scaled_mm_profile(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, 
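Every provider branch in this benchmark follows the same pattern: wrap one kernel call in a lambda, hand it to triton.testing.do_bench, and convert the returned millisecond quantiles with the gbps lambda at the end of the function. Restated as a self-contained helper; the quantiles list is an assumption (the benchmark defines quantiles earlier in the file), and elem_size=4 stands in for a.element_size() of the float32 input:

import torch
import triton
from sgl_kernel import fp8_scaled_mm as sgl_scaled_mm

def bench_sglang_fp8(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, M, N, K, elem_size=4):
    quantiles = [0.5, 0.2, 0.8]  # assumed median / 20th / 80th percentile
    ms, min_ms, max_ms = triton.testing.do_bench(
        lambda: sgl_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, torch.bfloat16),
        quantiles=quantiles,
    )
    gbps = lambda t: (2 * M * N * K + M * N) * elem_size * 1e-9 / (t * 1e-3)
    return gbps(ms), gbps(max_ms), gbps(min_ms)  # same conversion the benchmark returns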
torch.bfloat16, bias=None, config_id=35) + except RuntimeError as e: + print(f"Skip config_id 35 due to error: {e}") + + for config_id in range(1, 7): + try: + torch.cuda.synchronize() + start = time.time() + sgl_scaled_mm_profile(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, + torch.bfloat16, bias=None, config_id=config_id) + torch.cuda.synchronize() + end = time.time() + times.append(end - start) + valid_configs.append(config_id) + print(f"config_id: {config_id}, time: {end - start}") + except RuntimeError as e: + print(f"Skip config_id {config_id} due to error: {e}") + continue + + if not valid_configs: + print("No valid config found") + return 0, 0, 0 + + min_time = float('inf') + best_config = None + for i, config_id in enumerate(valid_configs): + if times[i] < min_time: + min_time = times[i] + best_config = config_id + + # 记录当前输入规模的最优配置 + best_config_info[f"M{M}_N{N}_K{K}"] = { + "best_config": best_config, + "time": min_time, + "batch_size": batch_size + } + + # 将最优配置信息保存到文件 + import json + config_file = "best_fp8_configs.json" + try: + with open(config_file, "r") as f: + existing_configs = json.load(f) + except FileNotFoundError: + existing_configs = {} + + existing_configs.update(best_config_info) + with open(config_file, "w") as f: + json.dump(existing_configs, f, indent=4) + + print(f"Best config for batch_size={batch_size}: config_id={best_config}, time={min_time:.6f}s") + + # 使用最佳配置进行基准测试 + try: + ms, min_ms, max_ms = triton.testing.do_bench( + lambda: sgl_scaled_mm_profile(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, torch.bfloat16, bias=None, config_id=best_config), + quantiles=quantiles, + ) + except RuntimeError as e: + print("Error details:", e) + print(f"config_id is not valid {best_config}") + ms, min_ms, max_ms = 1, 1, 1 gbps = lambda ms: (2 * M * N * K + M * N) * a.element_size() * 1e-9 / (ms * 1e-3) return gbps(ms), gbps(max_ms), gbps(min_ms) diff --git a/sgl-kernel/benchmark/bench_int8_res/results.html b/sgl-kernel/benchmark/bench_int8_res/results.html new file mode 100644 index 000000000000..f8f21993bfa1 --- /dev/null +++ b/sgl-kernel/benchmark/bench_int8_res/results.html @@ -0,0 +1,3 @@ + + + diff --git a/sgl-kernel/benchmark/best_fp8_configs.json b/sgl-kernel/benchmark/best_fp8_configs.json new file mode 100644 index 000000000000..cff052cfd253 --- /dev/null +++ b/sgl-kernel/benchmark/best_fp8_configs.json @@ -0,0 +1,42 @@ +{ + "M1_N4096_K8192": { + "best_config": 6, + "time": 6.532669067382812e-05, + "batch_size": 1 + }, + "M16_N4096_K8192": { + "best_config": 6, + "time": 6.699562072753906e-05, + "batch_size": 16 + }, + "M64_N4096_K8192": { + "best_config": 6, + "time": 6.67572021484375e-05, + "batch_size": 64 + }, + "M128_N4096_K8192": { + "best_config": 6, + "time": 6.699562072753906e-05, + "batch_size": 128 + }, + "M256_N4096_K8192": { + "best_config": 6, + "time": 6.842613220214844e-05, + "batch_size": 256 + }, + "M512_N4096_K8192": { + "best_config": 6, + "time": 0.00012421607971191406, + "batch_size": 512 + }, + "M1024_N4096_K8192": { + "best_config": 6, + "time": 0.00023627281188964844, + "batch_size": 1024 + }, + "M2048_N4096_K8192": { + "best_config": 6, + "time": 0.00045871734619140625, + "batch_size": 2048 + } +} \ No newline at end of file diff --git a/sgl-kernel/setup.py b/sgl-kernel/setup.py index 3a60f6ba0a6b..aaa0a53dc899 100644 --- a/sgl-kernel/setup.py +++ b/sgl-kernel/setup.py @@ -2,6 +2,9 @@ from setuptools import setup from torch.utils.cpp_extension import BuildExtension, CUDAExtension +import os +import sys +import multiprocessing root = 
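The sglang-fp8-profile sweep above times each configuration with a single synchronized call via time.time() (its Chinese comments mark where the best configuration per input size is recorded and written to best_fp8_configs.json). A single timed call is noisy and includes one-off launch effects, so a warmed-up, averaged measurement usually separates the configurations more reliably. A hedged alternative using CUDA events, with the same sgl_kernel import the benchmark already uses:

import torch
from sgl_kernel import fp8_scaled_mm_profile as sgl_scaled_mm_profile

def time_config(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, config_id, iters=20):
    # one warmup call, then an averaged CUDA-event measurement in milliseconds
    sgl_scaled_mm_profile(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8,
                          torch.bfloat16, bias=None, config_id=config_id)
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    torch.cuda.synchronize()
    start.record()
    for _ in range(iters):
        sgl_scaled_mm_profile(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8,
                              torch.bfloat16, bias=None, config_id=config_id)
    end.record()
    torch.cuda.synchronize()
    return start.elapsed_time(end) / iters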
Path(__file__).parent.resolve() @@ -23,19 +26,32 @@ def update_wheel_platform_tag(): cutlass = root / "3rdparty" / "cutlass" +nlohmann = root / "3rdparty" / "nlohmann" + include_dirs = [ cutlass.resolve() / "include", cutlass.resolve() / "tools" / "util" / "include", root / "src" / "sgl-kernel" / "csrc", + nlohmann.resolve(), ] + +# nvcc_flags = [ +# "-O3", +# "-Xcompiler", +# "-fPIC", +# "-gencode=arch=compute_75,code=sm_75", +# "-gencode=arch=compute_80,code=sm_80", +# "-gencode=arch=compute_89,code=sm_89", +# "-gencode=arch=compute_90,code=sm_90", +# "-U__CUDA_NO_HALF_OPERATORS__", +# "-U__CUDA_NO_HALF2_OPERATORS__", +# ] nvcc_flags = [ "-O3", "-Xcompiler", "-fPIC", - "-gencode=arch=compute_75,code=sm_75", - "-gencode=arch=compute_80,code=sm_80", + # 只保留需要的架构 "-gencode=arch=compute_89,code=sm_89", - "-gencode=arch=compute_90,code=sm_90", "-U__CUDA_NO_HALF_OPERATORS__", "-U__CUDA_NO_HALF2_OPERATORS__", ] @@ -49,7 +65,7 @@ def update_wheel_platform_tag(): "src/sgl-kernel/csrc/trt_reduce_internal.cu", "src/sgl-kernel/csrc/trt_reduce_kernel.cu", "src/sgl-kernel/csrc/moe_align_kernel.cu", - "src/sgl-kernel/csrc/int8_gemm_kernel.cu", + # "src/sgl-kernel/csrc/int8_gemm_kernel.cu", "src/sgl-kernel/csrc/fp8_gemm_kernel.cu", "src/sgl-kernel/csrc/sgl_kernel_ops.cu", ], @@ -63,6 +79,20 @@ def update_wheel_platform_tag(): ), ] +def set_parallel_jobs(): + if sys.platform == 'win32': + num_cores = int(os.environ.get('NUMBER_OF_PROCESSORS', 4)) + else: + num_cores = len(os.sched_getaffinity(0)) if hasattr(os, 'sched_getaffinity') else os.cpu_count() + + # 限制并行度为核心数的1/4或更少 + num_jobs = max(1, num_cores // 2) + os.environ['MAX_JOBS'] = str(num_jobs) + + # 设置CUDA编译的并行任务数 + os.environ['CUDA_NVCC_THREADS'] = str(num_jobs) + return num_jobs +set_parallel_jobs() setup( name="sgl-kernel", version=get_version(), diff --git a/sgl-kernel/src/sgl-kernel/__init__.py b/sgl-kernel/src/sgl-kernel/__init__.py index 2a4a2bd51771..06894c3358ef 100644 --- a/sgl-kernel/src/sgl-kernel/__init__.py +++ b/sgl-kernel/src/sgl-kernel/__init__.py @@ -4,6 +4,7 @@ init_custom_reduce, int8_scaled_mm, fp8_scaled_mm, + fp8_scaled_mm_profile, moe_align_block_size, ) @@ -14,4 +15,5 @@ "custom_reduce", "int8_scaled_mm", "fp8_scaled_mm", + "fp8_scaled_mm_profile", ] diff --git a/sgl-kernel/src/sgl-kernel/csrc/fp8_gemm_kernel.cu b/sgl-kernel/src/sgl-kernel/csrc/fp8_gemm_kernel.cu index ef88110e9258..914d1cb4df83 100644 --- a/sgl-kernel/src/sgl-kernel/csrc/fp8_gemm_kernel.cu +++ b/sgl-kernel/src/sgl-kernel/csrc/fp8_gemm_kernel.cu @@ -33,7 +33,10 @@ using namespace cute; template + typename WarpShape, int Stages, bool WithBias, + typename FP8MathOperator = cutlass::arch::OpMultiplyAdd, + template typename EpilogueVisitor = cutlass::epilogue::threadblock::Sm80EVT, + typename ThreadblockSwizzle = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>> struct DeviceGemmFp8RowwiseSm89 { static_assert(std::is_same_v, "ElementType must be FP8(e4m3)"); @@ -97,8 +100,8 @@ struct DeviceGemmFp8RowwiseSm89 using GemmKernel = typename cutlass::gemm::kernel::DefaultGemmWithVisitor::GemmKernel; + WarpShape, InstructionShape, EpilogueOp, ThreadblockSwizzle, + Stages, FP8MathOperator, EVTEpilogueStages>::GemmKernel; using Gemm = cutlass::gemm::device::GemmUniversalAdapter; }; @@ -509,16 +512,16 @@ void sm90_dispatch_shape(torch::Tensor& out, const torch::Tensor& a, const torch uint32_t const mp2 = std::max(static_cast(64), next_pow_2(m)); // next power of 2 - if (mp2 <= 64) { - // m in [1, 64] - return sm90_dispatch_bias, Shape<_1, _8, _1>>(out, a, b, 
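This commit narrows the compiled -gencode list to sm_89 and caps build parallelism via MAX_JOBS and CUDA_NVCC_THREADS (the Chinese comments in setup.py say, roughly, "keep only the required architecture" and "limit parallelism to a quarter of the cores or fewer", although the code divides the core count by two). Since the FP8 paths also check the SM version at runtime, a small illustrative guard, not part of the patch, can surface a hardware mismatch before a kernel dispatch error does:

import torch

def assert_fp8_capable(profile: bool = False) -> int:
    major, minor = torch.cuda.get_device_capability()
    sm = major * 10 + minor
    if profile and sm != 89:
        raise RuntimeError(f"fp8_scaled_mm_profile only dispatches on SM 8.9 in this patch, got SM {sm}")
    if not profile and sm != 89 and sm < 90:
        raise RuntimeError(f"fp8_scaled_mm requires SM 8.9 or SM 9.0+, got SM {sm}")
    return sm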
scales_a, scales_b, bias); - } else if (mp2 <= 128) { - // m in (64, 128] - return sm90_dispatch_bias, Shape<_2, _1, _1>>(out, a, b, scales_a, scales_b, bias); - } else { - // m in (128, inf) - return sm90_dispatch_bias, Shape<_2, _1, _1>>(out, a, b, scales_a, scales_b, bias); - } + // if (mp2 <= 64) { + // // m in [1, 64] + // return sm90_dispatch_bias, Shape<_1, _8, _1>>(out, a, b, scales_a, scales_b, bias); + // } else if (mp2 <= 128) { + // // m in (64, 128] + // return sm90_dispatch_bias, Shape<_2, _1, _1>>(out, a, b, scales_a, scales_b, bias); + // } else { + // // m in (128, inf) + // return sm90_dispatch_bias, Shape<_2, _1, _1>>(out, a, b, scales_a, scales_b, bias); + // } } torch::Tensor fp8_scaled_mm(const torch::Tensor& mat_a, const torch::Tensor& mat_b, const torch::Tensor& scales_a, @@ -574,3 +577,133 @@ torch::Tensor fp8_scaled_mm(const torch::Tensor& mat_a, const torch::Tensor& mat return out; } + + +#define DISPATCH_FP8_GEMM_CONFIG(TB_M, TB_N, TB_K, WP_M, WP_N, WP_K, STAGES) \ + sm89_dispatch_bias, \ + cutlass::gemm::GemmShape, STAGES>(out, mat_a, mat_b, scales_a, scales_b, bias) +// 定义一个宏来生成一组配置的所有stages +#define DISPATCH_FP8_GEMM_GROUP(GROUP_ID, CTA_M, CTA_N, CTA_K, WARP_M, WARP_N, WARP_K, BASE_CASE) \ + case BASE_CASE: DISPATCH_FP8_GEMM_CONFIG(CTA_M, CTA_N, CTA_K, WARP_M, WARP_N, WARP_K, 2); break; \ + case BASE_CASE + 1: DISPATCH_FP8_GEMM_CONFIG(CTA_M, CTA_N, CTA_K, WARP_M, WARP_N, WARP_K, 3); break; \ + case BASE_CASE + 2: DISPATCH_FP8_GEMM_CONFIG(CTA_M, CTA_N, CTA_K, WARP_M, WARP_N, WARP_K, 4); break; \ + case BASE_CASE + 3: DISPATCH_FP8_GEMM_CONFIG(CTA_M, CTA_N, CTA_K, WARP_M, WARP_N, WARP_K, 5); break; \ + case BASE_CASE + 4: DISPATCH_FP8_GEMM_CONFIG(CTA_M, CTA_N, CTA_K, WARP_M, WARP_N, WARP_K, 6); break; \ + case BASE_CASE + 5: DISPATCH_FP8_GEMM_CONFIG(CTA_M, CTA_N, CTA_K, WARP_M, WARP_N, WARP_K, 7); break; + +template +void sm89_dispatch_shape_profile(torch::Tensor& out, const torch::Tensor& mat_a, const torch::Tensor& mat_b, + const torch::Tensor& scales_a, const torch::Tensor& scales_b, + const c10::optional& bias, + int config_id) { + switch(config_id) { + case 1: + DISPATCH_FP8_GEMM_CONFIG(32, 64, 128, 16, 64, 64, 5); + case 2: + DISPATCH_FP8_GEMM_CONFIG(16, 64, 128, 16, 64, 64, 5); + case 3: + DISPATCH_FP8_GEMM_CONFIG(64, 64, 128, 32, 64, 64, 5); + case 4: + DISPATCH_FP8_GEMM_CONFIG(64, 128, 64, 32, 64, 64, 5); + case 5: + DISPATCH_FP8_GEMM_CONFIG(128, 128, 64, 64, 32, 64, 2); + case 6: + DISPATCH_FP8_GEMM_CONFIG(64, 128, 64, 32, 64, 64, 6); + // // Group 1: CtaShape32x128x64_WarpShape32x32x64 + // DISPATCH_FP8_GEMM_GROUP(1, 32, 128, 64, 32, 32, 64, 1) + + // // Group 2: CtaShape64x128x64_WarpShape32x64x64 + // DISPATCH_FP8_GEMM_GROUP(2, 64, 128, 64, 32, 64, 64, 7) + + // // Group 3: CtaShape64x64x128_WarpShape32x64x64 + // DISPATCH_FP8_GEMM_GROUP(3, 64, 64, 128, 32, 64, 64, 13) + + // // Group 4: CtaShape64x128x64_WarpShape64x32x64 + // DISPATCH_FP8_GEMM_GROUP(4, 64, 128, 64, 64, 32, 64, 19) + + // // Group 5: CtaShape128x64x64_WarpShape64x32x64 + // DISPATCH_FP8_GEMM_GROUP(5, 128, 64, 64, 64, 32, 64, 25) + + // // Group 6: CtaShape128x128x64_WarpShape64x32x64 + // DISPATCH_FP8_GEMM_GROUP(6, 128, 128, 64, 64, 32, 64, 31) + + // // Group 7: CtaShape128x128x64_WarpShape64x64x64 + // DISPATCH_FP8_GEMM_GROUP(7, 128, 128, 64, 64, 64, 64, 37) + + // // Group 8: CtaShape128x128x64_WarpShape128x32x64 + // DISPATCH_FP8_GEMM_GROUP(8, 128, 128, 64, 128, 32, 64, 43) + + // // Group 9: CtaShape128x256x64_WarpShape64x64x64 + // DISPATCH_FP8_GEMM_GROUP(9, 128, 256, 64, 64, 
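Two things stand out in the hunk above. First, the body of sm90_dispatch_shape is commented out in this commit, so only the SM89 code paths are exercised. Second, as transcribed here the config_id cases are not separated by break or return statements, unlike the DISPATCH_FP8_GEMM_GROUP macro (whose Chinese comment says it generates all stage variants for one configuration group), so an early exit per case is presumably intended. The six shapes selected by config_id, restated as data for reference:

# (threadblock shape, warp shape, pipeline stages) per config_id in sm89_dispatch_shape_profile
SM89_FP8_PROFILE_CONFIGS = {
    1: dict(cta=(32, 64, 128),  warp=(16, 64, 64), stages=5),
    2: dict(cta=(16, 64, 128),  warp=(16, 64, 64), stages=5),
    3: dict(cta=(64, 64, 128),  warp=(32, 64, 64), stages=5),
    4: dict(cta=(64, 128, 64),  warp=(32, 64, 64), stages=5),
    5: dict(cta=(128, 128, 64), warp=(64, 32, 64), stages=2),
    6: dict(cta=(64, 128, 64),  warp=(32, 64, 64), stages=6),
}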
64, 64, 49) + + // // Group 10: CtaShape256x128x64_WarpShape64x64x64 + // DISPATCH_FP8_GEMM_GROUP(10, 256, 128, 64, 64, 64, 64, 55) + + // // Group 11: CtaShape128x64x128_WarpShape64x32x128 + // DISPATCH_FP8_GEMM_GROUP(11, 128, 64, 128, 64, 32, 128, 61) + + // // Group 12: CtaShape16x256x128_WarpShape16x64x128 + // DISPATCH_FP8_GEMM_GROUP(12, 16, 256, 128, 16, 64, 128, 67) + + // // Group 13: CtaShape16x64x128_WarpShape16x64x64 + // DISPATCH_FP8_GEMM_GROUP(13, 16, 64, 128, 16, 64, 64, 73) + + // // Group 14: CtaShape16x128x64_WarpShape16x64x64 + // DISPATCH_FP8_GEMM_GROUP(14, 16, 128, 64, 16, 64, 64, 79) + + // // Group 15: CtaShape32x64x128_WarpShape16x64x64 + // DISPATCH_FP8_GEMM_GROUP(15, 32, 64, 128, 16, 64, 64, 85) + } +} +torch::Tensor fp8_scaled_mm_profile(const torch::Tensor& mat_a, const torch::Tensor& mat_b, + const torch::Tensor& scales_a, const torch::Tensor& scales_b, + const torch::Dtype& out_dtype, const c10::optional& bias, + int config_id) { + + // 基本检查 + TORCH_CHECK(mat_a.is_cuda(), "mat_a must be a CUDA tensor"); + TORCH_CHECK(mat_b.is_cuda(), "mat_b must be a CUDA tensor"); + TORCH_CHECK(mat_a.dim() == 2, "mat_a must be a 2D tensor"); + TORCH_CHECK(mat_b.dim() == 2, "mat_b must be a 2D tensor"); + TORCH_CHECK(mat_a.stride(1) == 1, "mat_a must be a row major tensor"); + TORCH_CHECK(mat_b.stride(0) == 1, "mat_a must be a column major tensor"); + TORCH_CHECK(mat_a.size(1) == mat_b.size(0), "mat_a and mat_b shapes cannot be multiplied"); + + TORCH_CHECK((mat_a.size(1) * mat_a.element_size()) % 16 == 0, "mat_a must be multiple of 16 bytes for memory alignment"); + TORCH_CHECK((mat_b.size(0) * mat_b.element_size()) % 16 == 0, "mat_b must be multiple of 16 bytes for memory alignment"); + TORCH_CHECK(mat_a.scalar_type() == torch::kFloat8_e4m3fn, "mat_a must be Float8_e4m3fn"); + TORCH_CHECK(mat_b.scalar_type() == torch::kFloat8_e4m3fn, "mat_b must be Float8_e4m3fn"); + TORCH_CHECK(out_dtype == torch::kHalf || out_dtype == torch::kBFloat16, "out_dtype must be Half or BFloat16"); + + // 检查scales + TORCH_CHECK(scales_a.numel() == mat_a.size(0), "size of scales_a is not matched"); + TORCH_CHECK(scales_b.numel() == mat_b.size(1), "size of scales_b is not matched"); + TORCH_CHECK(scales_a.is_contiguous(), "scales_a must be contiguous"); + TORCH_CHECK(scales_b.is_contiguous(), "scales_b must be contiguous"); + TORCH_CHECK(scales_a.scalar_type() == torch::kFloat32, "scales_a must be Float32"); + TORCH_CHECK(scales_b.scalar_type() == torch::kFloat32, "scales_b must be Float32"); + + // 检查bias + if (bias) { + TORCH_CHECK(bias->numel() == mat_b.size(1), "size of bias is not matched"); + TORCH_CHECK(bias->is_contiguous(), "bias must be contiguous"); + TORCH_CHECK(bias->dtype() == out_dtype, "bias dtype must match output dtype"); + } + + torch::Tensor out = torch::empty({mat_a.size(0), mat_b.size(1)}, mat_a.options().dtype(out_dtype)); + TORCH_CHECK((out.size(1) * out.element_size()) % 16 == 0, "out must be multiple of 16 bytes for memory alignment"); + + auto sm_version = getSMVersion(); + + if (sm_version == 89) { + if (out_dtype == torch::kBFloat16) { + sm89_dispatch_shape_profile(out, mat_a, mat_b, scales_a, scales_b, bias, config_id); + } else { + sm89_dispatch_shape_profile(out, mat_a, mat_b, scales_a, scales_b, bias, config_id); + } + } else { + TORCH_CHECK_NOT_IMPLEMENTED(false, "FP8 operations require SM89 GPU architecture"); + } + + return out; +} \ No newline at end of file diff --git a/sgl-kernel/src/sgl-kernel/csrc/sgl_kernel_ops.cu 
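fp8_scaled_mm_profile takes an explicit config_id, and the benchmark stores its per-shape winners in best_fp8_configs.json under keys of the form "M{m}_N{n}_K{k}". The patch does not include a consumer for that file; a hypothetical one (the file path and fallback config below are assumptions) could look like this:

import json
import torch
from sgl_kernel import fp8_scaled_mm_profile

def lookup_best_config(m, n, k, path="best_fp8_configs.json", default=6):
    # hypothetical helper: map a GEMM shape to the profiled best config_id
    try:
        with open(path) as f:
            table = json.load(f)
    except FileNotFoundError:
        return default
    entry = table.get(f"M{m}_N{n}_K{k}")
    return entry["best_config"] if entry else default

# e.g. cfg = lookup_best_config(a_fp8.size(0), b_fp8.size(1), a_fp8.size(1))
#      out = fp8_scaled_mm_profile(a_fp8, b_fp8, scale_a, scale_b, torch.bfloat16, None, cfg)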
b/sgl-kernel/src/sgl-kernel/csrc/sgl_kernel_ops.cu index b12d324cc62b..4673a13271d0 100644 --- a/sgl-kernel/src/sgl-kernel/csrc/sgl_kernel_ops.cu +++ b/sgl-kernel/src/sgl-kernel/csrc/sgl_kernel_ops.cu @@ -13,14 +13,19 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, int64_t b torch::Tensor token_cnts_buffer, torch::Tensor cumsum_buffer); // int8_scaled_mm -torch::Tensor int8_scaled_mm(const torch::Tensor& mat_a, const torch::Tensor& mat_b, const torch::Tensor& scales_a, - const torch::Tensor& scales_b, const torch::Dtype& out_dtype, - const c10::optional& bias); +// torch::Tensor int8_scaled_mm(const torch::Tensor& mat_a, const torch::Tensor& mat_b, const torch::Tensor& scales_a, +// const torch::Tensor& scales_b, const torch::Dtype& out_dtype, +// const c10::optional& bias); torch::Tensor fp8_scaled_mm(const torch::Tensor& mat_a, const torch::Tensor& mat_b, const torch::Tensor& scales_a, const torch::Tensor& scales_b, const torch::Dtype& out_dtype, const c10::optional& bias); +torch::Tensor fp8_scaled_mm_profile(const torch::Tensor& mat_a, const torch::Tensor& mat_b, + const torch::Tensor& scales_a, const torch::Tensor& scales_b, + const torch::Dtype& out_dtype, const c10::optional& bias, + int config_id); + PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { // trt_reduce m.def("init_custom_ar", &init_custom_ar, "init custom allreduce meta (CUDA)"); @@ -29,7 +34,9 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { // moe_align_block_size m.def("moe_align_block_size", &moe_align_block_size, "MOE Align Block Size (CUDA)"); // int8_scaled_mm - m.def("int8_scaled_mm", &int8_scaled_mm, "INT8 scaled matmul (CUDA)"); + // m.def("int8_scaled_mm", &int8_scaled_mm, "INT8 scaled matmul (CUDA)"); // fp8_scaled_mm m.def("fp8_scaled_mm", &fp8_scaled_mm, "FP8 scaled matmul (CUDA)"); + // fp8_scaled_mm_profile + m.def("fp8_scaled_mm_profile", &fp8_scaled_mm_profile, "FP8 scaled matmul profile (CUDA)"); } diff --git a/sgl-kernel/src/sgl-kernel/ops/__init__.py b/sgl-kernel/src/sgl-kernel/ops/__init__.py index f339997b027f..8b36c1738cde 100644 --- a/sgl-kernel/src/sgl-kernel/ops/__init__.py +++ b/sgl-kernel/src/sgl-kernel/ops/__init__.py @@ -1,10 +1,10 @@ from sgl_kernel.ops._kernels import all_reduce as _all_reduce from sgl_kernel.ops._kernels import dispose as _dispose from sgl_kernel.ops._kernels import init_custom_ar as _init_custom_ar -from sgl_kernel.ops._kernels import int8_scaled_mm as _int8_scaled_mm +# from sgl_kernel.ops._kernels import int8_scaled_mm as _int8_scaled_mm from sgl_kernel.ops._kernels import fp8_scaled_mm as _fp8_scaled_mm from sgl_kernel.ops._kernels import moe_align_block_size as _moe_align_block_size - +from sgl_kernel.ops._kernels import fp8_scaled_mm_profile as _fp8_scaled_mm_profile def init_custom_reduce(rank_id, num_devices, buffers, barrier_in, barrier_out): return _init_custom_ar(rank_id, num_devices, buffers, barrier_in, barrier_out) @@ -41,7 +41,18 @@ def moe_align_block_size( def int8_scaled_mm(mat_a, mat_b, scales_a, scales_b, out_dtype, bias=None): - return _int8_scaled_mm( + return None + # return _int8_scaled_mm( + # mat_a, + # mat_b, + # scales_a, + # scales_b, + # out_dtype, + # bias, + # ) + +def fp8_scaled_mm(mat_a, mat_b, scales_a, scales_b, out_dtype, bias=None): + return _fp8_scaled_mm( mat_a, mat_b, scales_a, @@ -50,12 +61,13 @@ def int8_scaled_mm(mat_a, mat_b, scales_a, scales_b, out_dtype, bias=None): bias, ) -def fp8_scaled_mm(mat_a, mat_b, scales_a, scales_b, out_dtype, bias=None): - return _fp8_scaled_mm( +def fp8_scaled_mm_profile(mat_a, 
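With the binding changes above, the extension's Python surface after this commit exposes fp8_scaled_mm and fp8_scaled_mm_profile, while int8_scaled_mm is kept as a name but stubbed to return None and its kernel source is no longer compiled. A minimal import-level sanity check, illustrative only:

import torch
import sgl_kernel

assert callable(sgl_kernel.fp8_scaled_mm)
assert callable(sgl_kernel.fp8_scaled_mm_profile)
# int8_scaled_mm is a temporary stub in this commit and returns None without touching its arguments
assert sgl_kernel.int8_scaled_mm(None, None, None, None, torch.bfloat16) is None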
mat_b, scales_a, scales_b, out_dtype, bias, config_id): + return _fp8_scaled_mm_profile( mat_a, mat_b, scales_a, scales_b, out_dtype, bias, + config_id, ) From 349795099e106bb9155a62f373f1e7c2bd85bab3 Mon Sep 17 00:00:00 2001 From: yych0745 <1398089567@qq.com> Date: Mon, 13 Jan 2025 19:38:03 +0800 Subject: [PATCH 059/248] add config_profile for sm_89 --- sgl-kernel/3rdparty/nlohmann/json.hpp | 25420 ++++++++++++++++ sgl-kernel/3rdparty/nlohmann/json_fwd.hpp | 187 + sgl-kernel/benchmark/89_fp8_bf16.json | 10 + ...fp8_bf16_256\350\247\243\345\206\263.json" | 10 + ...4096,device=NVIDIA_L40,dtype=bfloat16.json | 11 + ...=4096,device=NVIDIA_L40,dtype=float16.json | 11 + sgl-kernel/benchmark/bench_fp8_gemm.py | 142 +- .../benchmark/bench_fp8_res/results.html | 1 + sgl-kernel/benchmark/best_fp8_configs.json | 42 - sgl-kernel/outp | 0 sgl-kernel/setup.py | 13 +- .../src/sgl-kernel/csrc/fp8_gemm_kernel.cu | 320 +- .../src/sgl-kernel/csrc/sgl_kernel_ops.cu | 7 +- sgl-kernel/src/sgl-kernel/csrc/utils.hpp | 99 +- sgl-kernel/src/sgl-kernel/ops/__init__.py | 8 +- 15 files changed, 26041 insertions(+), 240 deletions(-) create mode 100644 sgl-kernel/3rdparty/nlohmann/json.hpp create mode 100644 sgl-kernel/3rdparty/nlohmann/json_fwd.hpp create mode 100644 sgl-kernel/benchmark/89_fp8_bf16.json create mode 100644 "sgl-kernel/benchmark/89_fp8_bf16_256\350\247\243\345\206\263.json" create mode 100644 sgl-kernel/benchmark/N=8192,K=4096,device=NVIDIA_L40,dtype=bfloat16.json create mode 100644 sgl-kernel/benchmark/N=8192,K=4096,device=NVIDIA_L40,dtype=float16.json create mode 100644 sgl-kernel/benchmark/bench_fp8_res/results.html delete mode 100644 sgl-kernel/benchmark/best_fp8_configs.json create mode 100644 sgl-kernel/outp diff --git a/sgl-kernel/3rdparty/nlohmann/json.hpp b/sgl-kernel/3rdparty/nlohmann/json.hpp new file mode 100644 index 000000000000..9be8b892e3dc --- /dev/null +++ b/sgl-kernel/3rdparty/nlohmann/json.hpp @@ -0,0 +1,25420 @@ +// __ _____ _____ _____ +// __| | __| | | | JSON for Modern C++ +// | | |__ | | | | | | version 3.11.3 +// |_____|_____|_____|_|___| https://github.com/nlohmann/json +// +// SPDX-FileCopyrightText: 2013 - 2024 Niels Lohmann +// SPDX-License-Identifier: MIT + +/****************************************************************************\ + * Note on documentation: The source files contain links to the online * + * documentation of the public API at https://json.nlohmann.me. This URL * + * contains the most recent documentation and should also be applicable to * + * previous versions; documentation for deprecated functions is not * + * removed, but marked deprecated. See "Generate documentation" section in * + * file docs/README.md. 
* +\****************************************************************************/ + +#ifndef INCLUDE_NLOHMANN_JSON_HPP_ +#define INCLUDE_NLOHMANN_JSON_HPP_ + +#include // all_of, find, for_each +#include // nullptr_t, ptrdiff_t, size_t +#include // hash, less +#include // initializer_list +#ifndef JSON_NO_IO + #include // istream, ostream +#endif // JSON_NO_IO +#include // random_access_iterator_tag +#include // unique_ptr +#include // string, stoi, to_string +#include // declval, forward, move, pair, swap +#include // vector + +// #include +// __ _____ _____ _____ +// __| | __| | | | JSON for Modern C++ +// | | |__ | | | | | | version 3.11.3 +// |_____|_____|_____|_|___| https://github.com/nlohmann/json +// +// SPDX-FileCopyrightText: 2013 - 2024 Niels Lohmann +// SPDX-License-Identifier: MIT + + + +#include + +// #include +// __ _____ _____ _____ +// __| | __| | | | JSON for Modern C++ +// | | |__ | | | | | | version 3.11.3 +// |_____|_____|_____|_|___| https://github.com/nlohmann/json +// +// SPDX-FileCopyrightText: 2013 - 2024 Niels Lohmann +// SPDX-License-Identifier: MIT + + + +// This file contains all macro definitions affecting or depending on the ABI + +#ifndef JSON_SKIP_LIBRARY_VERSION_CHECK + #if defined(NLOHMANN_JSON_VERSION_MAJOR) && defined(NLOHMANN_JSON_VERSION_MINOR) && defined(NLOHMANN_JSON_VERSION_PATCH) + #if NLOHMANN_JSON_VERSION_MAJOR != 3 || NLOHMANN_JSON_VERSION_MINOR != 11 || NLOHMANN_JSON_VERSION_PATCH != 3 + #warning "Already included a different version of the library!" + #endif + #endif +#endif + +#define NLOHMANN_JSON_VERSION_MAJOR 3 // NOLINT(modernize-macro-to-enum) +#define NLOHMANN_JSON_VERSION_MINOR 11 // NOLINT(modernize-macro-to-enum) +#define NLOHMANN_JSON_VERSION_PATCH 3 // NOLINT(modernize-macro-to-enum) + +#ifndef JSON_DIAGNOSTICS + #define JSON_DIAGNOSTICS 0 +#endif + +#ifndef JSON_DIAGNOSTIC_POSITIONS + #define JSON_DIAGNOSTIC_POSITIONS 0 +#endif + +#ifndef JSON_USE_LEGACY_DISCARDED_VALUE_COMPARISON + #define JSON_USE_LEGACY_DISCARDED_VALUE_COMPARISON 0 +#endif + +#if JSON_DIAGNOSTICS + #define NLOHMANN_JSON_ABI_TAG_DIAGNOSTICS _diag +#else + #define NLOHMANN_JSON_ABI_TAG_DIAGNOSTICS +#endif + +#if JSON_DIAGNOSTIC_POSITIONS + #define NLOHMANN_JSON_ABI_TAG_DIAGNOSTIC_POSITIONS _dp +#else + #define NLOHMANN_JSON_ABI_TAG_DIAGNOSTIC_POSITIONS +#endif + +#if JSON_USE_LEGACY_DISCARDED_VALUE_COMPARISON + #define NLOHMANN_JSON_ABI_TAG_LEGACY_DISCARDED_VALUE_COMPARISON _ldvcmp +#else + #define NLOHMANN_JSON_ABI_TAG_LEGACY_DISCARDED_VALUE_COMPARISON +#endif + +#ifndef NLOHMANN_JSON_NAMESPACE_NO_VERSION + #define NLOHMANN_JSON_NAMESPACE_NO_VERSION 0 +#endif + +// Construct the namespace ABI tags component +#define NLOHMANN_JSON_ABI_TAGS_CONCAT_EX(a, b, c) json_abi ## a ## b ## c +#define NLOHMANN_JSON_ABI_TAGS_CONCAT(a, b, c) \ + NLOHMANN_JSON_ABI_TAGS_CONCAT_EX(a, b, c) + +#define NLOHMANN_JSON_ABI_TAGS \ + NLOHMANN_JSON_ABI_TAGS_CONCAT( \ + NLOHMANN_JSON_ABI_TAG_DIAGNOSTICS, \ + NLOHMANN_JSON_ABI_TAG_LEGACY_DISCARDED_VALUE_COMPARISON, \ + NLOHMANN_JSON_ABI_TAG_DIAGNOSTIC_POSITIONS) + +// Construct the namespace version component +#define NLOHMANN_JSON_NAMESPACE_VERSION_CONCAT_EX(major, minor, patch) \ + _v ## major ## _ ## minor ## _ ## patch +#define NLOHMANN_JSON_NAMESPACE_VERSION_CONCAT(major, minor, patch) \ + NLOHMANN_JSON_NAMESPACE_VERSION_CONCAT_EX(major, minor, patch) + +#if NLOHMANN_JSON_NAMESPACE_NO_VERSION +#define NLOHMANN_JSON_NAMESPACE_VERSION +#else +#define NLOHMANN_JSON_NAMESPACE_VERSION \ + 
NLOHMANN_JSON_NAMESPACE_VERSION_CONCAT(NLOHMANN_JSON_VERSION_MAJOR, \ + NLOHMANN_JSON_VERSION_MINOR, \ + NLOHMANN_JSON_VERSION_PATCH) +#endif + +// Combine namespace components +#define NLOHMANN_JSON_NAMESPACE_CONCAT_EX(a, b) a ## b +#define NLOHMANN_JSON_NAMESPACE_CONCAT(a, b) \ + NLOHMANN_JSON_NAMESPACE_CONCAT_EX(a, b) + +#ifndef NLOHMANN_JSON_NAMESPACE +#define NLOHMANN_JSON_NAMESPACE \ + nlohmann::NLOHMANN_JSON_NAMESPACE_CONCAT( \ + NLOHMANN_JSON_ABI_TAGS, \ + NLOHMANN_JSON_NAMESPACE_VERSION) +#endif + +#ifndef NLOHMANN_JSON_NAMESPACE_BEGIN +#define NLOHMANN_JSON_NAMESPACE_BEGIN \ + namespace nlohmann \ + { \ + inline namespace NLOHMANN_JSON_NAMESPACE_CONCAT( \ + NLOHMANN_JSON_ABI_TAGS, \ + NLOHMANN_JSON_NAMESPACE_VERSION) \ + { +#endif + +#ifndef NLOHMANN_JSON_NAMESPACE_END +#define NLOHMANN_JSON_NAMESPACE_END \ + } /* namespace (inline namespace) NOLINT(readability/namespace) */ \ + } // namespace nlohmann +#endif + +// #include +// __ _____ _____ _____ +// __| | __| | | | JSON for Modern C++ +// | | |__ | | | | | | version 3.11.3 +// |_____|_____|_____|_|___| https://github.com/nlohmann/json +// +// SPDX-FileCopyrightText: 2013 - 2024 Niels Lohmann +// SPDX-License-Identifier: MIT + + + +#include // transform +#include // array +#include // forward_list +#include // inserter, front_inserter, end +#include // map +#ifdef JSON_HAS_CPP_17 + #include // optional +#endif +#include // string +#include // tuple, make_tuple +#include // is_arithmetic, is_same, is_enum, underlying_type, is_convertible +#include // unordered_map +#include // pair, declval +#include // valarray + +// #include +// __ _____ _____ _____ +// __| | __| | | | JSON for Modern C++ +// | | |__ | | | | | | version 3.11.3 +// |_____|_____|_____|_|___| https://github.com/nlohmann/json +// +// SPDX-FileCopyrightText: 2013 - 2024 Niels Lohmann +// SPDX-License-Identifier: MIT + + + +#include // nullptr_t +#include // exception +#if JSON_DIAGNOSTICS + #include // accumulate +#endif +#include // runtime_error +#include // to_string +#include // vector + +// #include +// __ _____ _____ _____ +// __| | __| | | | JSON for Modern C++ +// | | |__ | | | | | | version 3.11.3 +// |_____|_____|_____|_|___| https://github.com/nlohmann/json +// +// SPDX-FileCopyrightText: 2013 - 2024 Niels Lohmann +// SPDX-License-Identifier: MIT + + + +#include // array +#include // size_t +#include // uint8_t +#include // string + +// #include +// __ _____ _____ _____ +// __| | __| | | | JSON for Modern C++ +// | | |__ | | | | | | version 3.11.3 +// |_____|_____|_____|_|___| https://github.com/nlohmann/json +// +// SPDX-FileCopyrightText: 2013 - 2024 Niels Lohmann +// SPDX-License-Identifier: MIT + + + +#include // declval, pair +// #include +// __ _____ _____ _____ +// __| | __| | | | JSON for Modern C++ +// | | |__ | | | | | | version 3.11.3 +// |_____|_____|_____|_|___| https://github.com/nlohmann/json +// +// SPDX-FileCopyrightText: 2013 - 2024 Niels Lohmann +// SPDX-License-Identifier: MIT + + + +#include + +// #include +// __ _____ _____ _____ +// __| | __| | | | JSON for Modern C++ +// | | |__ | | | | | | version 3.11.3 +// |_____|_____|_____|_|___| https://github.com/nlohmann/json +// +// SPDX-FileCopyrightText: 2013 - 2024 Niels Lohmann +// SPDX-License-Identifier: MIT + + + +// #include + + +NLOHMANN_JSON_NAMESPACE_BEGIN +namespace detail +{ + +template struct make_void +{ + using type = void; +}; +template using void_t = typename make_void::type; + +} // namespace detail +NLOHMANN_JSON_NAMESPACE_END + + +NLOHMANN_JSON_NAMESPACE_BEGIN 
+namespace detail +{ + +// https://en.cppreference.com/w/cpp/experimental/is_detected +struct nonesuch +{ + nonesuch() = delete; + ~nonesuch() = delete; + nonesuch(nonesuch const&) = delete; + nonesuch(nonesuch const&&) = delete; + void operator=(nonesuch const&) = delete; + void operator=(nonesuch&&) = delete; +}; + +template class Op, + class... Args> +struct detector +{ + using value_t = std::false_type; + using type = Default; +}; + +template class Op, class... Args> +struct detector>, Op, Args...> +{ + using value_t = std::true_type; + using type = Op; +}; + +template class Op, class... Args> +using is_detected = typename detector::value_t; + +template class Op, class... Args> +struct is_detected_lazy : is_detected { }; + +template class Op, class... Args> +using detected_t = typename detector::type; + +template class Op, class... Args> +using detected_or = detector; + +template class Op, class... Args> +using detected_or_t = typename detected_or::type; + +template class Op, class... Args> +using is_detected_exact = std::is_same>; + +template class Op, class... Args> +using is_detected_convertible = + std::is_convertible, To>; + +} // namespace detail +NLOHMANN_JSON_NAMESPACE_END + +// #include + + +// __ _____ _____ _____ +// __| | __| | | | JSON for Modern C++ +// | | |__ | | | | | | version 3.11.3 +// |_____|_____|_____|_|___| https://github.com/nlohmann/json +// +// SPDX-FileCopyrightText: 2013 - 2024 Niels Lohmann +// SPDX-FileCopyrightText: 2016 - 2021 Evan Nemerson +// SPDX-License-Identifier: MIT + +/* Hedley - https://nemequ.github.io/hedley + * Created by Evan Nemerson + */ + +#if !defined(JSON_HEDLEY_VERSION) || (JSON_HEDLEY_VERSION < 15) +#if defined(JSON_HEDLEY_VERSION) + #undef JSON_HEDLEY_VERSION +#endif +#define JSON_HEDLEY_VERSION 15 + +#if defined(JSON_HEDLEY_STRINGIFY_EX) + #undef JSON_HEDLEY_STRINGIFY_EX +#endif +#define JSON_HEDLEY_STRINGIFY_EX(x) #x + +#if defined(JSON_HEDLEY_STRINGIFY) + #undef JSON_HEDLEY_STRINGIFY +#endif +#define JSON_HEDLEY_STRINGIFY(x) JSON_HEDLEY_STRINGIFY_EX(x) + +#if defined(JSON_HEDLEY_CONCAT_EX) + #undef JSON_HEDLEY_CONCAT_EX +#endif +#define JSON_HEDLEY_CONCAT_EX(a,b) a##b + +#if defined(JSON_HEDLEY_CONCAT) + #undef JSON_HEDLEY_CONCAT +#endif +#define JSON_HEDLEY_CONCAT(a,b) JSON_HEDLEY_CONCAT_EX(a,b) + +#if defined(JSON_HEDLEY_CONCAT3_EX) + #undef JSON_HEDLEY_CONCAT3_EX +#endif +#define JSON_HEDLEY_CONCAT3_EX(a,b,c) a##b##c + +#if defined(JSON_HEDLEY_CONCAT3) + #undef JSON_HEDLEY_CONCAT3 +#endif +#define JSON_HEDLEY_CONCAT3(a,b,c) JSON_HEDLEY_CONCAT3_EX(a,b,c) + +#if defined(JSON_HEDLEY_VERSION_ENCODE) + #undef JSON_HEDLEY_VERSION_ENCODE +#endif +#define JSON_HEDLEY_VERSION_ENCODE(major,minor,revision) (((major) * 1000000) + ((minor) * 1000) + (revision)) + +#if defined(JSON_HEDLEY_VERSION_DECODE_MAJOR) + #undef JSON_HEDLEY_VERSION_DECODE_MAJOR +#endif +#define JSON_HEDLEY_VERSION_DECODE_MAJOR(version) ((version) / 1000000) + +#if defined(JSON_HEDLEY_VERSION_DECODE_MINOR) + #undef JSON_HEDLEY_VERSION_DECODE_MINOR +#endif +#define JSON_HEDLEY_VERSION_DECODE_MINOR(version) (((version) % 1000000) / 1000) + +#if defined(JSON_HEDLEY_VERSION_DECODE_REVISION) + #undef JSON_HEDLEY_VERSION_DECODE_REVISION +#endif +#define JSON_HEDLEY_VERSION_DECODE_REVISION(version) ((version) % 1000) + +#if defined(JSON_HEDLEY_GNUC_VERSION) + #undef JSON_HEDLEY_GNUC_VERSION +#endif +#if defined(__GNUC__) && defined(__GNUC_PATCHLEVEL__) + #define JSON_HEDLEY_GNUC_VERSION JSON_HEDLEY_VERSION_ENCODE(__GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__) +#elif 
defined(__GNUC__) + #define JSON_HEDLEY_GNUC_VERSION JSON_HEDLEY_VERSION_ENCODE(__GNUC__, __GNUC_MINOR__, 0) +#endif + +#if defined(JSON_HEDLEY_GNUC_VERSION_CHECK) + #undef JSON_HEDLEY_GNUC_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_GNUC_VERSION) + #define JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_GNUC_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_MSVC_VERSION) + #undef JSON_HEDLEY_MSVC_VERSION +#endif +#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 140000000) && !defined(__ICL) + #define JSON_HEDLEY_MSVC_VERSION JSON_HEDLEY_VERSION_ENCODE(_MSC_FULL_VER / 10000000, (_MSC_FULL_VER % 10000000) / 100000, (_MSC_FULL_VER % 100000) / 100) +#elif defined(_MSC_FULL_VER) && !defined(__ICL) + #define JSON_HEDLEY_MSVC_VERSION JSON_HEDLEY_VERSION_ENCODE(_MSC_FULL_VER / 1000000, (_MSC_FULL_VER % 1000000) / 10000, (_MSC_FULL_VER % 10000) / 10) +#elif defined(_MSC_VER) && !defined(__ICL) + #define JSON_HEDLEY_MSVC_VERSION JSON_HEDLEY_VERSION_ENCODE(_MSC_VER / 100, _MSC_VER % 100, 0) +#endif + +#if defined(JSON_HEDLEY_MSVC_VERSION_CHECK) + #undef JSON_HEDLEY_MSVC_VERSION_CHECK +#endif +#if !defined(JSON_HEDLEY_MSVC_VERSION) + #define JSON_HEDLEY_MSVC_VERSION_CHECK(major,minor,patch) (0) +#elif defined(_MSC_VER) && (_MSC_VER >= 1400) + #define JSON_HEDLEY_MSVC_VERSION_CHECK(major,minor,patch) (_MSC_FULL_VER >= ((major * 10000000) + (minor * 100000) + (patch))) +#elif defined(_MSC_VER) && (_MSC_VER >= 1200) + #define JSON_HEDLEY_MSVC_VERSION_CHECK(major,minor,patch) (_MSC_FULL_VER >= ((major * 1000000) + (minor * 10000) + (patch))) +#else + #define JSON_HEDLEY_MSVC_VERSION_CHECK(major,minor,patch) (_MSC_VER >= ((major * 100) + (minor))) +#endif + +#if defined(JSON_HEDLEY_INTEL_VERSION) + #undef JSON_HEDLEY_INTEL_VERSION +#endif +#if defined(__INTEL_COMPILER) && defined(__INTEL_COMPILER_UPDATE) && !defined(__ICL) + #define JSON_HEDLEY_INTEL_VERSION JSON_HEDLEY_VERSION_ENCODE(__INTEL_COMPILER / 100, __INTEL_COMPILER % 100, __INTEL_COMPILER_UPDATE) +#elif defined(__INTEL_COMPILER) && !defined(__ICL) + #define JSON_HEDLEY_INTEL_VERSION JSON_HEDLEY_VERSION_ENCODE(__INTEL_COMPILER / 100, __INTEL_COMPILER % 100, 0) +#endif + +#if defined(JSON_HEDLEY_INTEL_VERSION_CHECK) + #undef JSON_HEDLEY_INTEL_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_INTEL_VERSION) + #define JSON_HEDLEY_INTEL_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_INTEL_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_INTEL_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_INTEL_CL_VERSION) + #undef JSON_HEDLEY_INTEL_CL_VERSION +#endif +#if defined(__INTEL_COMPILER) && defined(__INTEL_COMPILER_UPDATE) && defined(__ICL) + #define JSON_HEDLEY_INTEL_CL_VERSION JSON_HEDLEY_VERSION_ENCODE(__INTEL_COMPILER, __INTEL_COMPILER_UPDATE, 0) +#endif + +#if defined(JSON_HEDLEY_INTEL_CL_VERSION_CHECK) + #undef JSON_HEDLEY_INTEL_CL_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_INTEL_CL_VERSION) + #define JSON_HEDLEY_INTEL_CL_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_INTEL_CL_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_INTEL_CL_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_PGI_VERSION) + #undef JSON_HEDLEY_PGI_VERSION +#endif +#if defined(__PGI) && defined(__PGIC__) && defined(__PGIC_MINOR__) && defined(__PGIC_PATCHLEVEL__) + #define JSON_HEDLEY_PGI_VERSION 
JSON_HEDLEY_VERSION_ENCODE(__PGIC__, __PGIC_MINOR__, __PGIC_PATCHLEVEL__) +#endif + +#if defined(JSON_HEDLEY_PGI_VERSION_CHECK) + #undef JSON_HEDLEY_PGI_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_PGI_VERSION) + #define JSON_HEDLEY_PGI_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_PGI_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_PGI_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_SUNPRO_VERSION) + #undef JSON_HEDLEY_SUNPRO_VERSION +#endif +#if defined(__SUNPRO_C) && (__SUNPRO_C > 0x1000) + #define JSON_HEDLEY_SUNPRO_VERSION JSON_HEDLEY_VERSION_ENCODE((((__SUNPRO_C >> 16) & 0xf) * 10) + ((__SUNPRO_C >> 12) & 0xf), (((__SUNPRO_C >> 8) & 0xf) * 10) + ((__SUNPRO_C >> 4) & 0xf), (__SUNPRO_C & 0xf) * 10) +#elif defined(__SUNPRO_C) + #define JSON_HEDLEY_SUNPRO_VERSION JSON_HEDLEY_VERSION_ENCODE((__SUNPRO_C >> 8) & 0xf, (__SUNPRO_C >> 4) & 0xf, (__SUNPRO_C) & 0xf) +#elif defined(__SUNPRO_CC) && (__SUNPRO_CC > 0x1000) + #define JSON_HEDLEY_SUNPRO_VERSION JSON_HEDLEY_VERSION_ENCODE((((__SUNPRO_CC >> 16) & 0xf) * 10) + ((__SUNPRO_CC >> 12) & 0xf), (((__SUNPRO_CC >> 8) & 0xf) * 10) + ((__SUNPRO_CC >> 4) & 0xf), (__SUNPRO_CC & 0xf) * 10) +#elif defined(__SUNPRO_CC) + #define JSON_HEDLEY_SUNPRO_VERSION JSON_HEDLEY_VERSION_ENCODE((__SUNPRO_CC >> 8) & 0xf, (__SUNPRO_CC >> 4) & 0xf, (__SUNPRO_CC) & 0xf) +#endif + +#if defined(JSON_HEDLEY_SUNPRO_VERSION_CHECK) + #undef JSON_HEDLEY_SUNPRO_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_SUNPRO_VERSION) + #define JSON_HEDLEY_SUNPRO_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_SUNPRO_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_SUNPRO_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_EMSCRIPTEN_VERSION) + #undef JSON_HEDLEY_EMSCRIPTEN_VERSION +#endif +#if defined(__EMSCRIPTEN__) + #define JSON_HEDLEY_EMSCRIPTEN_VERSION JSON_HEDLEY_VERSION_ENCODE(__EMSCRIPTEN_major__, __EMSCRIPTEN_minor__, __EMSCRIPTEN_tiny__) +#endif + +#if defined(JSON_HEDLEY_EMSCRIPTEN_VERSION_CHECK) + #undef JSON_HEDLEY_EMSCRIPTEN_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_EMSCRIPTEN_VERSION) + #define JSON_HEDLEY_EMSCRIPTEN_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_EMSCRIPTEN_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_EMSCRIPTEN_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_ARM_VERSION) + #undef JSON_HEDLEY_ARM_VERSION +#endif +#if defined(__CC_ARM) && defined(__ARMCOMPILER_VERSION) + #define JSON_HEDLEY_ARM_VERSION JSON_HEDLEY_VERSION_ENCODE(__ARMCOMPILER_VERSION / 1000000, (__ARMCOMPILER_VERSION % 1000000) / 10000, (__ARMCOMPILER_VERSION % 10000) / 100) +#elif defined(__CC_ARM) && defined(__ARMCC_VERSION) + #define JSON_HEDLEY_ARM_VERSION JSON_HEDLEY_VERSION_ENCODE(__ARMCC_VERSION / 1000000, (__ARMCC_VERSION % 1000000) / 10000, (__ARMCC_VERSION % 10000) / 100) +#endif + +#if defined(JSON_HEDLEY_ARM_VERSION_CHECK) + #undef JSON_HEDLEY_ARM_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_ARM_VERSION) + #define JSON_HEDLEY_ARM_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_ARM_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_ARM_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_IBM_VERSION) + #undef JSON_HEDLEY_IBM_VERSION +#endif +#if defined(__ibmxl__) + #define JSON_HEDLEY_IBM_VERSION JSON_HEDLEY_VERSION_ENCODE(__ibmxl_version__, __ibmxl_release__, __ibmxl_modification__) +#elif defined(__xlC__) && 
defined(__xlC_ver__) + #define JSON_HEDLEY_IBM_VERSION JSON_HEDLEY_VERSION_ENCODE(__xlC__ >> 8, __xlC__ & 0xff, (__xlC_ver__ >> 8) & 0xff) +#elif defined(__xlC__) + #define JSON_HEDLEY_IBM_VERSION JSON_HEDLEY_VERSION_ENCODE(__xlC__ >> 8, __xlC__ & 0xff, 0) +#endif + +#if defined(JSON_HEDLEY_IBM_VERSION_CHECK) + #undef JSON_HEDLEY_IBM_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_IBM_VERSION) + #define JSON_HEDLEY_IBM_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_IBM_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_IBM_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_TI_VERSION) + #undef JSON_HEDLEY_TI_VERSION +#endif +#if \ + defined(__TI_COMPILER_VERSION__) && \ + ( \ + defined(__TMS470__) || defined(__TI_ARM__) || \ + defined(__MSP430__) || \ + defined(__TMS320C2000__) \ + ) +#if (__TI_COMPILER_VERSION__ >= 16000000) + #define JSON_HEDLEY_TI_VERSION JSON_HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) +#endif +#endif + +#if defined(JSON_HEDLEY_TI_VERSION_CHECK) + #undef JSON_HEDLEY_TI_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_TI_VERSION) + #define JSON_HEDLEY_TI_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TI_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_TI_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_TI_CL2000_VERSION) + #undef JSON_HEDLEY_TI_CL2000_VERSION +#endif +#if defined(__TI_COMPILER_VERSION__) && defined(__TMS320C2000__) + #define JSON_HEDLEY_TI_CL2000_VERSION JSON_HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) +#endif + +#if defined(JSON_HEDLEY_TI_CL2000_VERSION_CHECK) + #undef JSON_HEDLEY_TI_CL2000_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_TI_CL2000_VERSION) + #define JSON_HEDLEY_TI_CL2000_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TI_CL2000_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_TI_CL2000_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_TI_CL430_VERSION) + #undef JSON_HEDLEY_TI_CL430_VERSION +#endif +#if defined(__TI_COMPILER_VERSION__) && defined(__MSP430__) + #define JSON_HEDLEY_TI_CL430_VERSION JSON_HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) +#endif + +#if defined(JSON_HEDLEY_TI_CL430_VERSION_CHECK) + #undef JSON_HEDLEY_TI_CL430_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_TI_CL430_VERSION) + #define JSON_HEDLEY_TI_CL430_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TI_CL430_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_TI_CL430_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_TI_ARMCL_VERSION) + #undef JSON_HEDLEY_TI_ARMCL_VERSION +#endif +#if defined(__TI_COMPILER_VERSION__) && (defined(__TMS470__) || defined(__TI_ARM__)) + #define JSON_HEDLEY_TI_ARMCL_VERSION JSON_HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) +#endif + +#if defined(JSON_HEDLEY_TI_ARMCL_VERSION_CHECK) + #undef JSON_HEDLEY_TI_ARMCL_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_TI_ARMCL_VERSION) + #define JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TI_ARMCL_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define 
JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_TI_CL6X_VERSION) + #undef JSON_HEDLEY_TI_CL6X_VERSION +#endif +#if defined(__TI_COMPILER_VERSION__) && defined(__TMS320C6X__) + #define JSON_HEDLEY_TI_CL6X_VERSION JSON_HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) +#endif + +#if defined(JSON_HEDLEY_TI_CL6X_VERSION_CHECK) + #undef JSON_HEDLEY_TI_CL6X_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_TI_CL6X_VERSION) + #define JSON_HEDLEY_TI_CL6X_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TI_CL6X_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_TI_CL6X_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_TI_CL7X_VERSION) + #undef JSON_HEDLEY_TI_CL7X_VERSION +#endif +#if defined(__TI_COMPILER_VERSION__) && defined(__C7000__) + #define JSON_HEDLEY_TI_CL7X_VERSION JSON_HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) +#endif + +#if defined(JSON_HEDLEY_TI_CL7X_VERSION_CHECK) + #undef JSON_HEDLEY_TI_CL7X_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_TI_CL7X_VERSION) + #define JSON_HEDLEY_TI_CL7X_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TI_CL7X_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_TI_CL7X_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_TI_CLPRU_VERSION) + #undef JSON_HEDLEY_TI_CLPRU_VERSION +#endif +#if defined(__TI_COMPILER_VERSION__) && defined(__PRU__) + #define JSON_HEDLEY_TI_CLPRU_VERSION JSON_HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) +#endif + +#if defined(JSON_HEDLEY_TI_CLPRU_VERSION_CHECK) + #undef JSON_HEDLEY_TI_CLPRU_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_TI_CLPRU_VERSION) + #define JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TI_CLPRU_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_CRAY_VERSION) + #undef JSON_HEDLEY_CRAY_VERSION +#endif +#if defined(_CRAYC) + #if defined(_RELEASE_PATCHLEVEL) + #define JSON_HEDLEY_CRAY_VERSION JSON_HEDLEY_VERSION_ENCODE(_RELEASE_MAJOR, _RELEASE_MINOR, _RELEASE_PATCHLEVEL) + #else + #define JSON_HEDLEY_CRAY_VERSION JSON_HEDLEY_VERSION_ENCODE(_RELEASE_MAJOR, _RELEASE_MINOR, 0) + #endif +#endif + +#if defined(JSON_HEDLEY_CRAY_VERSION_CHECK) + #undef JSON_HEDLEY_CRAY_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_CRAY_VERSION) + #define JSON_HEDLEY_CRAY_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_CRAY_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_CRAY_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_IAR_VERSION) + #undef JSON_HEDLEY_IAR_VERSION +#endif +#if defined(__IAR_SYSTEMS_ICC__) + #if __VER__ > 1000 + #define JSON_HEDLEY_IAR_VERSION JSON_HEDLEY_VERSION_ENCODE((__VER__ / 1000000), ((__VER__ / 1000) % 1000), (__VER__ % 1000)) + #else + #define JSON_HEDLEY_IAR_VERSION JSON_HEDLEY_VERSION_ENCODE(__VER__ / 100, __VER__ % 100, 0) + #endif +#endif + +#if defined(JSON_HEDLEY_IAR_VERSION_CHECK) + #undef JSON_HEDLEY_IAR_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_IAR_VERSION) + #define JSON_HEDLEY_IAR_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_IAR_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, 
minor, patch)) +#else + #define JSON_HEDLEY_IAR_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_TINYC_VERSION) + #undef JSON_HEDLEY_TINYC_VERSION +#endif +#if defined(__TINYC__) + #define JSON_HEDLEY_TINYC_VERSION JSON_HEDLEY_VERSION_ENCODE(__TINYC__ / 1000, (__TINYC__ / 100) % 10, __TINYC__ % 100) +#endif + +#if defined(JSON_HEDLEY_TINYC_VERSION_CHECK) + #undef JSON_HEDLEY_TINYC_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_TINYC_VERSION) + #define JSON_HEDLEY_TINYC_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TINYC_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_TINYC_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_DMC_VERSION) + #undef JSON_HEDLEY_DMC_VERSION +#endif +#if defined(__DMC__) + #define JSON_HEDLEY_DMC_VERSION JSON_HEDLEY_VERSION_ENCODE(__DMC__ >> 8, (__DMC__ >> 4) & 0xf, __DMC__ & 0xf) +#endif + +#if defined(JSON_HEDLEY_DMC_VERSION_CHECK) + #undef JSON_HEDLEY_DMC_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_DMC_VERSION) + #define JSON_HEDLEY_DMC_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_DMC_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_DMC_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_COMPCERT_VERSION) + #undef JSON_HEDLEY_COMPCERT_VERSION +#endif +#if defined(__COMPCERT_VERSION__) + #define JSON_HEDLEY_COMPCERT_VERSION JSON_HEDLEY_VERSION_ENCODE(__COMPCERT_VERSION__ / 10000, (__COMPCERT_VERSION__ / 100) % 100, __COMPCERT_VERSION__ % 100) +#endif + +#if defined(JSON_HEDLEY_COMPCERT_VERSION_CHECK) + #undef JSON_HEDLEY_COMPCERT_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_COMPCERT_VERSION) + #define JSON_HEDLEY_COMPCERT_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_COMPCERT_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_COMPCERT_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_PELLES_VERSION) + #undef JSON_HEDLEY_PELLES_VERSION +#endif +#if defined(__POCC__) + #define JSON_HEDLEY_PELLES_VERSION JSON_HEDLEY_VERSION_ENCODE(__POCC__ / 100, __POCC__ % 100, 0) +#endif + +#if defined(JSON_HEDLEY_PELLES_VERSION_CHECK) + #undef JSON_HEDLEY_PELLES_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_PELLES_VERSION) + #define JSON_HEDLEY_PELLES_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_PELLES_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_PELLES_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_MCST_LCC_VERSION) + #undef JSON_HEDLEY_MCST_LCC_VERSION +#endif +#if defined(__LCC__) && defined(__LCC_MINOR__) + #define JSON_HEDLEY_MCST_LCC_VERSION JSON_HEDLEY_VERSION_ENCODE(__LCC__ / 100, __LCC__ % 100, __LCC_MINOR__) +#endif + +#if defined(JSON_HEDLEY_MCST_LCC_VERSION_CHECK) + #undef JSON_HEDLEY_MCST_LCC_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_MCST_LCC_VERSION) + #define JSON_HEDLEY_MCST_LCC_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_MCST_LCC_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_MCST_LCC_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_GCC_VERSION) + #undef JSON_HEDLEY_GCC_VERSION +#endif +#if \ + defined(JSON_HEDLEY_GNUC_VERSION) && \ + !defined(__clang__) && \ + !defined(JSON_HEDLEY_INTEL_VERSION) && \ + !defined(JSON_HEDLEY_PGI_VERSION) && \ + !defined(JSON_HEDLEY_ARM_VERSION) && \ + !defined(JSON_HEDLEY_CRAY_VERSION) && \ + !defined(JSON_HEDLEY_TI_VERSION) && \ + !defined(JSON_HEDLEY_TI_ARMCL_VERSION) && \ + 
!defined(JSON_HEDLEY_TI_CL430_VERSION) && \ + !defined(JSON_HEDLEY_TI_CL2000_VERSION) && \ + !defined(JSON_HEDLEY_TI_CL6X_VERSION) && \ + !defined(JSON_HEDLEY_TI_CL7X_VERSION) && \ + !defined(JSON_HEDLEY_TI_CLPRU_VERSION) && \ + !defined(__COMPCERT__) && \ + !defined(JSON_HEDLEY_MCST_LCC_VERSION) + #define JSON_HEDLEY_GCC_VERSION JSON_HEDLEY_GNUC_VERSION +#endif + +#if defined(JSON_HEDLEY_GCC_VERSION_CHECK) + #undef JSON_HEDLEY_GCC_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_GCC_VERSION) + #define JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_GCC_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_HAS_ATTRIBUTE) + #undef JSON_HEDLEY_HAS_ATTRIBUTE +#endif +#if \ + defined(__has_attribute) && \ + ( \ + (!defined(JSON_HEDLEY_IAR_VERSION) || JSON_HEDLEY_IAR_VERSION_CHECK(8,5,9)) \ + ) +# define JSON_HEDLEY_HAS_ATTRIBUTE(attribute) __has_attribute(attribute) +#else +# define JSON_HEDLEY_HAS_ATTRIBUTE(attribute) (0) +#endif + +#if defined(JSON_HEDLEY_GNUC_HAS_ATTRIBUTE) + #undef JSON_HEDLEY_GNUC_HAS_ATTRIBUTE +#endif +#if defined(__has_attribute) + #define JSON_HEDLEY_GNUC_HAS_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_HAS_ATTRIBUTE(attribute) +#else + #define JSON_HEDLEY_GNUC_HAS_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_GCC_HAS_ATTRIBUTE) + #undef JSON_HEDLEY_GCC_HAS_ATTRIBUTE +#endif +#if defined(__has_attribute) + #define JSON_HEDLEY_GCC_HAS_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_HAS_ATTRIBUTE(attribute) +#else + #define JSON_HEDLEY_GCC_HAS_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_HAS_CPP_ATTRIBUTE) + #undef JSON_HEDLEY_HAS_CPP_ATTRIBUTE +#endif +#if \ + defined(__has_cpp_attribute) && \ + defined(__cplusplus) && \ + (!defined(JSON_HEDLEY_SUNPRO_VERSION) || JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,15,0)) + #define JSON_HEDLEY_HAS_CPP_ATTRIBUTE(attribute) __has_cpp_attribute(attribute) +#else + #define JSON_HEDLEY_HAS_CPP_ATTRIBUTE(attribute) (0) +#endif + +#if defined(JSON_HEDLEY_HAS_CPP_ATTRIBUTE_NS) + #undef JSON_HEDLEY_HAS_CPP_ATTRIBUTE_NS +#endif +#if !defined(__cplusplus) || !defined(__has_cpp_attribute) + #define JSON_HEDLEY_HAS_CPP_ATTRIBUTE_NS(ns,attribute) (0) +#elif \ + !defined(JSON_HEDLEY_PGI_VERSION) && \ + !defined(JSON_HEDLEY_IAR_VERSION) && \ + (!defined(JSON_HEDLEY_SUNPRO_VERSION) || JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,15,0)) && \ + (!defined(JSON_HEDLEY_MSVC_VERSION) || JSON_HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + #define JSON_HEDLEY_HAS_CPP_ATTRIBUTE_NS(ns,attribute) JSON_HEDLEY_HAS_CPP_ATTRIBUTE(ns::attribute) +#else + #define JSON_HEDLEY_HAS_CPP_ATTRIBUTE_NS(ns,attribute) (0) +#endif + +#if defined(JSON_HEDLEY_GNUC_HAS_CPP_ATTRIBUTE) + #undef JSON_HEDLEY_GNUC_HAS_CPP_ATTRIBUTE +#endif +#if defined(__has_cpp_attribute) && defined(__cplusplus) + #define JSON_HEDLEY_GNUC_HAS_CPP_ATTRIBUTE(attribute,major,minor,patch) __has_cpp_attribute(attribute) +#else + #define JSON_HEDLEY_GNUC_HAS_CPP_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_GCC_HAS_CPP_ATTRIBUTE) + #undef JSON_HEDLEY_GCC_HAS_CPP_ATTRIBUTE +#endif +#if defined(__has_cpp_attribute) && defined(__cplusplus) + #define JSON_HEDLEY_GCC_HAS_CPP_ATTRIBUTE(attribute,major,minor,patch) __has_cpp_attribute(attribute) +#else + #define 
JSON_HEDLEY_GCC_HAS_CPP_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_HAS_BUILTIN) + #undef JSON_HEDLEY_HAS_BUILTIN +#endif +#if defined(__has_builtin) + #define JSON_HEDLEY_HAS_BUILTIN(builtin) __has_builtin(builtin) +#else + #define JSON_HEDLEY_HAS_BUILTIN(builtin) (0) +#endif + +#if defined(JSON_HEDLEY_GNUC_HAS_BUILTIN) + #undef JSON_HEDLEY_GNUC_HAS_BUILTIN +#endif +#if defined(__has_builtin) + #define JSON_HEDLEY_GNUC_HAS_BUILTIN(builtin,major,minor,patch) __has_builtin(builtin) +#else + #define JSON_HEDLEY_GNUC_HAS_BUILTIN(builtin,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_GCC_HAS_BUILTIN) + #undef JSON_HEDLEY_GCC_HAS_BUILTIN +#endif +#if defined(__has_builtin) + #define JSON_HEDLEY_GCC_HAS_BUILTIN(builtin,major,minor,patch) __has_builtin(builtin) +#else + #define JSON_HEDLEY_GCC_HAS_BUILTIN(builtin,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_HAS_FEATURE) + #undef JSON_HEDLEY_HAS_FEATURE +#endif +#if defined(__has_feature) + #define JSON_HEDLEY_HAS_FEATURE(feature) __has_feature(feature) +#else + #define JSON_HEDLEY_HAS_FEATURE(feature) (0) +#endif + +#if defined(JSON_HEDLEY_GNUC_HAS_FEATURE) + #undef JSON_HEDLEY_GNUC_HAS_FEATURE +#endif +#if defined(__has_feature) + #define JSON_HEDLEY_GNUC_HAS_FEATURE(feature,major,minor,patch) __has_feature(feature) +#else + #define JSON_HEDLEY_GNUC_HAS_FEATURE(feature,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_GCC_HAS_FEATURE) + #undef JSON_HEDLEY_GCC_HAS_FEATURE +#endif +#if defined(__has_feature) + #define JSON_HEDLEY_GCC_HAS_FEATURE(feature,major,minor,patch) __has_feature(feature) +#else + #define JSON_HEDLEY_GCC_HAS_FEATURE(feature,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_HAS_EXTENSION) + #undef JSON_HEDLEY_HAS_EXTENSION +#endif +#if defined(__has_extension) + #define JSON_HEDLEY_HAS_EXTENSION(extension) __has_extension(extension) +#else + #define JSON_HEDLEY_HAS_EXTENSION(extension) (0) +#endif + +#if defined(JSON_HEDLEY_GNUC_HAS_EXTENSION) + #undef JSON_HEDLEY_GNUC_HAS_EXTENSION +#endif +#if defined(__has_extension) + #define JSON_HEDLEY_GNUC_HAS_EXTENSION(extension,major,minor,patch) __has_extension(extension) +#else + #define JSON_HEDLEY_GNUC_HAS_EXTENSION(extension,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_GCC_HAS_EXTENSION) + #undef JSON_HEDLEY_GCC_HAS_EXTENSION +#endif +#if defined(__has_extension) + #define JSON_HEDLEY_GCC_HAS_EXTENSION(extension,major,minor,patch) __has_extension(extension) +#else + #define JSON_HEDLEY_GCC_HAS_EXTENSION(extension,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_HAS_DECLSPEC_ATTRIBUTE) + #undef JSON_HEDLEY_HAS_DECLSPEC_ATTRIBUTE +#endif +#if defined(__has_declspec_attribute) + #define JSON_HEDLEY_HAS_DECLSPEC_ATTRIBUTE(attribute) __has_declspec_attribute(attribute) +#else + #define JSON_HEDLEY_HAS_DECLSPEC_ATTRIBUTE(attribute) (0) +#endif + +#if defined(JSON_HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE) + #undef JSON_HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE +#endif +#if defined(__has_declspec_attribute) + #define JSON_HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE(attribute,major,minor,patch) __has_declspec_attribute(attribute) +#else + #define 
JSON_HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE) + #undef JSON_HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE +#endif +#if defined(__has_declspec_attribute) + #define JSON_HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE(attribute,major,minor,patch) __has_declspec_attribute(attribute) +#else + #define JSON_HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_HAS_WARNING) + #undef JSON_HEDLEY_HAS_WARNING +#endif +#if defined(__has_warning) + #define JSON_HEDLEY_HAS_WARNING(warning) __has_warning(warning) +#else + #define JSON_HEDLEY_HAS_WARNING(warning) (0) +#endif + +#if defined(JSON_HEDLEY_GNUC_HAS_WARNING) + #undef JSON_HEDLEY_GNUC_HAS_WARNING +#endif +#if defined(__has_warning) + #define JSON_HEDLEY_GNUC_HAS_WARNING(warning,major,minor,patch) __has_warning(warning) +#else + #define JSON_HEDLEY_GNUC_HAS_WARNING(warning,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_GCC_HAS_WARNING) + #undef JSON_HEDLEY_GCC_HAS_WARNING +#endif +#if defined(__has_warning) + #define JSON_HEDLEY_GCC_HAS_WARNING(warning,major,minor,patch) __has_warning(warning) +#else + #define JSON_HEDLEY_GCC_HAS_WARNING(warning,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) +#endif + +#if \ + (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) || \ + defined(__clang__) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(3,0,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) || \ + JSON_HEDLEY_PGI_VERSION_CHECK(18,4,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,7,0) || \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(2,0,1) || \ + JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,1,0) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,0,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ + JSON_HEDLEY_CRAY_VERSION_CHECK(5,0,0) || \ + JSON_HEDLEY_TINYC_VERSION_CHECK(0,9,17) || \ + JSON_HEDLEY_SUNPRO_VERSION_CHECK(8,0,0) || \ + (JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) && defined(__C99_PRAGMA_OPERATOR)) + #define JSON_HEDLEY_PRAGMA(value) _Pragma(#value) +#elif JSON_HEDLEY_MSVC_VERSION_CHECK(15,0,0) + #define JSON_HEDLEY_PRAGMA(value) __pragma(value) +#else + #define JSON_HEDLEY_PRAGMA(value) +#endif + +#if defined(JSON_HEDLEY_DIAGNOSTIC_PUSH) + #undef JSON_HEDLEY_DIAGNOSTIC_PUSH +#endif +#if defined(JSON_HEDLEY_DIAGNOSTIC_POP) + #undef JSON_HEDLEY_DIAGNOSTIC_POP +#endif +#if defined(__clang__) + #define JSON_HEDLEY_DIAGNOSTIC_PUSH _Pragma("clang diagnostic push") + #define JSON_HEDLEY_DIAGNOSTIC_POP _Pragma("clang diagnostic pop") +#elif JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_PUSH _Pragma("warning(push)") + #define JSON_HEDLEY_DIAGNOSTIC_POP _Pragma("warning(pop)") +#elif JSON_HEDLEY_GCC_VERSION_CHECK(4,6,0) + #define JSON_HEDLEY_DIAGNOSTIC_PUSH _Pragma("GCC diagnostic push") + #define JSON_HEDLEY_DIAGNOSTIC_POP _Pragma("GCC diagnostic pop") +#elif \ + JSON_HEDLEY_MSVC_VERSION_CHECK(15,0,0) || \ + JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) + #define JSON_HEDLEY_DIAGNOSTIC_PUSH __pragma(warning(push)) + #define JSON_HEDLEY_DIAGNOSTIC_POP __pragma(warning(pop)) +#elif JSON_HEDLEY_ARM_VERSION_CHECK(5,6,0) + #define JSON_HEDLEY_DIAGNOSTIC_PUSH _Pragma("push") + #define 
JSON_HEDLEY_DIAGNOSTIC_POP _Pragma("pop") +#elif \ + JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,4,0) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(8,1,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) + #define JSON_HEDLEY_DIAGNOSTIC_PUSH _Pragma("diag_push") + #define JSON_HEDLEY_DIAGNOSTIC_POP _Pragma("diag_pop") +#elif JSON_HEDLEY_PELLES_VERSION_CHECK(2,90,0) + #define JSON_HEDLEY_DIAGNOSTIC_PUSH _Pragma("warning(push)") + #define JSON_HEDLEY_DIAGNOSTIC_POP _Pragma("warning(pop)") +#else + #define JSON_HEDLEY_DIAGNOSTIC_PUSH + #define JSON_HEDLEY_DIAGNOSTIC_POP +#endif + +/* JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_ is for + HEDLEY INTERNAL USE ONLY. API subject to change without notice. */ +#if defined(JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_) + #undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_ +#endif +#if defined(__cplusplus) +# if JSON_HEDLEY_HAS_WARNING("-Wc++98-compat") +# if JSON_HEDLEY_HAS_WARNING("-Wc++17-extensions") +# if JSON_HEDLEY_HAS_WARNING("-Wc++1z-extensions") +# define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(xpr) \ + JSON_HEDLEY_DIAGNOSTIC_PUSH \ + _Pragma("clang diagnostic ignored \"-Wc++98-compat\"") \ + _Pragma("clang diagnostic ignored \"-Wc++17-extensions\"") \ + _Pragma("clang diagnostic ignored \"-Wc++1z-extensions\"") \ + xpr \ + JSON_HEDLEY_DIAGNOSTIC_POP +# else +# define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(xpr) \ + JSON_HEDLEY_DIAGNOSTIC_PUSH \ + _Pragma("clang diagnostic ignored \"-Wc++98-compat\"") \ + _Pragma("clang diagnostic ignored \"-Wc++17-extensions\"") \ + xpr \ + JSON_HEDLEY_DIAGNOSTIC_POP +# endif +# else +# define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(xpr) \ + JSON_HEDLEY_DIAGNOSTIC_PUSH \ + _Pragma("clang diagnostic ignored \"-Wc++98-compat\"") \ + xpr \ + JSON_HEDLEY_DIAGNOSTIC_POP +# endif +# endif +#endif +#if !defined(JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(x) x +#endif + +#if defined(JSON_HEDLEY_CONST_CAST) + #undef JSON_HEDLEY_CONST_CAST +#endif +#if defined(__cplusplus) +# define JSON_HEDLEY_CONST_CAST(T, expr) (const_cast<T>(expr)) +#elif \ + JSON_HEDLEY_HAS_WARNING("-Wcast-qual") || \ + JSON_HEDLEY_GCC_VERSION_CHECK(4,6,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) +# define JSON_HEDLEY_CONST_CAST(T, expr) (__extension__ ({ \ + JSON_HEDLEY_DIAGNOSTIC_PUSH \ + JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL \ + ((T) (expr)); \ + JSON_HEDLEY_DIAGNOSTIC_POP \ + })) +#else +# define JSON_HEDLEY_CONST_CAST(T, expr) ((T) (expr)) +#endif + +#if defined(JSON_HEDLEY_REINTERPRET_CAST) + #undef JSON_HEDLEY_REINTERPRET_CAST +#endif +#if defined(__cplusplus) + #define JSON_HEDLEY_REINTERPRET_CAST(T, expr) (reinterpret_cast<T>(expr)) +#else + #define JSON_HEDLEY_REINTERPRET_CAST(T, expr) ((T) (expr)) +#endif + +#if defined(JSON_HEDLEY_STATIC_CAST) + #undef JSON_HEDLEY_STATIC_CAST +#endif +#if defined(__cplusplus) + #define JSON_HEDLEY_STATIC_CAST(T, expr) (static_cast<T>(expr)) +#else + #define JSON_HEDLEY_STATIC_CAST(T, expr) ((T) (expr)) +#endif + +#if defined(JSON_HEDLEY_CPP_CAST) + #undef JSON_HEDLEY_CPP_CAST +#endif +#if defined(__cplusplus) +# if JSON_HEDLEY_HAS_WARNING("-Wold-style-cast") +# define JSON_HEDLEY_CPP_CAST(T, expr) \ + JSON_HEDLEY_DIAGNOSTIC_PUSH \ + _Pragma("clang diagnostic ignored \"-Wold-style-cast\"") \ + ((T) (expr)) \ + JSON_HEDLEY_DIAGNOSTIC_POP +# elif 
JSON_HEDLEY_IAR_VERSION_CHECK(8,3,0) +# define JSON_HEDLEY_CPP_CAST(T, expr) \ + JSON_HEDLEY_DIAGNOSTIC_PUSH \ + _Pragma("diag_suppress=Pe137") \ + JSON_HEDLEY_DIAGNOSTIC_POP +# else +# define JSON_HEDLEY_CPP_CAST(T, expr) ((T) (expr)) +# endif +#else +# define JSON_HEDLEY_CPP_CAST(T, expr) (expr) +#endif + +#if defined(JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED) + #undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED +#endif +#if JSON_HEDLEY_HAS_WARNING("-Wdeprecated-declarations") + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"") +#elif JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("warning(disable:1478 1786)") +#elif JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED __pragma(warning(disable:1478 1786)) +#elif JSON_HEDLEY_PGI_VERSION_CHECK(20,7,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress 1215,1216,1444,1445") +#elif JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress 1215,1444") +#elif JSON_HEDLEY_GCC_VERSION_CHECK(4,3,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"") +#elif JSON_HEDLEY_MSVC_VERSION_CHECK(15,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED __pragma(warning(disable:4996)) +#elif JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress 1215,1444") +#elif \ + JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress 1291,1718") +#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,13,0) && !defined(__cplusplus) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("error_messages(off,E_DEPRECATED_ATT,E_DEPRECATED_ATT_MESS)") +#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,13,0) && defined(__cplusplus) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("error_messages(off,symdeprecated,symdeprecated2)") +#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress=Pe1444,Pe1215") +#elif JSON_HEDLEY_PELLES_VERSION_CHECK(2,90,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("warn(disable:2241)") +#else + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED +#endif + +#if defined(JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS) + #undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS +#endif +#if JSON_HEDLEY_HAS_WARNING("-Wunknown-pragmas") + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("clang diagnostic ignored \"-Wunknown-pragmas\"") +#elif JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("warning(disable:161)") +#elif 
JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS __pragma(warning(disable:161)) +#elif JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress 1675") +#elif JSON_HEDLEY_GCC_VERSION_CHECK(4,3,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("GCC diagnostic ignored \"-Wunknown-pragmas\"") +#elif JSON_HEDLEY_MSVC_VERSION_CHECK(15,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS __pragma(warning(disable:4068)) +#elif \ + JSON_HEDLEY_TI_VERSION_CHECK(16,9,0) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(8,0,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,3,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress 163") +#elif JSON_HEDLEY_TI_CL6X_VERSION_CHECK(8,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress 163") +#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress=Pe161") +#elif JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress 161") +#else + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS +#endif + +#if defined(JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES) + #undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES +#endif +#if JSON_HEDLEY_HAS_WARNING("-Wunknown-attributes") + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("clang diagnostic ignored \"-Wunknown-attributes\"") +#elif JSON_HEDLEY_GCC_VERSION_CHECK(4,6,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"") +#elif JSON_HEDLEY_INTEL_VERSION_CHECK(17,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("warning(disable:1292)") +#elif JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES __pragma(warning(disable:1292)) +#elif JSON_HEDLEY_MSVC_VERSION_CHECK(19,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES __pragma(warning(disable:5030)) +#elif JSON_HEDLEY_PGI_VERSION_CHECK(20,7,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress 1097,1098") +#elif JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress 1097") +#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,14,0) && defined(__cplusplus) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("error_messages(off,attrskipunsup)") +#elif \ + JSON_HEDLEY_TI_VERSION_CHECK(18,1,0) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(8,3,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress 1173") +#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress=Pe1097") +#elif JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress 1097") +#else + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES +#endif + +#if defined(JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL) + #undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL +#endif +#if JSON_HEDLEY_HAS_WARNING("-Wcast-qual") + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL _Pragma("clang diagnostic ignored 
\"-Wcast-qual\"") +#elif JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL _Pragma("warning(disable:2203 2331)") +#elif JSON_HEDLEY_GCC_VERSION_CHECK(3,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL _Pragma("GCC diagnostic ignored \"-Wcast-qual\"") +#else + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL +#endif + +#if defined(JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION) + #undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION +#endif +#if JSON_HEDLEY_HAS_WARNING("-Wunused-function") + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION _Pragma("clang diagnostic ignored \"-Wunused-function\"") +#elif JSON_HEDLEY_GCC_VERSION_CHECK(3,4,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION _Pragma("GCC diagnostic ignored \"-Wunused-function\"") +#elif JSON_HEDLEY_MSVC_VERSION_CHECK(1,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION __pragma(warning(disable:4505)) +#elif JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION _Pragma("diag_suppress 3142") +#else + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION +#endif + +#if defined(JSON_HEDLEY_DEPRECATED) + #undef JSON_HEDLEY_DEPRECATED +#endif +#if defined(JSON_HEDLEY_DEPRECATED_FOR) + #undef JSON_HEDLEY_DEPRECATED_FOR +#endif +#if \ + JSON_HEDLEY_MSVC_VERSION_CHECK(14,0,0) || \ + JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) + #define JSON_HEDLEY_DEPRECATED(since) __declspec(deprecated("Since " # since)) + #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement) __declspec(deprecated("Since " #since "; use " #replacement)) +#elif \ + (JSON_HEDLEY_HAS_EXTENSION(attribute_deprecated_with_message) && !defined(JSON_HEDLEY_IAR_VERSION)) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(4,5,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(5,6,0) || \ + JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,13,0) || \ + JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0) || \ + JSON_HEDLEY_TI_VERSION_CHECK(18,1,0) || \ + JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(18,1,0) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(8,3,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,3,0) || \ + JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) + #define JSON_HEDLEY_DEPRECATED(since) __attribute__((__deprecated__("Since " #since))) + #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement) __attribute__((__deprecated__("Since " #since "; use " #replacement))) +#elif defined(__cplusplus) && (__cplusplus >= 201402L) + #define JSON_HEDLEY_DEPRECATED(since) JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[deprecated("Since " #since)]]) + #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement) JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[deprecated("Since " #since "; use " #replacement)]]) +#elif \ + JSON_HEDLEY_HAS_ATTRIBUTE(deprecated) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(3,1,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + 
JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ + JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) || \ + JSON_HEDLEY_IAR_VERSION_CHECK(8,10,0) + #define JSON_HEDLEY_DEPRECATED(since) __attribute__((__deprecated__)) + #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement) __attribute__((__deprecated__)) +#elif \ + JSON_HEDLEY_MSVC_VERSION_CHECK(13,10,0) || \ + JSON_HEDLEY_PELLES_VERSION_CHECK(6,50,0) || \ + JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) + #define JSON_HEDLEY_DEPRECATED(since) __declspec(deprecated) + #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement) __declspec(deprecated) +#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) + #define JSON_HEDLEY_DEPRECATED(since) _Pragma("deprecated") + #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement) _Pragma("deprecated") +#else + #define JSON_HEDLEY_DEPRECATED(since) + #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement) +#endif + +#if defined(JSON_HEDLEY_UNAVAILABLE) + #undef JSON_HEDLEY_UNAVAILABLE +#endif +#if \ + JSON_HEDLEY_HAS_ATTRIBUTE(warning) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(4,3,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) + #define JSON_HEDLEY_UNAVAILABLE(available_since) __attribute__((__warning__("Not available until " #available_since))) +#else + #define JSON_HEDLEY_UNAVAILABLE(available_since) +#endif + +#if defined(JSON_HEDLEY_WARN_UNUSED_RESULT) + #undef JSON_HEDLEY_WARN_UNUSED_RESULT +#endif +#if defined(JSON_HEDLEY_WARN_UNUSED_RESULT_MSG) + #undef JSON_HEDLEY_WARN_UNUSED_RESULT_MSG +#endif +#if \ + JSON_HEDLEY_HAS_ATTRIBUTE(warn_unused_result) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(3,4,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ + (JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,15,0) && defined(__cplusplus)) || \ + JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0) || \ + JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) + #define JSON_HEDLEY_WARN_UNUSED_RESULT __attribute__((__warn_unused_result__)) + #define JSON_HEDLEY_WARN_UNUSED_RESULT_MSG(msg) __attribute__((__warn_unused_result__)) +#elif (JSON_HEDLEY_HAS_CPP_ATTRIBUTE(nodiscard) >= 201907L) + #define JSON_HEDLEY_WARN_UNUSED_RESULT JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard]]) + #define JSON_HEDLEY_WARN_UNUSED_RESULT_MSG(msg) JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard(msg)]]) +#elif JSON_HEDLEY_HAS_CPP_ATTRIBUTE(nodiscard) + #define JSON_HEDLEY_WARN_UNUSED_RESULT JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard]]) + #define JSON_HEDLEY_WARN_UNUSED_RESULT_MSG(msg) JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard]]) +#elif defined(_Check_return_) /* SAL */ + #define JSON_HEDLEY_WARN_UNUSED_RESULT _Check_return_ + #define JSON_HEDLEY_WARN_UNUSED_RESULT_MSG(msg) _Check_return_ +#else 
+ #define JSON_HEDLEY_WARN_UNUSED_RESULT + #define JSON_HEDLEY_WARN_UNUSED_RESULT_MSG(msg) +#endif + +#if defined(JSON_HEDLEY_SENTINEL) + #undef JSON_HEDLEY_SENTINEL +#endif +#if \ + JSON_HEDLEY_HAS_ATTRIBUTE(sentinel) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(4,0,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(5,4,0) || \ + JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) + #define JSON_HEDLEY_SENTINEL(position) __attribute__((__sentinel__(position))) +#else + #define JSON_HEDLEY_SENTINEL(position) +#endif + +#if defined(JSON_HEDLEY_NO_RETURN) + #undef JSON_HEDLEY_NO_RETURN +#endif +#if JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) + #define JSON_HEDLEY_NO_RETURN __noreturn +#elif \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) + #define JSON_HEDLEY_NO_RETURN __attribute__((__noreturn__)) +#elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L + #define JSON_HEDLEY_NO_RETURN _Noreturn +#elif defined(__cplusplus) && (__cplusplus >= 201103L) + #define JSON_HEDLEY_NO_RETURN JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[noreturn]]) +#elif \ + JSON_HEDLEY_HAS_ATTRIBUTE(noreturn) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(3,2,0) || \ + JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ + JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ + JSON_HEDLEY_IAR_VERSION_CHECK(8,10,0) + #define JSON_HEDLEY_NO_RETURN __attribute__((__noreturn__)) +#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,10,0) + #define JSON_HEDLEY_NO_RETURN _Pragma("does_not_return") +#elif \ + JSON_HEDLEY_MSVC_VERSION_CHECK(13,10,0) || \ + JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) + #define JSON_HEDLEY_NO_RETURN __declspec(noreturn) +#elif JSON_HEDLEY_TI_CL6X_VERSION_CHECK(6,0,0) && defined(__cplusplus) + #define JSON_HEDLEY_NO_RETURN _Pragma("FUNC_NEVER_RETURNS;") +#elif JSON_HEDLEY_COMPCERT_VERSION_CHECK(3,2,0) + #define JSON_HEDLEY_NO_RETURN __attribute((noreturn)) +#elif JSON_HEDLEY_PELLES_VERSION_CHECK(9,0,0) + #define JSON_HEDLEY_NO_RETURN __declspec(noreturn) +#else + #define JSON_HEDLEY_NO_RETURN +#endif + +#if defined(JSON_HEDLEY_NO_ESCAPE) + #undef JSON_HEDLEY_NO_ESCAPE +#endif +#if JSON_HEDLEY_HAS_ATTRIBUTE(noescape) + #define JSON_HEDLEY_NO_ESCAPE __attribute__((__noescape__)) +#else + #define JSON_HEDLEY_NO_ESCAPE +#endif + +#if defined(JSON_HEDLEY_UNREACHABLE) + #undef JSON_HEDLEY_UNREACHABLE +#endif +#if defined(JSON_HEDLEY_UNREACHABLE_RETURN) + #undef JSON_HEDLEY_UNREACHABLE_RETURN +#endif +#if defined(JSON_HEDLEY_ASSUME) + #undef JSON_HEDLEY_ASSUME +#endif +#if \ + JSON_HEDLEY_MSVC_VERSION_CHECK(13,10,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) + #define JSON_HEDLEY_ASSUME(expr) __assume(expr) +#elif JSON_HEDLEY_HAS_BUILTIN(__builtin_assume) + #define 
JSON_HEDLEY_ASSUME(expr) __builtin_assume(expr) +#elif \ + JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,2,0) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(4,0,0) + #if defined(__cplusplus) + #define JSON_HEDLEY_ASSUME(expr) std::_nassert(expr) + #else + #define JSON_HEDLEY_ASSUME(expr) _nassert(expr) + #endif +#endif +#if \ + (JSON_HEDLEY_HAS_BUILTIN(__builtin_unreachable) && (!defined(JSON_HEDLEY_ARM_VERSION))) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(4,5,0) || \ + JSON_HEDLEY_PGI_VERSION_CHECK(18,10,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_IBM_VERSION_CHECK(13,1,5) || \ + JSON_HEDLEY_CRAY_VERSION_CHECK(10,0,0) || \ + JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) + #define JSON_HEDLEY_UNREACHABLE() __builtin_unreachable() +#elif defined(JSON_HEDLEY_ASSUME) + #define JSON_HEDLEY_UNREACHABLE() JSON_HEDLEY_ASSUME(0) +#endif +#if !defined(JSON_HEDLEY_ASSUME) + #if defined(JSON_HEDLEY_UNREACHABLE) + #define JSON_HEDLEY_ASSUME(expr) JSON_HEDLEY_STATIC_CAST(void, ((expr) ? 1 : (JSON_HEDLEY_UNREACHABLE(), 1))) + #else + #define JSON_HEDLEY_ASSUME(expr) JSON_HEDLEY_STATIC_CAST(void, expr) + #endif +#endif +#if defined(JSON_HEDLEY_UNREACHABLE) + #if \ + JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,2,0) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(4,0,0) + #define JSON_HEDLEY_UNREACHABLE_RETURN(value) return (JSON_HEDLEY_STATIC_CAST(void, JSON_HEDLEY_ASSUME(0)), (value)) + #else + #define JSON_HEDLEY_UNREACHABLE_RETURN(value) JSON_HEDLEY_UNREACHABLE() + #endif +#else + #define JSON_HEDLEY_UNREACHABLE_RETURN(value) return (value) +#endif +#if !defined(JSON_HEDLEY_UNREACHABLE) + #define JSON_HEDLEY_UNREACHABLE() JSON_HEDLEY_ASSUME(0) +#endif + +JSON_HEDLEY_DIAGNOSTIC_PUSH +#if JSON_HEDLEY_HAS_WARNING("-Wpedantic") + #pragma clang diagnostic ignored "-Wpedantic" +#endif +#if JSON_HEDLEY_HAS_WARNING("-Wc++98-compat-pedantic") && defined(__cplusplus) + #pragma clang diagnostic ignored "-Wc++98-compat-pedantic" +#endif +#if JSON_HEDLEY_GCC_HAS_WARNING("-Wvariadic-macros",4,0,0) + #if defined(__clang__) + #pragma clang diagnostic ignored "-Wvariadic-macros" + #elif defined(JSON_HEDLEY_GCC_VERSION) + #pragma GCC diagnostic ignored "-Wvariadic-macros" + #endif +#endif +#if defined(JSON_HEDLEY_NON_NULL) + #undef JSON_HEDLEY_NON_NULL +#endif +#if \ + JSON_HEDLEY_HAS_ATTRIBUTE(nonnull) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(3,3,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) + #define JSON_HEDLEY_NON_NULL(...) __attribute__((__nonnull__(__VA_ARGS__))) +#else + #define JSON_HEDLEY_NON_NULL(...) 
+#endif +JSON_HEDLEY_DIAGNOSTIC_POP + +#if defined(JSON_HEDLEY_PRINTF_FORMAT) + #undef JSON_HEDLEY_PRINTF_FORMAT +#endif +#if defined(__MINGW32__) && JSON_HEDLEY_GCC_HAS_ATTRIBUTE(format,4,4,0) && !defined(__USE_MINGW_ANSI_STDIO) + #define JSON_HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) __attribute__((__format__(ms_printf, string_idx, first_to_check))) +#elif defined(__MINGW32__) && JSON_HEDLEY_GCC_HAS_ATTRIBUTE(format,4,4,0) && defined(__USE_MINGW_ANSI_STDIO) + #define JSON_HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) __attribute__((__format__(gnu_printf, string_idx, first_to_check))) +#elif \ + JSON_HEDLEY_HAS_ATTRIBUTE(format) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(3,1,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(5,6,0) || \ + JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ + JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ + JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) + #define JSON_HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) __attribute__((__format__(__printf__, string_idx, first_to_check))) +#elif JSON_HEDLEY_PELLES_VERSION_CHECK(6,0,0) + #define JSON_HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) __declspec(vaformat(printf,string_idx,first_to_check)) +#else + #define JSON_HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) +#endif + +#if defined(JSON_HEDLEY_CONSTEXPR) + #undef JSON_HEDLEY_CONSTEXPR +#endif +#if defined(__cplusplus) + #if __cplusplus >= 201103L + #define JSON_HEDLEY_CONSTEXPR JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(constexpr) + #endif +#endif +#if !defined(JSON_HEDLEY_CONSTEXPR) + #define JSON_HEDLEY_CONSTEXPR +#endif + +#if defined(JSON_HEDLEY_PREDICT) + #undef JSON_HEDLEY_PREDICT +#endif +#if defined(JSON_HEDLEY_LIKELY) + #undef JSON_HEDLEY_LIKELY +#endif +#if defined(JSON_HEDLEY_UNLIKELY) + #undef JSON_HEDLEY_UNLIKELY +#endif +#if defined(JSON_HEDLEY_UNPREDICTABLE) + #undef JSON_HEDLEY_UNPREDICTABLE +#endif +#if JSON_HEDLEY_HAS_BUILTIN(__builtin_unpredictable) + #define JSON_HEDLEY_UNPREDICTABLE(expr) __builtin_unpredictable((expr)) +#endif +#if \ + (JSON_HEDLEY_HAS_BUILTIN(__builtin_expect_with_probability) && !defined(JSON_HEDLEY_PGI_VERSION)) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(9,0,0) || \ + JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) +# define JSON_HEDLEY_PREDICT(expr, value, probability) __builtin_expect_with_probability( (expr), (value), (probability)) +# define JSON_HEDLEY_PREDICT_TRUE(expr, probability) __builtin_expect_with_probability(!!(expr), 1 , (probability)) +# define JSON_HEDLEY_PREDICT_FALSE(expr, probability) __builtin_expect_with_probability(!!(expr), 0 , (probability)) +# define JSON_HEDLEY_LIKELY(expr) __builtin_expect (!!(expr), 1 ) +# define JSON_HEDLEY_UNLIKELY(expr) __builtin_expect (!!(expr), 0 ) +#elif \ + (JSON_HEDLEY_HAS_BUILTIN(__builtin_expect) && !defined(JSON_HEDLEY_INTEL_CL_VERSION)) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(3,0,0) || \ + 
JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + (JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,15,0) && defined(__cplusplus)) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ + JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,7,0) || \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(3,1,0) || \ + JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,1,0) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(6,1,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ + JSON_HEDLEY_TINYC_VERSION_CHECK(0,9,27) || \ + JSON_HEDLEY_CRAY_VERSION_CHECK(8,1,0) || \ + JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) +# define JSON_HEDLEY_PREDICT(expr, expected, probability) \ + (((probability) >= 0.9) ? __builtin_expect((expr), (expected)) : (JSON_HEDLEY_STATIC_CAST(void, expected), (expr))) +# define JSON_HEDLEY_PREDICT_TRUE(expr, probability) \ + (__extension__ ({ \ + double hedley_probability_ = (probability); \ + ((hedley_probability_ >= 0.9) ? __builtin_expect(!!(expr), 1) : ((hedley_probability_ <= 0.1) ? __builtin_expect(!!(expr), 0) : !!(expr))); \ + })) +# define JSON_HEDLEY_PREDICT_FALSE(expr, probability) \ + (__extension__ ({ \ + double hedley_probability_ = (probability); \ + ((hedley_probability_ >= 0.9) ? __builtin_expect(!!(expr), 0) : ((hedley_probability_ <= 0.1) ? __builtin_expect(!!(expr), 1) : !!(expr))); \ + })) +# define JSON_HEDLEY_LIKELY(expr) __builtin_expect(!!(expr), 1) +# define JSON_HEDLEY_UNLIKELY(expr) __builtin_expect(!!(expr), 0) +#else +# define JSON_HEDLEY_PREDICT(expr, expected, probability) (JSON_HEDLEY_STATIC_CAST(void, expected), (expr)) +# define JSON_HEDLEY_PREDICT_TRUE(expr, probability) (!!(expr)) +# define JSON_HEDLEY_PREDICT_FALSE(expr, probability) (!!(expr)) +# define JSON_HEDLEY_LIKELY(expr) (!!(expr)) +# define JSON_HEDLEY_UNLIKELY(expr) (!!(expr)) +#endif +#if !defined(JSON_HEDLEY_UNPREDICTABLE) + #define JSON_HEDLEY_UNPREDICTABLE(expr) JSON_HEDLEY_PREDICT(expr, 1, 0.5) +#endif + +#if defined(JSON_HEDLEY_MALLOC) + #undef JSON_HEDLEY_MALLOC +#endif +#if \ + JSON_HEDLEY_HAS_ATTRIBUTE(malloc) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(3,1,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + JSON_HEDLEY_IBM_VERSION_CHECK(12,1,0) || \ + JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ + JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) + #define JSON_HEDLEY_MALLOC __attribute__((__malloc__)) +#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,10,0) + #define JSON_HEDLEY_MALLOC _Pragma("returns_new_memory") +#elif \ + JSON_HEDLEY_MSVC_VERSION_CHECK(14,0,0) || \ + JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) + #define JSON_HEDLEY_MALLOC __declspec(restrict) +#else + #define JSON_HEDLEY_MALLOC +#endif + +#if defined(JSON_HEDLEY_PURE) + #undef JSON_HEDLEY_PURE +#endif +#if \ 
+ JSON_HEDLEY_HAS_ATTRIBUTE(pure) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(2,96,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ + JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ + JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0) || \ + JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) +# define JSON_HEDLEY_PURE __attribute__((__pure__)) +#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,10,0) +# define JSON_HEDLEY_PURE _Pragma("does_not_write_global_data") +#elif defined(__cplusplus) && \ + ( \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(2,0,1) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(4,0,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) \ + ) +# define JSON_HEDLEY_PURE _Pragma("FUNC_IS_PURE;") +#else +# define JSON_HEDLEY_PURE +#endif + +#if defined(JSON_HEDLEY_CONST) + #undef JSON_HEDLEY_CONST +#endif +#if \ + JSON_HEDLEY_HAS_ATTRIBUTE(const) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(2,5,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ + JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ + JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0) || \ + JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) + #define JSON_HEDLEY_CONST __attribute__((__const__)) +#elif \ + JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,10,0) + #define JSON_HEDLEY_CONST _Pragma("no_side_effect") +#else + #define JSON_HEDLEY_CONST JSON_HEDLEY_PURE +#endif + +#if defined(JSON_HEDLEY_RESTRICT) + #undef JSON_HEDLEY_RESTRICT +#endif +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) && !defined(__cplusplus) + #define JSON_HEDLEY_RESTRICT restrict +#elif \ + JSON_HEDLEY_GCC_VERSION_CHECK(3,1,0) || \ + JSON_HEDLEY_MSVC_VERSION_CHECK(14,0,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ + JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0) || \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,2,4) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(8,1,0) 
|| \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + (JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,14,0) && defined(__cplusplus)) || \ + JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) || \ + defined(__clang__) || \ + JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) + #define JSON_HEDLEY_RESTRICT __restrict +#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,3,0) && !defined(__cplusplus) + #define JSON_HEDLEY_RESTRICT _Restrict +#else + #define JSON_HEDLEY_RESTRICT +#endif + +#if defined(JSON_HEDLEY_INLINE) + #undef JSON_HEDLEY_INLINE +#endif +#if \ + (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) || \ + (defined(__cplusplus) && (__cplusplus >= 199711L)) + #define JSON_HEDLEY_INLINE inline +#elif \ + defined(JSON_HEDLEY_GCC_VERSION) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(6,2,0) + #define JSON_HEDLEY_INLINE __inline__ +#elif \ + JSON_HEDLEY_MSVC_VERSION_CHECK(12,0,0) || \ + JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,1,0) || \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(3,1,0) || \ + JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,2,0) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(8,0,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ + JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) + #define JSON_HEDLEY_INLINE __inline +#else + #define JSON_HEDLEY_INLINE +#endif + +#if defined(JSON_HEDLEY_ALWAYS_INLINE) + #undef JSON_HEDLEY_ALWAYS_INLINE +#endif +#if \ + JSON_HEDLEY_HAS_ATTRIBUTE(always_inline) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(4,0,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ + JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ + JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) || \ + JSON_HEDLEY_IAR_VERSION_CHECK(8,10,0) +# define JSON_HEDLEY_ALWAYS_INLINE __attribute__((__always_inline__)) JSON_HEDLEY_INLINE +#elif \ + JSON_HEDLEY_MSVC_VERSION_CHECK(12,0,0) || \ + JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) +# define JSON_HEDLEY_ALWAYS_INLINE __forceinline +#elif defined(__cplusplus) && \ + ( \ + JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(6,1,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) \ + ) +# define JSON_HEDLEY_ALWAYS_INLINE _Pragma("FUNC_ALWAYS_INLINE;") +#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) +# define JSON_HEDLEY_ALWAYS_INLINE _Pragma("inline=forced") +#else +# define JSON_HEDLEY_ALWAYS_INLINE JSON_HEDLEY_INLINE +#endif + +#if defined(JSON_HEDLEY_NEVER_INLINE) + #undef JSON_HEDLEY_NEVER_INLINE +#endif +#if \ + JSON_HEDLEY_HAS_ATTRIBUTE(noinline) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(4,0,0) || \ + 
JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ + JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ + JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) || \ + JSON_HEDLEY_IAR_VERSION_CHECK(8,10,0) + #define JSON_HEDLEY_NEVER_INLINE __attribute__((__noinline__)) +#elif \ + JSON_HEDLEY_MSVC_VERSION_CHECK(13,10,0) || \ + JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) + #define JSON_HEDLEY_NEVER_INLINE __declspec(noinline) +#elif JSON_HEDLEY_PGI_VERSION_CHECK(10,2,0) + #define JSON_HEDLEY_NEVER_INLINE _Pragma("noinline") +#elif JSON_HEDLEY_TI_CL6X_VERSION_CHECK(6,0,0) && defined(__cplusplus) + #define JSON_HEDLEY_NEVER_INLINE _Pragma("FUNC_CANNOT_INLINE;") +#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) + #define JSON_HEDLEY_NEVER_INLINE _Pragma("inline=never") +#elif JSON_HEDLEY_COMPCERT_VERSION_CHECK(3,2,0) + #define JSON_HEDLEY_NEVER_INLINE __attribute((noinline)) +#elif JSON_HEDLEY_PELLES_VERSION_CHECK(9,0,0) + #define JSON_HEDLEY_NEVER_INLINE __declspec(noinline) +#else + #define JSON_HEDLEY_NEVER_INLINE +#endif + +#if defined(JSON_HEDLEY_PRIVATE) + #undef JSON_HEDLEY_PRIVATE +#endif +#if defined(JSON_HEDLEY_PUBLIC) + #undef JSON_HEDLEY_PUBLIC +#endif +#if defined(JSON_HEDLEY_IMPORT) + #undef JSON_HEDLEY_IMPORT +#endif +#if defined(_WIN32) || defined(__CYGWIN__) +# define JSON_HEDLEY_PRIVATE +# define JSON_HEDLEY_PUBLIC __declspec(dllexport) +# define JSON_HEDLEY_IMPORT __declspec(dllimport) +#else +# if \ + JSON_HEDLEY_HAS_ATTRIBUTE(visibility) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(3,3,0) || \ + JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + JSON_HEDLEY_IBM_VERSION_CHECK(13,1,0) || \ + ( \ + defined(__TI_EABI__) && \ + ( \ + (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) \ + ) \ + ) || \ + JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) +# define JSON_HEDLEY_PRIVATE __attribute__((__visibility__("hidden"))) +# define JSON_HEDLEY_PUBLIC __attribute__((__visibility__("default"))) +# else +# define JSON_HEDLEY_PRIVATE +# define JSON_HEDLEY_PUBLIC +# endif +# define JSON_HEDLEY_IMPORT extern +#endif + +#if defined(JSON_HEDLEY_NO_THROW) + #undef JSON_HEDLEY_NO_THROW +#endif +#if \ + JSON_HEDLEY_HAS_ATTRIBUTE(nothrow) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(3,3,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) + #define JSON_HEDLEY_NO_THROW __attribute__((__nothrow__)) +#elif \ + JSON_HEDLEY_MSVC_VERSION_CHECK(13,1,0) || \ + JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) + #define JSON_HEDLEY_NO_THROW __declspec(nothrow) +#else + #define JSON_HEDLEY_NO_THROW +#endif + +#if 
defined(JSON_HEDLEY_FALL_THROUGH) + #undef JSON_HEDLEY_FALL_THROUGH +#endif +#if \ + JSON_HEDLEY_HAS_ATTRIBUTE(fallthrough) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(7,0,0) || \ + JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) + #define JSON_HEDLEY_FALL_THROUGH __attribute__((__fallthrough__)) +#elif JSON_HEDLEY_HAS_CPP_ATTRIBUTE_NS(clang,fallthrough) + #define JSON_HEDLEY_FALL_THROUGH JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[clang::fallthrough]]) +#elif JSON_HEDLEY_HAS_CPP_ATTRIBUTE(fallthrough) + #define JSON_HEDLEY_FALL_THROUGH JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[fallthrough]]) +#elif defined(__fallthrough) /* SAL */ + #define JSON_HEDLEY_FALL_THROUGH __fallthrough +#else + #define JSON_HEDLEY_FALL_THROUGH +#endif + +#if defined(JSON_HEDLEY_RETURNS_NON_NULL) + #undef JSON_HEDLEY_RETURNS_NON_NULL +#endif +#if \ + JSON_HEDLEY_HAS_ATTRIBUTE(returns_nonnull) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(4,9,0) || \ + JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) + #define JSON_HEDLEY_RETURNS_NON_NULL __attribute__((__returns_nonnull__)) +#elif defined(_Ret_notnull_) /* SAL */ + #define JSON_HEDLEY_RETURNS_NON_NULL _Ret_notnull_ +#else + #define JSON_HEDLEY_RETURNS_NON_NULL +#endif + +#if defined(JSON_HEDLEY_ARRAY_PARAM) + #undef JSON_HEDLEY_ARRAY_PARAM +#endif +#if \ + defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) && \ + !defined(__STDC_NO_VLA__) && \ + !defined(__cplusplus) && \ + !defined(JSON_HEDLEY_PGI_VERSION) && \ + !defined(JSON_HEDLEY_TINYC_VERSION) + #define JSON_HEDLEY_ARRAY_PARAM(name) (name) +#else + #define JSON_HEDLEY_ARRAY_PARAM(name) +#endif + +#if defined(JSON_HEDLEY_IS_CONSTANT) + #undef JSON_HEDLEY_IS_CONSTANT +#endif +#if defined(JSON_HEDLEY_REQUIRE_CONSTEXPR) + #undef JSON_HEDLEY_REQUIRE_CONSTEXPR +#endif +/* JSON_HEDLEY_IS_CONSTEXPR_ is for + HEDLEY INTERNAL USE ONLY. API subject to change without notice. */ +#if defined(JSON_HEDLEY_IS_CONSTEXPR_) + #undef JSON_HEDLEY_IS_CONSTEXPR_ +#endif +#if \ + JSON_HEDLEY_HAS_BUILTIN(__builtin_constant_p) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(3,4,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_TINYC_VERSION_CHECK(0,9,19) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + JSON_HEDLEY_IBM_VERSION_CHECK(13,1,0) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(6,1,0) || \ + (JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,10,0) && !defined(__cplusplus)) || \ + JSON_HEDLEY_CRAY_VERSION_CHECK(8,1,0) || \ + JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) + #define JSON_HEDLEY_IS_CONSTANT(expr) __builtin_constant_p(expr) +#endif +#if !defined(__cplusplus) +# if \ + JSON_HEDLEY_HAS_BUILTIN(__builtin_types_compatible_p) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(3,4,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_IBM_VERSION_CHECK(13,1,0) || \ + JSON_HEDLEY_CRAY_VERSION_CHECK(8,1,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(5,4,0) || \ + JSON_HEDLEY_TINYC_VERSION_CHECK(0,9,24) +#if defined(__INTPTR_TYPE__) + #define JSON_HEDLEY_IS_CONSTEXPR_(expr) __builtin_types_compatible_p(__typeof__((1 ? (void*) ((__INTPTR_TYPE__) ((expr) * 0)) : (int*) 0)), int*) +#else + #include <stdint.h> + #define JSON_HEDLEY_IS_CONSTEXPR_(expr) __builtin_types_compatible_p(__typeof__((1 ? 
(void*) ((intptr_t) ((expr) * 0)) : (int*) 0)), int*) +#endif +# elif \ + ( \ + defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && \ + !defined(JSON_HEDLEY_SUNPRO_VERSION) && \ + !defined(JSON_HEDLEY_PGI_VERSION) && \ + !defined(JSON_HEDLEY_IAR_VERSION)) || \ + (JSON_HEDLEY_HAS_EXTENSION(c_generic_selections) && !defined(JSON_HEDLEY_IAR_VERSION)) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(4,9,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(17,0,0) || \ + JSON_HEDLEY_IBM_VERSION_CHECK(12,1,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(5,3,0) +#if defined(__INTPTR_TYPE__) + #define JSON_HEDLEY_IS_CONSTEXPR_(expr) _Generic((1 ? (void*) ((__INTPTR_TYPE__) ((expr) * 0)) : (int*) 0), int*: 1, void*: 0) +#else + #include <stdint.h> + #define JSON_HEDLEY_IS_CONSTEXPR_(expr) _Generic((1 ? (void*) ((intptr_t) * 0) : (int*) 0), int*: 1, void*: 0) +#endif +# elif \ + defined(JSON_HEDLEY_GCC_VERSION) || \ + defined(JSON_HEDLEY_INTEL_VERSION) || \ + defined(JSON_HEDLEY_TINYC_VERSION) || \ + defined(JSON_HEDLEY_TI_ARMCL_VERSION) || \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(18,12,0) || \ + defined(JSON_HEDLEY_TI_CL2000_VERSION) || \ + defined(JSON_HEDLEY_TI_CL6X_VERSION) || \ + defined(JSON_HEDLEY_TI_CL7X_VERSION) || \ + defined(JSON_HEDLEY_TI_CLPRU_VERSION) || \ + defined(__clang__) +# define JSON_HEDLEY_IS_CONSTEXPR_(expr) ( \ + sizeof(void) != \ + sizeof(*( \ + 1 ? \ + ((void*) ((expr) * 0L) ) : \ +((struct { char v[sizeof(void) * 2]; } *) 1) \ + ) \ + ) \ + ) +# endif +#endif +#if defined(JSON_HEDLEY_IS_CONSTEXPR_) + #if !defined(JSON_HEDLEY_IS_CONSTANT) + #define JSON_HEDLEY_IS_CONSTANT(expr) JSON_HEDLEY_IS_CONSTEXPR_(expr) + #endif + #define JSON_HEDLEY_REQUIRE_CONSTEXPR(expr) (JSON_HEDLEY_IS_CONSTEXPR_(expr) ? (expr) : (-1)) +#else + #if !defined(JSON_HEDLEY_IS_CONSTANT) + #define JSON_HEDLEY_IS_CONSTANT(expr) (0) + #endif + #define JSON_HEDLEY_REQUIRE_CONSTEXPR(expr) (expr) +#endif + +#if defined(JSON_HEDLEY_BEGIN_C_DECLS) + #undef JSON_HEDLEY_BEGIN_C_DECLS +#endif +#if defined(JSON_HEDLEY_END_C_DECLS) + #undef JSON_HEDLEY_END_C_DECLS +#endif +#if defined(JSON_HEDLEY_C_DECL) + #undef JSON_HEDLEY_C_DECL +#endif +#if defined(__cplusplus) + #define JSON_HEDLEY_BEGIN_C_DECLS extern "C" { + #define JSON_HEDLEY_END_C_DECLS } + #define JSON_HEDLEY_C_DECL extern "C" +#else + #define JSON_HEDLEY_BEGIN_C_DECLS + #define JSON_HEDLEY_END_C_DECLS + #define JSON_HEDLEY_C_DECL +#endif + +#if defined(JSON_HEDLEY_STATIC_ASSERT) + #undef JSON_HEDLEY_STATIC_ASSERT +#endif +#if \ + !defined(__cplusplus) && ( \ + (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) || \ + (JSON_HEDLEY_HAS_FEATURE(c_static_assert) && !defined(JSON_HEDLEY_INTEL_CL_VERSION)) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(6,0,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + defined(_Static_assert) \ + ) +# define JSON_HEDLEY_STATIC_ASSERT(expr, message) _Static_assert(expr, message) +#elif \ + (defined(__cplusplus) && (__cplusplus >= 201103L)) || \ + JSON_HEDLEY_MSVC_VERSION_CHECK(16,0,0) || \ + JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) +# define JSON_HEDLEY_STATIC_ASSERT(expr, message) JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(static_assert(expr, message)) +#else +# define JSON_HEDLEY_STATIC_ASSERT(expr, message) +#endif + +#if defined(JSON_HEDLEY_NULL) + #undef JSON_HEDLEY_NULL +#endif +#if defined(__cplusplus) + #if __cplusplus >= 201103L + #define JSON_HEDLEY_NULL JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(nullptr) + #elif defined(NULL) + #define JSON_HEDLEY_NULL NULL + #else + #define JSON_HEDLEY_NULL 
JSON_HEDLEY_STATIC_CAST(void*, 0) + #endif +#elif defined(NULL) + #define JSON_HEDLEY_NULL NULL +#else + #define JSON_HEDLEY_NULL ((void*) 0) +#endif + +#if defined(JSON_HEDLEY_MESSAGE) + #undef JSON_HEDLEY_MESSAGE +#endif +#if JSON_HEDLEY_HAS_WARNING("-Wunknown-pragmas") +# define JSON_HEDLEY_MESSAGE(msg) \ + JSON_HEDLEY_DIAGNOSTIC_PUSH \ + JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS \ + JSON_HEDLEY_PRAGMA(message msg) \ + JSON_HEDLEY_DIAGNOSTIC_POP +#elif \ + JSON_HEDLEY_GCC_VERSION_CHECK(4,4,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) +# define JSON_HEDLEY_MESSAGE(msg) JSON_HEDLEY_PRAGMA(message msg) +#elif JSON_HEDLEY_CRAY_VERSION_CHECK(5,0,0) +# define JSON_HEDLEY_MESSAGE(msg) JSON_HEDLEY_PRAGMA(_CRI message msg) +#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) +# define JSON_HEDLEY_MESSAGE(msg) JSON_HEDLEY_PRAGMA(message(msg)) +#elif JSON_HEDLEY_PELLES_VERSION_CHECK(2,0,0) +# define JSON_HEDLEY_MESSAGE(msg) JSON_HEDLEY_PRAGMA(message(msg)) +#else +# define JSON_HEDLEY_MESSAGE(msg) +#endif + +#if defined(JSON_HEDLEY_WARNING) + #undef JSON_HEDLEY_WARNING +#endif +#if JSON_HEDLEY_HAS_WARNING("-Wunknown-pragmas") +# define JSON_HEDLEY_WARNING(msg) \ + JSON_HEDLEY_DIAGNOSTIC_PUSH \ + JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS \ + JSON_HEDLEY_PRAGMA(clang warning msg) \ + JSON_HEDLEY_DIAGNOSTIC_POP +#elif \ + JSON_HEDLEY_GCC_VERSION_CHECK(4,8,0) || \ + JSON_HEDLEY_PGI_VERSION_CHECK(18,4,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) +# define JSON_HEDLEY_WARNING(msg) JSON_HEDLEY_PRAGMA(GCC warning msg) +#elif \ + JSON_HEDLEY_MSVC_VERSION_CHECK(15,0,0) || \ + JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) +# define JSON_HEDLEY_WARNING(msg) JSON_HEDLEY_PRAGMA(message(msg)) +#else +# define JSON_HEDLEY_WARNING(msg) JSON_HEDLEY_MESSAGE(msg) +#endif + +#if defined(JSON_HEDLEY_REQUIRE) + #undef JSON_HEDLEY_REQUIRE +#endif +#if defined(JSON_HEDLEY_REQUIRE_MSG) + #undef JSON_HEDLEY_REQUIRE_MSG +#endif +#if JSON_HEDLEY_HAS_ATTRIBUTE(diagnose_if) +# if JSON_HEDLEY_HAS_WARNING("-Wgcc-compat") +# define JSON_HEDLEY_REQUIRE(expr) \ + JSON_HEDLEY_DIAGNOSTIC_PUSH \ + _Pragma("clang diagnostic ignored \"-Wgcc-compat\"") \ + __attribute__((diagnose_if(!(expr), #expr, "error"))) \ + JSON_HEDLEY_DIAGNOSTIC_POP +# define JSON_HEDLEY_REQUIRE_MSG(expr,msg) \ + JSON_HEDLEY_DIAGNOSTIC_PUSH \ + _Pragma("clang diagnostic ignored \"-Wgcc-compat\"") \ + __attribute__((diagnose_if(!(expr), msg, "error"))) \ + JSON_HEDLEY_DIAGNOSTIC_POP +# else +# define JSON_HEDLEY_REQUIRE(expr) __attribute__((diagnose_if(!(expr), #expr, "error"))) +# define JSON_HEDLEY_REQUIRE_MSG(expr,msg) __attribute__((diagnose_if(!(expr), msg, "error"))) +# endif +#else +# define JSON_HEDLEY_REQUIRE(expr) +# define JSON_HEDLEY_REQUIRE_MSG(expr,msg) +#endif + +#if defined(JSON_HEDLEY_FLAGS) + #undef JSON_HEDLEY_FLAGS +#endif +#if JSON_HEDLEY_HAS_ATTRIBUTE(flag_enum) && (!defined(__cplusplus) || JSON_HEDLEY_HAS_WARNING("-Wbitfield-enum-conversion")) + #define JSON_HEDLEY_FLAGS __attribute__((__flag_enum__)) +#else + #define JSON_HEDLEY_FLAGS +#endif + +#if defined(JSON_HEDLEY_FLAGS_CAST) + #undef JSON_HEDLEY_FLAGS_CAST +#endif +#if JSON_HEDLEY_INTEL_VERSION_CHECK(19,0,0) +# define JSON_HEDLEY_FLAGS_CAST(T, expr) (__extension__ ({ \ + JSON_HEDLEY_DIAGNOSTIC_PUSH \ + _Pragma("warning(disable:188)") \ + ((T) (expr)); \ + JSON_HEDLEY_DIAGNOSTIC_POP \ + })) +#else +# define JSON_HEDLEY_FLAGS_CAST(T, expr) JSON_HEDLEY_STATIC_CAST(T, expr) +#endif + +#if defined(JSON_HEDLEY_EMPTY_BASES) + #undef JSON_HEDLEY_EMPTY_BASES +#endif +#if \ + 
(JSON_HEDLEY_MSVC_VERSION_CHECK(19,0,23918) && !JSON_HEDLEY_MSVC_VERSION_CHECK(20,0,0)) || \ + JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) + #define JSON_HEDLEY_EMPTY_BASES __declspec(empty_bases) +#else + #define JSON_HEDLEY_EMPTY_BASES +#endif + +/* Remaining macros are deprecated. */ + +#if defined(JSON_HEDLEY_GCC_NOT_CLANG_VERSION_CHECK) + #undef JSON_HEDLEY_GCC_NOT_CLANG_VERSION_CHECK +#endif +#if defined(__clang__) + #define JSON_HEDLEY_GCC_NOT_CLANG_VERSION_CHECK(major,minor,patch) (0) +#else + #define JSON_HEDLEY_GCC_NOT_CLANG_VERSION_CHECK(major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_CLANG_HAS_ATTRIBUTE) + #undef JSON_HEDLEY_CLANG_HAS_ATTRIBUTE +#endif +#define JSON_HEDLEY_CLANG_HAS_ATTRIBUTE(attribute) JSON_HEDLEY_HAS_ATTRIBUTE(attribute) + +#if defined(JSON_HEDLEY_CLANG_HAS_CPP_ATTRIBUTE) + #undef JSON_HEDLEY_CLANG_HAS_CPP_ATTRIBUTE +#endif +#define JSON_HEDLEY_CLANG_HAS_CPP_ATTRIBUTE(attribute) JSON_HEDLEY_HAS_CPP_ATTRIBUTE(attribute) + +#if defined(JSON_HEDLEY_CLANG_HAS_BUILTIN) + #undef JSON_HEDLEY_CLANG_HAS_BUILTIN +#endif +#define JSON_HEDLEY_CLANG_HAS_BUILTIN(builtin) JSON_HEDLEY_HAS_BUILTIN(builtin) + +#if defined(JSON_HEDLEY_CLANG_HAS_FEATURE) + #undef JSON_HEDLEY_CLANG_HAS_FEATURE +#endif +#define JSON_HEDLEY_CLANG_HAS_FEATURE(feature) JSON_HEDLEY_HAS_FEATURE(feature) + +#if defined(JSON_HEDLEY_CLANG_HAS_EXTENSION) + #undef JSON_HEDLEY_CLANG_HAS_EXTENSION +#endif +#define JSON_HEDLEY_CLANG_HAS_EXTENSION(extension) JSON_HEDLEY_HAS_EXTENSION(extension) + +#if defined(JSON_HEDLEY_CLANG_HAS_DECLSPEC_DECLSPEC_ATTRIBUTE) + #undef JSON_HEDLEY_CLANG_HAS_DECLSPEC_DECLSPEC_ATTRIBUTE +#endif +#define JSON_HEDLEY_CLANG_HAS_DECLSPEC_ATTRIBUTE(attribute) JSON_HEDLEY_HAS_DECLSPEC_ATTRIBUTE(attribute) + +#if defined(JSON_HEDLEY_CLANG_HAS_WARNING) + #undef JSON_HEDLEY_CLANG_HAS_WARNING +#endif +#define JSON_HEDLEY_CLANG_HAS_WARNING(warning) JSON_HEDLEY_HAS_WARNING(warning) + +#endif /* !defined(JSON_HEDLEY_VERSION) || (JSON_HEDLEY_VERSION < X) */ + + +// This file contains all internal macro definitions (except those affecting ABI) +// You MUST include macro_unscope.hpp at the end of json.hpp to undef all of them + +// #include + + +// exclude unsupported compilers +#if !defined(JSON_SKIP_UNSUPPORTED_COMPILER_CHECK) + #if defined(__clang__) + #if (__clang_major__ * 10000 + __clang_minor__ * 100 + __clang_patchlevel__) < 30400 + #error "unsupported Clang version - see https://github.com/nlohmann/json#supported-compilers" + #endif + #elif defined(__GNUC__) && !(defined(__ICC) || defined(__INTEL_COMPILER)) + #if (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) < 40800 + #error "unsupported GCC version - see https://github.com/nlohmann/json#supported-compilers" + #endif + #endif +#endif + +// C++ language standard detection +// if the user manually specified the used c++ version this is skipped +#if !defined(JSON_HAS_CPP_20) && !defined(JSON_HAS_CPP_17) && !defined(JSON_HAS_CPP_14) && !defined(JSON_HAS_CPP_11) + #if (defined(__cplusplus) && __cplusplus >= 202002L) || (defined(_MSVC_LANG) && _MSVC_LANG >= 202002L) + #define JSON_HAS_CPP_20 + #define JSON_HAS_CPP_17 + #define JSON_HAS_CPP_14 + #elif (defined(__cplusplus) && __cplusplus >= 201703L) || (defined(_HAS_CXX17) && _HAS_CXX17 == 1) // fix for issue #464 + #define JSON_HAS_CPP_17 + #define JSON_HAS_CPP_14 + #elif (defined(__cplusplus) && __cplusplus >= 201402L) || (defined(_HAS_CXX14) && _HAS_CXX14 == 1) + #define JSON_HAS_CPP_14 + #endif + // the cpp 11 flag is 
always specified because it is the minimal required version + #define JSON_HAS_CPP_11 +#endif + +#ifdef __has_include + #if __has_include() + #include + #endif +#endif + +#if !defined(JSON_HAS_FILESYSTEM) && !defined(JSON_HAS_EXPERIMENTAL_FILESYSTEM) + #ifdef JSON_HAS_CPP_17 + #if defined(__cpp_lib_filesystem) + #define JSON_HAS_FILESYSTEM 1 + #elif defined(__cpp_lib_experimental_filesystem) + #define JSON_HAS_EXPERIMENTAL_FILESYSTEM 1 + #elif !defined(__has_include) + #define JSON_HAS_EXPERIMENTAL_FILESYSTEM 1 + #elif __has_include() + #define JSON_HAS_FILESYSTEM 1 + #elif __has_include() + #define JSON_HAS_EXPERIMENTAL_FILESYSTEM 1 + #endif + + // std::filesystem does not work on MinGW GCC 8: https://sourceforge.net/p/mingw-w64/bugs/737/ + #if defined(__MINGW32__) && defined(__GNUC__) && __GNUC__ == 8 + #undef JSON_HAS_FILESYSTEM + #undef JSON_HAS_EXPERIMENTAL_FILESYSTEM + #endif + + // no filesystem support before GCC 8: https://en.cppreference.com/w/cpp/compiler_support + #if defined(__GNUC__) && !defined(__clang__) && __GNUC__ < 8 + #undef JSON_HAS_FILESYSTEM + #undef JSON_HAS_EXPERIMENTAL_FILESYSTEM + #endif + + // no filesystem support before Clang 7: https://en.cppreference.com/w/cpp/compiler_support + #if defined(__clang_major__) && __clang_major__ < 7 + #undef JSON_HAS_FILESYSTEM + #undef JSON_HAS_EXPERIMENTAL_FILESYSTEM + #endif + + // no filesystem support before MSVC 19.14: https://en.cppreference.com/w/cpp/compiler_support + #if defined(_MSC_VER) && _MSC_VER < 1914 + #undef JSON_HAS_FILESYSTEM + #undef JSON_HAS_EXPERIMENTAL_FILESYSTEM + #endif + + // no filesystem support before iOS 13 + #if defined(__IPHONE_OS_VERSION_MIN_REQUIRED) && __IPHONE_OS_VERSION_MIN_REQUIRED < 130000 + #undef JSON_HAS_FILESYSTEM + #undef JSON_HAS_EXPERIMENTAL_FILESYSTEM + #endif + + // no filesystem support before macOS Catalina + #if defined(__MAC_OS_X_VERSION_MIN_REQUIRED) && __MAC_OS_X_VERSION_MIN_REQUIRED < 101500 + #undef JSON_HAS_FILESYSTEM + #undef JSON_HAS_EXPERIMENTAL_FILESYSTEM + #endif + #endif +#endif + +#ifndef JSON_HAS_EXPERIMENTAL_FILESYSTEM + #define JSON_HAS_EXPERIMENTAL_FILESYSTEM 0 +#endif + +#ifndef JSON_HAS_FILESYSTEM + #define JSON_HAS_FILESYSTEM 0 +#endif + +#ifndef JSON_HAS_THREE_WAY_COMPARISON + #if defined(__cpp_impl_three_way_comparison) && __cpp_impl_three_way_comparison >= 201907L \ + && defined(__cpp_lib_three_way_comparison) && __cpp_lib_three_way_comparison >= 201907L + #define JSON_HAS_THREE_WAY_COMPARISON 1 + #else + #define JSON_HAS_THREE_WAY_COMPARISON 0 + #endif +#endif + +#ifndef JSON_HAS_RANGES + // ranges header shipping in GCC 11.1.0 (released 2021-04-27) has syntax error + #if defined(__GLIBCXX__) && __GLIBCXX__ == 20210427 + #define JSON_HAS_RANGES 0 + #elif defined(__cpp_lib_ranges) + #define JSON_HAS_RANGES 1 + #else + #define JSON_HAS_RANGES 0 + #endif +#endif + +#ifndef JSON_HAS_STATIC_RTTI + #if !defined(_HAS_STATIC_RTTI) || _HAS_STATIC_RTTI != 0 + #define JSON_HAS_STATIC_RTTI 1 + #else + #define JSON_HAS_STATIC_RTTI 0 + #endif +#endif + +#ifdef JSON_HAS_CPP_17 + #define JSON_INLINE_VARIABLE inline +#else + #define JSON_INLINE_VARIABLE +#endif + +#if JSON_HEDLEY_HAS_ATTRIBUTE(no_unique_address) + #define JSON_NO_UNIQUE_ADDRESS [[no_unique_address]] +#else + #define JSON_NO_UNIQUE_ADDRESS +#endif + +// disable documentation warnings on clang +#if defined(__clang__) + #pragma clang diagnostic push + #pragma clang diagnostic ignored "-Wdocumentation" + #pragma clang diagnostic ignored "-Wdocumentation-unknown-command" +#endif + +// allow disabling 
exceptions +#if (defined(__cpp_exceptions) || defined(__EXCEPTIONS) || defined(_CPPUNWIND)) && !defined(JSON_NOEXCEPTION) + #define JSON_THROW(exception) throw exception + #define JSON_TRY try + #define JSON_CATCH(exception) catch(exception) + #define JSON_INTERNAL_CATCH(exception) catch(exception) +#else + #include + #define JSON_THROW(exception) std::abort() + #define JSON_TRY if(true) + #define JSON_CATCH(exception) if(false) + #define JSON_INTERNAL_CATCH(exception) if(false) +#endif + +// override exception macros +#if defined(JSON_THROW_USER) + #undef JSON_THROW + #define JSON_THROW JSON_THROW_USER +#endif +#if defined(JSON_TRY_USER) + #undef JSON_TRY + #define JSON_TRY JSON_TRY_USER +#endif +#if defined(JSON_CATCH_USER) + #undef JSON_CATCH + #define JSON_CATCH JSON_CATCH_USER + #undef JSON_INTERNAL_CATCH + #define JSON_INTERNAL_CATCH JSON_CATCH_USER +#endif +#if defined(JSON_INTERNAL_CATCH_USER) + #undef JSON_INTERNAL_CATCH + #define JSON_INTERNAL_CATCH JSON_INTERNAL_CATCH_USER +#endif + +// allow overriding assert +#if !defined(JSON_ASSERT) + #include // assert + #define JSON_ASSERT(x) assert(x) +#endif + +// allow to access some private functions (needed by the test suite) +#if defined(JSON_TESTS_PRIVATE) + #define JSON_PRIVATE_UNLESS_TESTED public +#else + #define JSON_PRIVATE_UNLESS_TESTED private +#endif + +/*! +@brief macro to briefly define a mapping between an enum and JSON +@def NLOHMANN_JSON_SERIALIZE_ENUM +@since version 3.4.0 +*/ +#define NLOHMANN_JSON_SERIALIZE_ENUM(ENUM_TYPE, ...) \ + template \ + inline void to_json(BasicJsonType& j, const ENUM_TYPE& e) \ + { \ + /* NOLINTNEXTLINE(modernize-type-traits) we use C++11 */ \ + static_assert(std::is_enum::value, #ENUM_TYPE " must be an enum!"); \ + /* NOLINTNEXTLINE(modernize-avoid-c-arrays) we don't want to depend on */ \ + static const std::pair m[] = __VA_ARGS__; \ + auto it = std::find_if(std::begin(m), std::end(m), \ + [e](const std::pair& ej_pair) -> bool \ + { \ + return ej_pair.first == e; \ + }); \ + j = ((it != std::end(m)) ? it : std::begin(m))->second; \ + } \ + template \ + inline void from_json(const BasicJsonType& j, ENUM_TYPE& e) \ + { \ + /* NOLINTNEXTLINE(modernize-type-traits) we use C++11 */ \ + static_assert(std::is_enum::value, #ENUM_TYPE " must be an enum!"); \ + /* NOLINTNEXTLINE(modernize-avoid-c-arrays) we don't want to depend on */ \ + static const std::pair m[] = __VA_ARGS__; \ + auto it = std::find_if(std::begin(m), std::end(m), \ + [&j](const std::pair& ej_pair) -> bool \ + { \ + return ej_pair.second == j; \ + }); \ + e = ((it != std::end(m)) ? it : std::begin(m))->first; \ + } + +// Ugly macros to avoid uglier copy-paste when specializing basic_json. They +// may be removed in the future once the class is split. 
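Editorial note: for reference, a minimal user-side sketch of the NLOHMANN_JSON_SERIALIZE_ENUM macro defined above (the TaskState enum, its string names, and the include path are illustrative only; this snippet is not part of the vendored header):

#include <nlohmann/json.hpp>

enum class TaskState { stopped, running, invalid = -1 };

// The first pair acts as the fallback for unknown enum values / unknown strings.
NLOHMANN_JSON_SERIALIZE_ENUM(TaskState, {
    {TaskState::invalid, nullptr},
    {TaskState::stopped, "stopped"},
    {TaskState::running, "running"},
})

int main()
{
    nlohmann::json j = TaskState::running;                    // serializes to "running"
    TaskState s = j.get<TaskState>();                         // back to TaskState::running
    TaskState u = nlohmann::json("bogus").get<TaskState>();   // unknown string -> TaskState::invalid
    return (s == TaskState::running && u == TaskState::invalid) ? 0 : 1;
}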
+ +#define NLOHMANN_BASIC_JSON_TPL_DECLARATION \ + template class ObjectType, \ + template class ArrayType, \ + class StringType, class BooleanType, class NumberIntegerType, \ + class NumberUnsignedType, class NumberFloatType, \ + template class AllocatorType, \ + template class JSONSerializer, \ + class BinaryType, \ + class CustomBaseClass> + +#define NLOHMANN_BASIC_JSON_TPL \ + basic_json + +// Macros to simplify conversion from/to types + +#define NLOHMANN_JSON_EXPAND( x ) x +#define NLOHMANN_JSON_GET_MACRO(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41, _42, _43, _44, _45, _46, _47, _48, _49, _50, _51, _52, _53, _54, _55, _56, _57, _58, _59, _60, _61, _62, _63, _64, NAME,...) NAME +#define NLOHMANN_JSON_PASTE(...) NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_GET_MACRO(__VA_ARGS__, \ + NLOHMANN_JSON_PASTE64, \ + NLOHMANN_JSON_PASTE63, \ + NLOHMANN_JSON_PASTE62, \ + NLOHMANN_JSON_PASTE61, \ + NLOHMANN_JSON_PASTE60, \ + NLOHMANN_JSON_PASTE59, \ + NLOHMANN_JSON_PASTE58, \ + NLOHMANN_JSON_PASTE57, \ + NLOHMANN_JSON_PASTE56, \ + NLOHMANN_JSON_PASTE55, \ + NLOHMANN_JSON_PASTE54, \ + NLOHMANN_JSON_PASTE53, \ + NLOHMANN_JSON_PASTE52, \ + NLOHMANN_JSON_PASTE51, \ + NLOHMANN_JSON_PASTE50, \ + NLOHMANN_JSON_PASTE49, \ + NLOHMANN_JSON_PASTE48, \ + NLOHMANN_JSON_PASTE47, \ + NLOHMANN_JSON_PASTE46, \ + NLOHMANN_JSON_PASTE45, \ + NLOHMANN_JSON_PASTE44, \ + NLOHMANN_JSON_PASTE43, \ + NLOHMANN_JSON_PASTE42, \ + NLOHMANN_JSON_PASTE41, \ + NLOHMANN_JSON_PASTE40, \ + NLOHMANN_JSON_PASTE39, \ + NLOHMANN_JSON_PASTE38, \ + NLOHMANN_JSON_PASTE37, \ + NLOHMANN_JSON_PASTE36, \ + NLOHMANN_JSON_PASTE35, \ + NLOHMANN_JSON_PASTE34, \ + NLOHMANN_JSON_PASTE33, \ + NLOHMANN_JSON_PASTE32, \ + NLOHMANN_JSON_PASTE31, \ + NLOHMANN_JSON_PASTE30, \ + NLOHMANN_JSON_PASTE29, \ + NLOHMANN_JSON_PASTE28, \ + NLOHMANN_JSON_PASTE27, \ + NLOHMANN_JSON_PASTE26, \ + NLOHMANN_JSON_PASTE25, \ + NLOHMANN_JSON_PASTE24, \ + NLOHMANN_JSON_PASTE23, \ + NLOHMANN_JSON_PASTE22, \ + NLOHMANN_JSON_PASTE21, \ + NLOHMANN_JSON_PASTE20, \ + NLOHMANN_JSON_PASTE19, \ + NLOHMANN_JSON_PASTE18, \ + NLOHMANN_JSON_PASTE17, \ + NLOHMANN_JSON_PASTE16, \ + NLOHMANN_JSON_PASTE15, \ + NLOHMANN_JSON_PASTE14, \ + NLOHMANN_JSON_PASTE13, \ + NLOHMANN_JSON_PASTE12, \ + NLOHMANN_JSON_PASTE11, \ + NLOHMANN_JSON_PASTE10, \ + NLOHMANN_JSON_PASTE9, \ + NLOHMANN_JSON_PASTE8, \ + NLOHMANN_JSON_PASTE7, \ + NLOHMANN_JSON_PASTE6, \ + NLOHMANN_JSON_PASTE5, \ + NLOHMANN_JSON_PASTE4, \ + NLOHMANN_JSON_PASTE3, \ + NLOHMANN_JSON_PASTE2, \ + NLOHMANN_JSON_PASTE1)(__VA_ARGS__)) +#define NLOHMANN_JSON_PASTE2(func, v1) func(v1) +#define NLOHMANN_JSON_PASTE3(func, v1, v2) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE2(func, v2) +#define NLOHMANN_JSON_PASTE4(func, v1, v2, v3) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE3(func, v2, v3) +#define NLOHMANN_JSON_PASTE5(func, v1, v2, v3, v4) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE4(func, v2, v3, v4) +#define NLOHMANN_JSON_PASTE6(func, v1, v2, v3, v4, v5) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE5(func, v2, v3, v4, v5) +#define NLOHMANN_JSON_PASTE7(func, v1, v2, v3, v4, v5, v6) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE6(func, v2, v3, v4, v5, v6) +#define NLOHMANN_JSON_PASTE8(func, v1, v2, v3, v4, v5, v6, v7) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE7(func, v2, v3, v4, v5, v6, v7) +#define NLOHMANN_JSON_PASTE9(func, v1, v2, v3, v4, v5, v6, v7, v8) 
NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE8(func, v2, v3, v4, v5, v6, v7, v8) +#define NLOHMANN_JSON_PASTE10(func, v1, v2, v3, v4, v5, v6, v7, v8, v9) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE9(func, v2, v3, v4, v5, v6, v7, v8, v9) +#define NLOHMANN_JSON_PASTE11(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE10(func, v2, v3, v4, v5, v6, v7, v8, v9, v10) +#define NLOHMANN_JSON_PASTE12(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE11(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11) +#define NLOHMANN_JSON_PASTE13(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE12(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12) +#define NLOHMANN_JSON_PASTE14(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE13(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13) +#define NLOHMANN_JSON_PASTE15(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE14(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14) +#define NLOHMANN_JSON_PASTE16(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE15(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15) +#define NLOHMANN_JSON_PASTE17(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE16(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16) +#define NLOHMANN_JSON_PASTE18(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE17(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17) +#define NLOHMANN_JSON_PASTE19(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE18(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18) +#define NLOHMANN_JSON_PASTE20(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE19(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19) +#define NLOHMANN_JSON_PASTE21(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE20(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20) +#define NLOHMANN_JSON_PASTE22(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE21(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21) +#define NLOHMANN_JSON_PASTE23(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE22(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22) +#define NLOHMANN_JSON_PASTE24(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE23(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, 
v16, v17, v18, v19, v20, v21, v22, v23) +#define NLOHMANN_JSON_PASTE25(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE24(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24) +#define NLOHMANN_JSON_PASTE26(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE25(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25) +#define NLOHMANN_JSON_PASTE27(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE26(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26) +#define NLOHMANN_JSON_PASTE28(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE27(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27) +#define NLOHMANN_JSON_PASTE29(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE28(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28) +#define NLOHMANN_JSON_PASTE30(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE29(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29) +#define NLOHMANN_JSON_PASTE31(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE30(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30) +#define NLOHMANN_JSON_PASTE32(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE31(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31) +#define NLOHMANN_JSON_PASTE33(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE32(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32) +#define NLOHMANN_JSON_PASTE34(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE33(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, 
v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33) +#define NLOHMANN_JSON_PASTE35(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE34(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34) +#define NLOHMANN_JSON_PASTE36(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE35(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35) +#define NLOHMANN_JSON_PASTE37(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE36(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36) +#define NLOHMANN_JSON_PASTE38(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE37(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37) +#define NLOHMANN_JSON_PASTE39(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE38(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38) +#define NLOHMANN_JSON_PASTE40(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE39(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39) +#define NLOHMANN_JSON_PASTE41(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE40(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40) +#define NLOHMANN_JSON_PASTE42(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE41(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, 
v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41) +#define NLOHMANN_JSON_PASTE43(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE42(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42) +#define NLOHMANN_JSON_PASTE44(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE43(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43) +#define NLOHMANN_JSON_PASTE45(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE44(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44) +#define NLOHMANN_JSON_PASTE46(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE45(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45) +#define NLOHMANN_JSON_PASTE47(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE46(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46) +#define NLOHMANN_JSON_PASTE48(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE47(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47) +#define NLOHMANN_JSON_PASTE49(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48) NLOHMANN_JSON_PASTE2(func, v1) 
NLOHMANN_JSON_PASTE48(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48) +#define NLOHMANN_JSON_PASTE50(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE49(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49) +#define NLOHMANN_JSON_PASTE51(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE50(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50) +#define NLOHMANN_JSON_PASTE52(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE51(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51) +#define NLOHMANN_JSON_PASTE53(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE52(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52) +#define NLOHMANN_JSON_PASTE54(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE53(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53) +#define NLOHMANN_JSON_PASTE55(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE54(func, v2, v3, v4, v5, v6, 
v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54) +#define NLOHMANN_JSON_PASTE56(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE55(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55) +#define NLOHMANN_JSON_PASTE57(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE56(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56) +#define NLOHMANN_JSON_PASTE58(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE57(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57) +#define NLOHMANN_JSON_PASTE59(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE58(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58) +#define NLOHMANN_JSON_PASTE60(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE59(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59) +#define NLOHMANN_JSON_PASTE61(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, 
v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE60(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60) +#define NLOHMANN_JSON_PASTE62(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE61(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61) +#define NLOHMANN_JSON_PASTE63(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE62(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62) +#define NLOHMANN_JSON_PASTE64(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE63(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63) + +#define NLOHMANN_JSON_TO(v1) nlohmann_json_j[#v1] = nlohmann_json_t.v1; +#define NLOHMANN_JSON_FROM(v1) nlohmann_json_j.at(#v1).get_to(nlohmann_json_t.v1); +#define NLOHMANN_JSON_FROM_WITH_DEFAULT(v1) nlohmann_json_t.v1 = !nlohmann_json_j.is_null() ? nlohmann_json_j.value(#v1, nlohmann_json_default_obj.v1) : nlohmann_json_default_obj.v1; + +/*! +@brief macro +@def NLOHMANN_DEFINE_TYPE_INTRUSIVE +@since version 3.9.0 +*/ +#define NLOHMANN_DEFINE_TYPE_INTRUSIVE(Type, ...) \ + friend void to_json(nlohmann::json& nlohmann_json_j, const Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_TO, __VA_ARGS__)) } \ + friend void from_json(const nlohmann::json& nlohmann_json_j, Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_FROM, __VA_ARGS__)) } + +/*! 
+@brief macro +@def NLOHMANN_DEFINE_TYPE_INTRUSIVE_WITH_DEFAULT +@since version 3.11.0 +*/ +#define NLOHMANN_DEFINE_TYPE_INTRUSIVE_WITH_DEFAULT(Type, ...) \ + friend void to_json(nlohmann::json& nlohmann_json_j, const Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_TO, __VA_ARGS__)) } \ + friend void from_json(const nlohmann::json& nlohmann_json_j, Type& nlohmann_json_t) { const Type nlohmann_json_default_obj{}; NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_FROM_WITH_DEFAULT, __VA_ARGS__)) } + +/*! +@brief macro +@def NLOHMANN_DEFINE_TYPE_INTRUSIVE_ONLY_SERIALIZE +@since version 3.11.x +*/ +#define NLOHMANN_DEFINE_TYPE_INTRUSIVE_ONLY_SERIALIZE(Type, ...) \ + friend void to_json(nlohmann::json& nlohmann_json_j, const Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_TO, __VA_ARGS__)) } + +/*! +@brief macro +@def NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE +@since version 3.9.0 +*/ +#define NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Type, ...) \ + inline void to_json(nlohmann::json& nlohmann_json_j, const Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_TO, __VA_ARGS__)) } \ + inline void from_json(const nlohmann::json& nlohmann_json_j, Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_FROM, __VA_ARGS__)) } + +/*! +@brief macro +@def NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE_WITH_DEFAULT +@since version 3.11.0 +*/ +#define NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE_WITH_DEFAULT(Type, ...) \ + inline void to_json(nlohmann::json& nlohmann_json_j, const Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_TO, __VA_ARGS__)) } \ + inline void from_json(const nlohmann::json& nlohmann_json_j, Type& nlohmann_json_t) { const Type nlohmann_json_default_obj{}; NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_FROM_WITH_DEFAULT, __VA_ARGS__)) } + +/*! +@brief macro +@def NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE_ONLY_SERIALIZE +@since version 3.11.x +*/ +#define NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE_ONLY_SERIALIZE(Type, ...) \ + inline void to_json(nlohmann::json& nlohmann_json_j, const Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_TO, __VA_ARGS__)) } + +/*! +@brief macro +@def NLOHMANN_DEFINE_DERIVED_TYPE_INTRUSIVE +@since version 3.11.x +*/ +#define NLOHMANN_DEFINE_DERIVED_TYPE_INTRUSIVE(Type, BaseType, ...) \ + friend void to_json(nlohmann::json& nlohmann_json_j, const Type& nlohmann_json_t) { nlohmann::to_json(nlohmann_json_j, static_cast(nlohmann_json_t)); NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_TO, __VA_ARGS__)) } \ + friend void from_json(const nlohmann::json& nlohmann_json_j, Type& nlohmann_json_t) { nlohmann::from_json(nlohmann_json_j, static_cast(nlohmann_json_t)); NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_FROM, __VA_ARGS__)) } + +#define NLOHMANN_DEFINE_DERIVED_TYPE_INTRUSIVE_WITH_DEFAULT(Type, BaseType, ...) \ + friend void to_json(nlohmann::json& nlohmann_json_j, const Type& nlohmann_json_t) { nlohmann::to_json(nlohmann_json_j, static_cast(nlohmann_json_t)); NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_TO, __VA_ARGS__)) } \ + friend void from_json(const nlohmann::json& nlohmann_json_j, Type& nlohmann_json_t) { nlohmann::from_json(nlohmann_json_j, static_cast(nlohmann_json_t)); const Type nlohmann_json_default_obj{}; NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_FROM_WITH_DEFAULT, __VA_ARGS__)) } + +#define NLOHMANN_DEFINE_DERIVED_TYPE_INTRUSIVE_ONLY_SERIALIZE(Type, BaseType, ...) 
\ + friend void to_json(nlohmann::json& nlohmann_json_j, const Type& nlohmann_json_t) { nlohmann::to_json(nlohmann_json_j, static_cast(nlohmann_json_t)); NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_TO, __VA_ARGS__)) } \ + +/*! +@brief macro +@def NLOHMANN_DEFINE_DERIVED_TYPE_NON_INTRUSIVE +@since version 3.11.x +*/ +#define NLOHMANN_DEFINE_DERIVED_TYPE_NON_INTRUSIVE(Type, BaseType, ...) \ + inline void to_json(nlohmann::json& nlohmann_json_j, const Type& nlohmann_json_t) { nlohmann::to_json(nlohmann_json_j, static_cast(nlohmann_json_t)); NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_TO, __VA_ARGS__)) } \ + inline void from_json(const nlohmann::json& nlohmann_json_j, Type& nlohmann_json_t) { nlohmann::from_json(nlohmann_json_j, static_cast(nlohmann_json_t)); NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_FROM, __VA_ARGS__)) } + +#define NLOHMANN_DEFINE_DERIVED_TYPE_NON_INTRUSIVE_WITH_DEFAULT(Type, BaseType, ...) \ + inline void to_json(nlohmann::json& nlohmann_json_j, const Type& nlohmann_json_t) { nlohmann::to_json(nlohmann_json_j, static_cast(nlohmann_json_t)); NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_TO, __VA_ARGS__)) } \ + inline void from_json(const nlohmann::json& nlohmann_json_j, Type& nlohmann_json_t) { nlohmann::from_json(nlohmann_json_j, static_cast(nlohmann_json_t)); const Type nlohmann_json_default_obj{}; NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_FROM_WITH_DEFAULT, __VA_ARGS__)) } + +#define NLOHMANN_DEFINE_DERIVED_TYPE_NON_INTRUSIVE_ONLY_SERIALIZE(Type, BaseType, ...) \ + inline void to_json(nlohmann::json& nlohmann_json_j, const Type& nlohmann_json_t) { nlohmann::to_json(nlohmann_json_j, static_cast(nlohmann_json_t)); NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_TO, __VA_ARGS__)) } \ + +// inspired from https://stackoverflow.com/a/26745591 +// allows to call any std function as if (e.g. with begin): +// using std::begin; begin(x); +// +// it allows using the detected idiom to retrieve the return type +// of such an expression +#define NLOHMANN_CAN_CALL_STD_FUNC_IMPL(std_name) \ + namespace detail { \ + using std::std_name; \ + \ + template \ + using result_of_##std_name = decltype(std_name(std::declval()...)); \ + } \ + \ + namespace detail2 { \ + struct std_name##_tag \ + { \ + }; \ + \ + template \ + std_name##_tag std_name(T&&...); \ + \ + template \ + using result_of_##std_name = decltype(std_name(std::declval()...)); \ + \ + template \ + struct would_call_std_##std_name \ + { \ + static constexpr auto const value = ::nlohmann::detail:: \ + is_detected_exact::value; \ + }; \ + } /* namespace detail2 */ \ + \ + template \ + struct would_call_std_##std_name : detail2::would_call_std_##std_name \ + { \ + } + +#ifndef JSON_USE_IMPLICIT_CONVERSIONS + #define JSON_USE_IMPLICIT_CONVERSIONS 1 +#endif + +#if JSON_USE_IMPLICIT_CONVERSIONS + #define JSON_EXPLICIT +#else + #define JSON_EXPLICIT explicit +#endif + +#ifndef JSON_DISABLE_ENUM_SERIALIZATION + #define JSON_DISABLE_ENUM_SERIALIZATION 0 +#endif + +#ifndef JSON_USE_GLOBAL_UDLS + #define JSON_USE_GLOBAL_UDLS 1 +#endif + +#if JSON_HAS_THREE_WAY_COMPARISON + #include // partial_ordering +#endif + +NLOHMANN_JSON_NAMESPACE_BEGIN +namespace detail +{ + +/////////////////////////// +// JSON type enumeration // +/////////////////////////// + +/*! +@brief the JSON type enumeration + +This enumeration collects the different JSON types. 
It is internally used to +distinguish the stored values, and the functions @ref basic_json::is_null(), +@ref basic_json::is_object(), @ref basic_json::is_array(), +@ref basic_json::is_string(), @ref basic_json::is_boolean(), +@ref basic_json::is_number() (with @ref basic_json::is_number_integer(), +@ref basic_json::is_number_unsigned(), and @ref basic_json::is_number_float()), +@ref basic_json::is_discarded(), @ref basic_json::is_primitive(), and +@ref basic_json::is_structured() rely on it. + +@note There are three enumeration entries (number_integer, number_unsigned, and +number_float), because the library distinguishes these three types for numbers: +@ref basic_json::number_unsigned_t is used for unsigned integers, +@ref basic_json::number_integer_t is used for signed integers, and +@ref basic_json::number_float_t is used for floating-point numbers or to +approximate integers which do not fit in the limits of their respective type. + +@sa see @ref basic_json::basic_json(const value_t value_type) -- create a JSON +value with the default value for a given type + +@since version 1.0.0 +*/ +enum class value_t : std::uint8_t +{ + null, ///< null value + object, ///< object (unordered set of name/value pairs) + array, ///< array (ordered collection of values) + string, ///< string value + boolean, ///< boolean value + number_integer, ///< number value (signed integer) + number_unsigned, ///< number value (unsigned integer) + number_float, ///< number value (floating-point) + binary, ///< binary array (ordered collection of bytes) + discarded ///< discarded by the parser callback function +}; + +/*! +@brief comparison operator for JSON types + +Returns an ordering that is similar to Python: +- order: null < boolean < number < object < array < string < binary +- furthermore, each type is not smaller than itself +- discarded values are not comparable +- binary is represented as a b"" string in python and directly comparable to a + string; however, making a binary array directly comparable with a string would + be surprising behavior in a JSON file. 
+ +@since version 1.0.0 +*/ +#if JSON_HAS_THREE_WAY_COMPARISON + inline std::partial_ordering operator<=>(const value_t lhs, const value_t rhs) noexcept // *NOPAD* +#else + inline bool operator<(const value_t lhs, const value_t rhs) noexcept +#endif +{ + static constexpr std::array order = {{ + 0 /* null */, 3 /* object */, 4 /* array */, 5 /* string */, + 1 /* boolean */, 2 /* integer */, 2 /* unsigned */, 2 /* float */, + 6 /* binary */ + } + }; + + const auto l_index = static_cast(lhs); + const auto r_index = static_cast(rhs); +#if JSON_HAS_THREE_WAY_COMPARISON + if (l_index < order.size() && r_index < order.size()) + { + return order[l_index] <=> order[r_index]; // *NOPAD* + } + return std::partial_ordering::unordered; +#else + return l_index < order.size() && r_index < order.size() && order[l_index] < order[r_index]; +#endif +} + +// GCC selects the built-in operator< over an operator rewritten from +// a user-defined spaceship operator +// Clang, MSVC, and ICC select the rewritten candidate +// (see GCC bug https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105200) +#if JSON_HAS_THREE_WAY_COMPARISON && defined(__GNUC__) +inline bool operator<(const value_t lhs, const value_t rhs) noexcept +{ + return std::is_lt(lhs <=> rhs); // *NOPAD* +} +#endif + +} // namespace detail +NLOHMANN_JSON_NAMESPACE_END + +// #include +// __ _____ _____ _____ +// __| | __| | | | JSON for Modern C++ +// | | |__ | | | | | | version 3.11.3 +// |_____|_____|_____|_|___| https://github.com/nlohmann/json +// +// SPDX-FileCopyrightText: 2013 - 2024 Niels Lohmann +// SPDX-License-Identifier: MIT + + + +// #include + + +NLOHMANN_JSON_NAMESPACE_BEGIN +namespace detail +{ + +/*! +@brief replace all occurrences of a substring by another string + +@param[in,out] s the string to manipulate; changed so that all + occurrences of @a f are replaced with @a t +@param[in] f the substring to replace with @a t +@param[in] t the string to replace @a f + +@pre The search string @a f must not be empty. **This precondition is +enforced with an assertion.** + +@since version 2.0.0 +*/ +template +inline void replace_substring(StringType& s, const StringType& f, + const StringType& t) +{ + JSON_ASSERT(!f.empty()); + for (auto pos = s.find(f); // find first occurrence of f + pos != StringType::npos; // make sure f was found + s.replace(pos, f.size(), t), // replace with t, and + pos = s.find(f, pos + t.size())) // find next occurrence of f + {} +} + +/*! + * @brief string escaping as described in RFC 6901 (Sect. 4) + * @param[in] s string to escape + * @return escaped string + * + * Note the order of escaping "~" to "~0" and "/" to "~1" is important. + */ +template +inline StringType escape(StringType s) +{ + replace_substring(s, StringType{"~"}, StringType{"~0"}); + replace_substring(s, StringType{"/"}, StringType{"~1"}); + return s; +} + +/*! + * @brief string unescaping as described in RFC 6901 (Sect. 4) + * @param[in] s string to unescape + * @return unescaped string + * + * Note the order of escaping "~1" to "/" and "~0" to "~" is important. 
+ */ +template +static void unescape(StringType& s) +{ + replace_substring(s, StringType{"~1"}, StringType{"/"}); + replace_substring(s, StringType{"~0"}, StringType{"~"}); +} + +} // namespace detail +NLOHMANN_JSON_NAMESPACE_END + +// #include +// __ _____ _____ _____ +// __| | __| | | | JSON for Modern C++ +// | | |__ | | | | | | version 3.11.3 +// |_____|_____|_____|_|___| https://github.com/nlohmann/json +// +// SPDX-FileCopyrightText: 2013 - 2024 Niels Lohmann +// SPDX-License-Identifier: MIT + + + +#include // size_t + +// #include + + +NLOHMANN_JSON_NAMESPACE_BEGIN +namespace detail +{ + +/// struct to capture the start position of the current token +struct position_t +{ + /// the total number of characters read + std::size_t chars_read_total = 0; + /// the number of characters read in the current line + std::size_t chars_read_current_line = 0; + /// the number of lines read + std::size_t lines_read = 0; + + /// conversion to size_t to preserve SAX interface + constexpr operator size_t() const + { + return chars_read_total; + } +}; + +} // namespace detail +NLOHMANN_JSON_NAMESPACE_END + +// #include + +// #include +// __ _____ _____ _____ +// __| | __| | | | JSON for Modern C++ +// | | |__ | | | | | | version 3.11.3 +// |_____|_____|_____|_|___| https://github.com/nlohmann/json +// +// SPDX-FileCopyrightText: 2013 - 2024 Niels Lohmann +// SPDX-FileCopyrightText: 2018 The Abseil Authors +// SPDX-License-Identifier: MIT + + + +#include // array +#include // size_t +#include // conditional, enable_if, false_type, integral_constant, is_constructible, is_integral, is_same, remove_cv, remove_reference, true_type +#include // index_sequence, make_index_sequence, index_sequence_for + +// #include + + +NLOHMANN_JSON_NAMESPACE_BEGIN +namespace detail +{ + +template +using uncvref_t = typename std::remove_cv::type>::type; + +#ifdef JSON_HAS_CPP_14 + +// the following utilities are natively available in C++14 +using std::enable_if_t; +using std::index_sequence; +using std::make_index_sequence; +using std::index_sequence_for; + +#else + +// alias templates to reduce boilerplate +template +using enable_if_t = typename std::enable_if::type; + +// The following code is taken from https://github.com/abseil/abseil-cpp/blob/10cb35e459f5ecca5b2ff107635da0bfa41011b4/absl/utility/utility.h +// which is part of Google Abseil (https://github.com/abseil/abseil-cpp), licensed under the Apache License 2.0. + +//// START OF CODE FROM GOOGLE ABSEIL + +// integer_sequence +// +// Class template representing a compile-time integer sequence. An instantiation +// of `integer_sequence` has a sequence of integers encoded in its +// type through its template arguments (which is a common need when +// working with C++11 variadic templates). `absl::integer_sequence` is designed +// to be a drop-in replacement for C++14's `std::integer_sequence`. +// +// Example: +// +// template< class T, T... Ints > +// void user_function(integer_sequence); +// +// int main() +// { +// // user_function's `T` will be deduced to `int` and `Ints...` +// // will be deduced to `0, 1, 2, 3, 4`. +// user_function(make_integer_sequence()); +// } +template +struct integer_sequence +{ + using value_type = T; + static constexpr std::size_t size() noexcept + { + return sizeof...(Ints); + } +}; + +// index_sequence +// +// A helper template for an `integer_sequence` of `size_t`, +// `absl::index_sequence` is designed to be a drop-in replacement for C++14's +// `std::index_sequence`. 
+template +using index_sequence = integer_sequence; + +namespace utility_internal +{ + +template +struct Extend; + +// Note that SeqSize == sizeof...(Ints). It's passed explicitly for efficiency. +template +struct Extend, SeqSize, 0> +{ + using type = integer_sequence < T, Ints..., (Ints + SeqSize)... >; +}; + +template +struct Extend, SeqSize, 1> +{ + using type = integer_sequence < T, Ints..., (Ints + SeqSize)..., 2 * SeqSize >; +}; + +// Recursion helper for 'make_integer_sequence'. +// 'Gen::type' is an alias for 'integer_sequence'. +template +struct Gen +{ + using type = + typename Extend < typename Gen < T, N / 2 >::type, N / 2, N % 2 >::type; +}; + +template +struct Gen +{ + using type = integer_sequence; +}; + +} // namespace utility_internal + +// Compile-time sequences of integers + +// make_integer_sequence +// +// This template alias is equivalent to +// `integer_sequence`, and is designed to be a drop-in +// replacement for C++14's `std::make_integer_sequence`. +template +using make_integer_sequence = typename utility_internal::Gen::type; + +// make_index_sequence +// +// This template alias is equivalent to `index_sequence<0, 1, ..., N-1>`, +// and is designed to be a drop-in replacement for C++14's +// `std::make_index_sequence`. +template +using make_index_sequence = make_integer_sequence; + +// index_sequence_for +// +// Converts a typename pack into an index sequence of the same length, and +// is designed to be a drop-in replacement for C++14's +// `std::index_sequence_for()` +template +using index_sequence_for = make_index_sequence; + +//// END OF CODE FROM GOOGLE ABSEIL + +#endif + +// dispatch utility (taken from ranges-v3) +template struct priority_tag : priority_tag < N - 1 > {}; +template<> struct priority_tag<0> {}; + +// taken from ranges-v3 +template +struct static_const +{ + static JSON_INLINE_VARIABLE constexpr T value{}; +}; + +#ifndef JSON_HAS_CPP_17 + template + constexpr T static_const::value; +#endif + +template +constexpr std::array make_array(Args&& ... args) +{ + return std::array {{static_cast(std::forward(args))...}}; +} + +} // namespace detail +NLOHMANN_JSON_NAMESPACE_END + +// #include +// __ _____ _____ _____ +// __| | __| | | | JSON for Modern C++ +// | | |__ | | | | | | version 3.11.3 +// |_____|_____|_____|_|___| https://github.com/nlohmann/json +// +// SPDX-FileCopyrightText: 2013 - 2024 Niels Lohmann +// SPDX-License-Identifier: MIT + + + +#include // numeric_limits +#include // char_traits +#include // tuple +#include // false_type, is_constructible, is_integral, is_same, true_type +#include // declval + +// #include +// __ _____ _____ _____ +// __| | __| | | | JSON for Modern C++ +// | | |__ | | | | | | version 3.11.3 +// |_____|_____|_____|_|___| https://github.com/nlohmann/json +// +// SPDX-FileCopyrightText: 2013 - 2024 Niels Lohmann +// SPDX-License-Identifier: MIT + + + +#include // random_access_iterator_tag + +// #include + +// #include + +// #include + + +NLOHMANN_JSON_NAMESPACE_BEGIN +namespace detail +{ + +template +struct iterator_types {}; + +template +struct iterator_types < + It, + void_t> +{ + using difference_type = typename It::difference_type; + using value_type = typename It::value_type; + using pointer = typename It::pointer; + using reference = typename It::reference; + using iterator_category = typename It::iterator_category; +}; + +// This is required as some compilers implement std::iterator_traits in a way that +// doesn't work with SFINAE. See https://github.com/nlohmann/json/issues/1341. 
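Editorial note: a standalone sketch of the priority_tag dispatch idiom mirrored from the utility above (the container_size functions are invented for illustration and are not part of the header):

#include <cstddef>
#include <vector>

// priority_tag<1> converts to priority_tag<0> but not vice versa, so overload
// resolution prefers the candidate taking the higher tag whenever it is viable.
template<std::size_t N> struct priority_tag : priority_tag<N - 1> {};
template<> struct priority_tag<0> {};

// Preferred overload: only viable for types with a .size() member (SFINAE).
template<typename T>
auto container_size(const T& c, priority_tag<1>) -> decltype(c.size()) { return c.size(); }

// Fallback overload: always viable, picked when the one above drops out.
template<typename T>
std::size_t container_size(const T&, priority_tag<0>) { return 0; }

template<typename T>
std::size_t container_size(const T& c) { return container_size(c, priority_tag<1>{}); }

// container_size(std::vector<int>{1, 2, 3}) == 3; container_size(42) == 0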
+template +struct iterator_traits +{ +}; + +template +struct iterator_traits < T, enable_if_t < !std::is_pointer::value >> + : iterator_types +{ +}; + +template +struct iterator_traits::value>> +{ + using iterator_category = std::random_access_iterator_tag; + using value_type = T; + using difference_type = ptrdiff_t; + using pointer = T*; + using reference = T&; +}; + +} // namespace detail +NLOHMANN_JSON_NAMESPACE_END + +// #include + +// #include +// __ _____ _____ _____ +// __| | __| | | | JSON for Modern C++ +// | | |__ | | | | | | version 3.11.3 +// |_____|_____|_____|_|___| https://github.com/nlohmann/json +// +// SPDX-FileCopyrightText: 2013 - 2024 Niels Lohmann +// SPDX-License-Identifier: MIT + + + +// #include + + +NLOHMANN_JSON_NAMESPACE_BEGIN + +NLOHMANN_CAN_CALL_STD_FUNC_IMPL(begin); + +NLOHMANN_JSON_NAMESPACE_END + +// #include +// __ _____ _____ _____ +// __| | __| | | | JSON for Modern C++ +// | | |__ | | | | | | version 3.11.3 +// |_____|_____|_____|_|___| https://github.com/nlohmann/json +// +// SPDX-FileCopyrightText: 2013 - 2024 Niels Lohmann +// SPDX-License-Identifier: MIT + + + +// #include + + +NLOHMANN_JSON_NAMESPACE_BEGIN + +NLOHMANN_CAN_CALL_STD_FUNC_IMPL(end); + +NLOHMANN_JSON_NAMESPACE_END + +// #include + +// #include + +// #include +// __ _____ _____ _____ +// __| | __| | | | JSON for Modern C++ +// | | |__ | | | | | | version 3.11.3 +// |_____|_____|_____|_|___| https://github.com/nlohmann/json +// +// SPDX-FileCopyrightText: 2013 - 2024 Niels Lohmann +// SPDX-License-Identifier: MIT + +#ifndef INCLUDE_NLOHMANN_JSON_FWD_HPP_ + #define INCLUDE_NLOHMANN_JSON_FWD_HPP_ + + #include // int64_t, uint64_t + #include // map + #include // allocator + #include // string + #include // vector + + // #include + + + /*! + @brief namespace for Niels Lohmann + @see https://github.com/nlohmann + @since version 1.0.0 + */ + NLOHMANN_JSON_NAMESPACE_BEGIN + + /*! + @brief default JSONSerializer template argument + + This serializer ignores the template arguments and uses ADL + ([argument-dependent lookup](https://en.cppreference.com/w/cpp/language/adl)) + for serialization. + */ + template + struct adl_serializer; + + /// a class to store JSON values + /// @sa https://json.nlohmann.me/api/basic_json/ + template class ObjectType = + std::map, + template class ArrayType = std::vector, + class StringType = std::string, class BooleanType = bool, + class NumberIntegerType = std::int64_t, + class NumberUnsignedType = std::uint64_t, + class NumberFloatType = double, + template class AllocatorType = std::allocator, + template class JSONSerializer = + adl_serializer, + class BinaryType = std::vector, // cppcheck-suppress syntaxError + class CustomBaseClass = void> + class basic_json; + + /// @brief JSON Pointer defines a string syntax for identifying a specific value within a JSON document + /// @sa https://json.nlohmann.me/api/json_pointer/ + template + class json_pointer; + + /*! + @brief default specialization + @sa https://json.nlohmann.me/api/json/ + */ + using json = basic_json<>; + + /// @brief a minimal map-like container that preserves insertion order + /// @sa https://json.nlohmann.me/api/ordered_map/ + template + struct ordered_map; + + /// @brief specialization that maintains the insertion order of object keys + /// @sa https://json.nlohmann.me/api/ordered_json/ + using ordered_json = basic_json; + + NLOHMANN_JSON_NAMESPACE_END + +#endif // INCLUDE_NLOHMANN_JSON_FWD_HPP_ + + +NLOHMANN_JSON_NAMESPACE_BEGIN +/*! 
+@brief detail namespace with internal helper functions + +This namespace collects functions that should not be exposed, +implementations of some @ref basic_json methods, and meta-programming helpers. + +@since version 2.1.0 +*/ +namespace detail +{ + +///////////// +// helpers // +///////////// + +// Note to maintainers: +// +// Every trait in this file expects a non CV-qualified type. +// The only exceptions are in the 'aliases for detected' section +// (i.e. those of the form: decltype(T::member_function(std::declval()))) +// +// In this case, T has to be properly CV-qualified to constraint the function arguments +// (e.g. to_json(BasicJsonType&, const T&)) + +template struct is_basic_json : std::false_type {}; + +NLOHMANN_BASIC_JSON_TPL_DECLARATION +struct is_basic_json : std::true_type {}; + +// used by exceptions create() member functions +// true_type for pointer to possibly cv-qualified basic_json or std::nullptr_t +// false_type otherwise +template +struct is_basic_json_context : + std::integral_constant < bool, + is_basic_json::type>::type>::value + || std::is_same::value > +{}; + +////////////////////// +// json_ref helpers // +////////////////////// + +template +class json_ref; + +template +struct is_json_ref : std::false_type {}; + +template +struct is_json_ref> : std::true_type {}; + +////////////////////////// +// aliases for detected // +////////////////////////// + +template +using mapped_type_t = typename T::mapped_type; + +template +using key_type_t = typename T::key_type; + +template +using value_type_t = typename T::value_type; + +template +using difference_type_t = typename T::difference_type; + +template +using pointer_t = typename T::pointer; + +template +using reference_t = typename T::reference; + +template +using iterator_category_t = typename T::iterator_category; + +template +using to_json_function = decltype(T::to_json(std::declval()...)); + +template +using from_json_function = decltype(T::from_json(std::declval()...)); + +template +using get_template_function = decltype(std::declval().template get()); + +// trait checking if JSONSerializer::from_json(json const&, udt&) exists +template +struct has_from_json : std::false_type {}; + +// trait checking if j.get is valid +// use this trait instead of std::is_constructible or std::is_convertible, +// both rely on, or make use of implicit conversions, and thus fail when T +// has several constructors/operator= (see https://github.com/nlohmann/json/issues/958) +template +struct is_getable +{ + static constexpr bool value = is_detected::value; +}; + +template +struct has_from_json < BasicJsonType, T, enable_if_t < !is_basic_json::value >> +{ + using serializer = typename BasicJsonType::template json_serializer; + + static constexpr bool value = + is_detected_exact::value; +}; + +// This trait checks if JSONSerializer::from_json(json const&) exists +// this overload is used for non-default-constructible user-defined-types +template +struct has_non_default_from_json : std::false_type {}; + +template +struct has_non_default_from_json < BasicJsonType, T, enable_if_t < !is_basic_json::value >> +{ + using serializer = typename BasicJsonType::template json_serializer; + + static constexpr bool value = + is_detected_exact::value; +}; + +// This trait checks if BasicJsonType::json_serializer::to_json exists +// Do not evaluate the trait when T is a basic_json type, to avoid template instantiation infinite recursion. 
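+// [Editorial sketch, not part of the upstream header] How the from_json /
+// to_json detection traits in this section behave for a hypothetical
+// user-defined type with the usual ADL hooks (all names below are examples,
+// written for a translation unit that includes <nlohmann/json.hpp>):
+//
+//   struct person { std::string name; int age; };
+//   void to_json(nlohmann::json& j, const person& p)
+//   { j = nlohmann::json{{"name", p.name}, {"age", p.age}}; }
+//   void from_json(const nlohmann::json& j, person& p)
+//   { j.at("name").get_to(p.name); j.at("age").get_to(p.age); }
+//
+//   static_assert(nlohmann::detail::has_to_json<nlohmann::json, person>::value, "");
+//   static_assert(nlohmann::detail::has_from_json<nlohmann::json, person>::value, "");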
+template +struct has_to_json : std::false_type {}; + +template +struct has_to_json < BasicJsonType, T, enable_if_t < !is_basic_json::value >> +{ + using serializer = typename BasicJsonType::template json_serializer; + + static constexpr bool value = + is_detected_exact::value; +}; + +template +using detect_key_compare = typename T::key_compare; + +template +struct has_key_compare : std::integral_constant::value> {}; + +// obtains the actual object key comparator +template +struct actual_object_comparator +{ + using object_t = typename BasicJsonType::object_t; + using object_comparator_t = typename BasicJsonType::default_object_comparator_t; + using type = typename std::conditional < has_key_compare::value, + typename object_t::key_compare, object_comparator_t>::type; +}; + +template +using actual_object_comparator_t = typename actual_object_comparator::type; + +///////////////// +// char_traits // +///////////////// + +// Primary template of char_traits calls std char_traits +template +struct char_traits : std::char_traits +{}; + +// Explicitly define char traits for unsigned char since it is not standard +template<> +struct char_traits : std::char_traits +{ + using char_type = unsigned char; + using int_type = uint64_t; + + // Redefine to_int_type function + static int_type to_int_type(char_type c) noexcept + { + return static_cast(c); + } + + static char_type to_char_type(int_type i) noexcept + { + return static_cast(i); + } + + static constexpr int_type eof() noexcept + { + return static_cast(std::char_traits::eof()); + } +}; + +// Explicitly define char traits for signed char since it is not standard +template<> +struct char_traits : std::char_traits +{ + using char_type = signed char; + using int_type = uint64_t; + + // Redefine to_int_type function + static int_type to_int_type(char_type c) noexcept + { + return static_cast(c); + } + + static char_type to_char_type(int_type i) noexcept + { + return static_cast(i); + } + + static constexpr int_type eof() noexcept + { + return static_cast(std::char_traits::eof()); + } +}; + +/////////////////// +// is_ functions // +/////////////////// + +// https://en.cppreference.com/w/cpp/types/conjunction +template struct conjunction : std::true_type { }; +template struct conjunction : B { }; +template +struct conjunction +: std::conditional(B::value), conjunction, B>::type {}; + +// https://en.cppreference.com/w/cpp/types/negation +template struct negation : std::integral_constant < bool, !B::value > { }; + +// Reimplementation of is_constructible and is_default_constructible, due to them being broken for +// std::pair and std::tuple until LWG 2367 fix (see https://cplusplus.github.io/LWG/lwg-defects.html#2367). +// This causes compile errors in e.g. clang 3.5 or gcc 4.9. 
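+// [Editorial sketch, not part of the upstream header] The workaround defined
+// below decomposes pairs and tuples element-wise instead of delegating to the
+// affected standard trait, e.g. (illustrative only):
+//
+//   struct no_default { no_default() = delete; };
+//   static_assert(
+//       !nlohmann::detail::is_default_constructible<std::pair<int, no_default>>::value,
+//       "a pair is default constructible only if both elements are");
+//   static_assert(
+//       nlohmann::detail::is_constructible<std::tuple<int, double>>::value, "");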
+template +struct is_default_constructible : std::is_default_constructible {}; + +template +struct is_default_constructible> + : conjunction, is_default_constructible> {}; + +template +struct is_default_constructible> + : conjunction, is_default_constructible> {}; + +template +struct is_default_constructible> + : conjunction...> {}; + +template +struct is_default_constructible> + : conjunction...> {}; + +template +struct is_constructible : std::is_constructible {}; + +template +struct is_constructible> : is_default_constructible> {}; + +template +struct is_constructible> : is_default_constructible> {}; + +template +struct is_constructible> : is_default_constructible> {}; + +template +struct is_constructible> : is_default_constructible> {}; + +template +struct is_iterator_traits : std::false_type {}; + +template +struct is_iterator_traits> +{ + private: + using traits = iterator_traits; + + public: + static constexpr auto value = + is_detected::value && + is_detected::value && + is_detected::value && + is_detected::value && + is_detected::value; +}; + +template +struct is_range +{ + private: + using t_ref = typename std::add_lvalue_reference::type; + + using iterator = detected_t; + using sentinel = detected_t; + + // to be 100% correct, it should use https://en.cppreference.com/w/cpp/iterator/input_or_output_iterator + // and https://en.cppreference.com/w/cpp/iterator/sentinel_for + // but reimplementing these would be too much work, as a lot of other concepts are used underneath + static constexpr auto is_iterator_begin = + is_iterator_traits>::value; + + public: + static constexpr bool value = !std::is_same::value && !std::is_same::value && is_iterator_begin; +}; + +template +using iterator_t = enable_if_t::value, result_of_begin())>>; + +template +using range_value_t = value_type_t>>; + +// The following implementation of is_complete_type is taken from +// https://blogs.msdn.microsoft.com/vcblog/2015/12/02/partial-support-for-expression-sfinae-in-vs-2015-update-1/ +// and is written by Xiang Fan who agreed to using it in this library. + +template +struct is_complete_type : std::false_type {}; + +template +struct is_complete_type : std::true_type {}; + +template +struct is_compatible_object_type_impl : std::false_type {}; + +template +struct is_compatible_object_type_impl < + BasicJsonType, CompatibleObjectType, + enable_if_t < is_detected::value&& + is_detected::value >> +{ + using object_t = typename BasicJsonType::object_t; + + // macOS's is_constructible does not play well with nonesuch... 
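+    // [Editorial sketch, not part of the upstream header] The value computed
+    // just below asks whether the candidate's key/mapped types can construct
+    // the target object_t's key/mapped types, e.g. (illustrative only):
+    //
+    //   static_assert(nlohmann::detail::is_compatible_object_type<
+    //       nlohmann::json, std::map<std::string, int>>::value, "");
+    //   static_assert(!nlohmann::detail::is_compatible_object_type<
+    //       nlohmann::json, std::vector<int>>::value, "");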
+ static constexpr bool value = + is_constructible::value && + is_constructible::value; +}; + +template +struct is_compatible_object_type + : is_compatible_object_type_impl {}; + +template +struct is_constructible_object_type_impl : std::false_type {}; + +template +struct is_constructible_object_type_impl < + BasicJsonType, ConstructibleObjectType, + enable_if_t < is_detected::value&& + is_detected::value >> +{ + using object_t = typename BasicJsonType::object_t; + + static constexpr bool value = + (is_default_constructible::value && + (std::is_move_assignable::value || + std::is_copy_assignable::value) && + (is_constructible::value && + std::is_same < + typename object_t::mapped_type, + typename ConstructibleObjectType::mapped_type >::value)) || + (has_from_json::value || + has_non_default_from_json < + BasicJsonType, + typename ConstructibleObjectType::mapped_type >::value); +}; + +template +struct is_constructible_object_type + : is_constructible_object_type_impl {}; + +template +struct is_compatible_string_type +{ + static constexpr auto value = + is_constructible::value; +}; + +template +struct is_constructible_string_type +{ + // launder type through decltype() to fix compilation failure on ICPC +#ifdef __INTEL_COMPILER + using laundered_type = decltype(std::declval()); +#else + using laundered_type = ConstructibleStringType; +#endif + + static constexpr auto value = + conjunction < + is_constructible, + is_detected_exact>::value; +}; + +template +struct is_compatible_array_type_impl : std::false_type {}; + +template +struct is_compatible_array_type_impl < + BasicJsonType, CompatibleArrayType, + enable_if_t < + is_detected::value&& + is_iterator_traits>>::value&& +// special case for types like std::filesystem::path whose iterator's value_type are themselves +// c.f. https://github.com/nlohmann/json/pull/3073 + !std::is_same>::value >> +{ + static constexpr bool value = + is_constructible>::value; +}; + +template +struct is_compatible_array_type + : is_compatible_array_type_impl {}; + +template +struct is_constructible_array_type_impl : std::false_type {}; + +template +struct is_constructible_array_type_impl < + BasicJsonType, ConstructibleArrayType, + enable_if_t::value >> + : std::true_type {}; + +template +struct is_constructible_array_type_impl < + BasicJsonType, ConstructibleArrayType, + enable_if_t < !std::is_same::value&& + !is_compatible_string_type::value&& + is_default_constructible::value&& +(std::is_move_assignable::value || + std::is_copy_assignable::value)&& +is_detected::value&& +is_iterator_traits>>::value&& +is_detected::value&& +// special case for types like std::filesystem::path whose iterator's value_type are themselves +// c.f. https://github.com/nlohmann/json/pull/3073 +!std::is_same>::value&& +is_complete_type < +detected_t>::value >> +{ + using value_type = range_value_t; + + static constexpr bool value = + std::is_same::value || + has_from_json::value || + has_non_default_from_json < + BasicJsonType, + value_type >::value; +}; + +template +struct is_constructible_array_type + : is_constructible_array_type_impl {}; + +template +struct is_compatible_integer_type_impl : std::false_type {}; + +template +struct is_compatible_integer_type_impl < + RealIntegerType, CompatibleNumberIntegerType, + enable_if_t < std::is_integral::value&& + std::is_integral::value&& + !std::is_same::value >> +{ + // is there an assert somewhere on overflows? 
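+    // [Editorial sketch, not part of the upstream header] The check completed
+    // below accepts only genuine integer types of matching signedness, e.g.
+    // (illustrative only):
+    //
+    //   static_assert(nlohmann::detail::is_compatible_integer_type<
+    //       std::int64_t, short>::value, "");
+    //   static_assert(!nlohmann::detail::is_compatible_integer_type<
+    //       std::int64_t, unsigned int>::value, "signedness must match");
+    //   static_assert(!nlohmann::detail::is_compatible_integer_type<
+    //       std::int64_t, double>::value, "floating point is rejected");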
+ using RealLimits = std::numeric_limits; + using CompatibleLimits = std::numeric_limits; + + static constexpr auto value = + is_constructible::value && + CompatibleLimits::is_integer && + RealLimits::is_signed == CompatibleLimits::is_signed; +}; + +template +struct is_compatible_integer_type + : is_compatible_integer_type_impl {}; + +template +struct is_compatible_type_impl: std::false_type {}; + +template +struct is_compatible_type_impl < + BasicJsonType, CompatibleType, + enable_if_t::value >> +{ + static constexpr bool value = + has_to_json::value; +}; + +template +struct is_compatible_type + : is_compatible_type_impl {}; + +template +struct is_constructible_tuple : std::false_type {}; + +template +struct is_constructible_tuple> : conjunction...> {}; + +template +struct is_json_iterator_of : std::false_type {}; + +template +struct is_json_iterator_of : std::true_type {}; + +template +struct is_json_iterator_of : std::true_type +{}; + +// checks if a given type T is a template specialization of Primary +template
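+// [Editorial sketch, not part of the upstream header] The trait whose
+// declaration begins here reports whether a type is a specialization of a
+// given class template; upstream nlohmann/json names it is_specialization_of
+// (treat that name as an assumption here), e.g. (illustrative only):
+//
+//   static_assert(nlohmann::detail::is_specialization_of<
+//       std::pair, std::pair<int, double>>::value, "");
+//   static_assert(!nlohmann::detail::is_specialization_of<
+//       std::pair, std::tuple<int, double>>::value, "");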