From b22f3f647517b6a2fd5bef57a3532025af0b8984 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Tue, 7 Jan 2025 21:02:35 -0800 Subject: [PATCH 001/248] Fix nightly accuracy tests (#2780) --- python/sglang/test/test_utils.py | 2 +- test/srt/run_suite.py | 3 +- test/srt/test_nightly_gsm8k_eval.py | 49 +++++++++++++++------------- test/srt/test_nightly_human_eval.py | 2 +- test/srt/test_skip_tokenizer_init.py | 6 ++-- 5 files changed, 33 insertions(+), 29 deletions(-) diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py index cd21c896a044..4121deb17cc7 100644 --- a/python/sglang/test/test_utils.py +++ b/python/sglang/test/test_utils.py @@ -36,7 +36,7 @@ DEFAULT_MLA_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8" DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 600 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it" -DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct" +DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct" DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8,neuralmagic/Mistral-7B-Instruct-v0.3-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8,neuralmagic/gemma-2-2b-it-FP8" DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8,neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8,neuralmagic/Qwen2-72B-Instruct-FP8,neuralmagic/Qwen2-57B-A14B-Instruct-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8" DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4,hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4" diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index 83d2e90a43a9..2c1750d363ce 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -49,8 +49,7 @@ ], "nightly": [ "test_nightly_gsm8k_eval.py", - "test_nightly_human_eval.py", - # Disable temporarly + # Disable temporarily # "test_nightly_math_eval.py", ], "sampling/penaltylib": glob.glob( diff --git a/test/srt/test_nightly_gsm8k_eval.py b/test/srt/test_nightly_gsm8k_eval.py index 7e23b721e433..7820f6825a9c 100644 --- a/test/srt/test_nightly_gsm8k_eval.py +++ b/test/srt/test_nightly_gsm8k_eval.py @@ -1,6 +1,5 @@ import json import os -import subprocess import unittest import warnings from datetime import datetime @@ -16,24 +15,26 @@ DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2, DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, DEFAULT_URL_FOR_TEST, + is_in_ci, popen_launch_server, + write_github_step_summary, ) MODEL_SCORE_THRESHOLDS = { - "meta-llama/Llama-3.1-8B-Instruct": 0.83, + "meta-llama/Llama-3.1-8B-Instruct": 0.82, "mistralai/Mistral-7B-Instruct-v0.3": 0.58, - "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": 0.84, + "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": 0.85, "google/gemma-2-27b-it": 0.92, - "meta-llama/Llama-3.1-70B-Instruct": 0.96, - "mistralai/Mixtral-8x7B-Instruct-v0.1": 0.63, - "Qwen/Qwen2-57B-A14B-Instruct": 0.87, - "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8": 0.84, + "meta-llama/Llama-3.1-70B-Instruct": 0.95, + "mistralai/Mixtral-8x7B-Instruct-v0.1": 0.64, + "Qwen/Qwen2-57B-A14B-Instruct": 0.88, + "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8": 0.83, "neuralmagic/Mistral-7B-Instruct-v0.3-FP8": 0.54, - 
"neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8": 0.83, + "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8": 0.84, "neuralmagic/gemma-2-2b-it-FP8": 0.60, - "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8": 0.95, - "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8": 0.61, - "neuralmagic/Qwen2-72B-Instruct-FP8": 0.95, + "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8": 0.94, + "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8": 0.62, + "neuralmagic/Qwen2-72B-Instruct-FP8": 0.94, "neuralmagic/Qwen2-57B-A14B-Instruct-FP8": 0.82, "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4": 0.84, "hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4": 0.83, @@ -67,7 +68,6 @@ def launch_server(base_url, model, is_fp8, is_tp2): base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=other_args, - return_stdout_stderr=(subprocess.DEVNULL, subprocess.DEVNULL), ) return process @@ -99,6 +99,9 @@ def write_results_to_json(model, metrics, mode="a"): def check_model_scores(results): failed_models = [] + summary = " | model | score | threshold |\n" + summary += "| ----- | ----- | --------- |\n" + for model, score in results: threshold = MODEL_SCORE_THRESHOLDS.get(model) if threshold is None: @@ -111,11 +114,19 @@ def check_model_scores(results): f"Model {model} score ({score:.4f}) is below threshold ({threshold:.4f})" ) + line = f"| {model} | {score} | {threshold} |\n" + summary += line + + print(summary) + + if is_in_ci(): + write_github_step_summary(f"### TestNightlyGsm8KEval\n{summary}") + if failed_models: raise AssertionError("\n".join(failed_models)) -class TestEvalAccuracyLarge(unittest.TestCase): +class TestNightlyGsm8KEval(unittest.TestCase): @classmethod def setUpClass(cls): cls.model_groups = [ @@ -127,13 +138,6 @@ def setUpClass(cls): ] cls.base_url = DEFAULT_URL_FOR_TEST - def setUp(self): - self.process = None - - def tearDown(self): - if self.process: - kill_process_tree(self.process.pid) - def test_mgsm_en_all_models(self): warnings.filterwarnings( "ignore", category=ResourceWarning, message="unclosed.*socket" @@ -144,7 +148,7 @@ def test_mgsm_en_all_models(self): for model_group, is_fp8, is_tp2 in self.model_groups: for model in model_group: with self.subTest(model=model): - self.process = launch_server(self.base_url, model, is_fp8, is_tp2) + process = launch_server(self.base_url, model, is_fp8, is_tp2) args = SimpleNamespace( base_url=self.base_url, @@ -163,8 +167,7 @@ def test_mgsm_en_all_models(self): is_first = False all_results.append((model, metrics["score"])) - - self.tearDown() + kill_process_tree(process.pid) try: with open("results.json", "r") as f: diff --git a/test/srt/test_nightly_human_eval.py b/test/srt/test_nightly_human_eval.py index bffe214b5deb..0b682937a825 100644 --- a/test/srt/test_nightly_human_eval.py +++ b/test/srt/test_nightly_human_eval.py @@ -18,7 +18,7 @@ ) -class TestEvalAccuracyLarge(unittest.TestCase): +class TestNightlyHumanEval(unittest.TestCase): @classmethod def setUpClass(cls): if is_in_ci(): diff --git a/test/srt/test_skip_tokenizer_init.py b/test/srt/test_skip_tokenizer_init.py index bc99b23ad581..eef033ea98cb 100644 --- a/test/srt/test_skip_tokenizer_init.py +++ b/test/srt/test_skip_tokenizer_init.py @@ -55,8 +55,10 @@ def run_decode(self, return_logprob=False, top_logprobs_num=0, n=1): print(json.dumps(ret)) def assert_one_item(item): - assert len(item["token_ids"]) == item["meta_info"]["completion_tokens"] - assert len(item["token_ids"]) == max_new_tokens + self.assertEqual( + len(item["token_ids"]), item["meta_info"]["completion_tokens"] + ) + 
self.assertEqual(len(item["token_ids"]), max_new_tokens) assert item["meta_info"]["prompt_tokens"] == len(input_ids) if return_logprob: From 694e41925e6698829b5f24381ec3957429eb4701 Mon Sep 17 00:00:00 2001 From: JJJJOHNSON Date: Wed, 8 Jan 2025 13:46:02 +0800 Subject: [PATCH 002/248] [eagle2] fix end check when target model verify (#2723) --- python/sglang/srt/speculative/eagle_utils.py | 50 ++++++++++++-------- test/srt/test_eagle_infer.py | 29 ++++++++++++ 2 files changed, 60 insertions(+), 19 deletions(-) diff --git a/python/sglang/srt/speculative/eagle_utils.py b/python/sglang/srt/speculative/eagle_utils.py index a6fcf2e570df..88c88c0724f4 100644 --- a/python/sglang/srt/speculative/eagle_utils.py +++ b/python/sglang/srt/speculative/eagle_utils.py @@ -550,8 +550,37 @@ def verify(self, batch: ScheduleBatch, logits_output: torch.Tensor) -> torch.Ten triton.next_power_of_2(max_draft_len), ) - accept_index = accept_index[accept_index != -1] + draft_input = EAGLEDraftInput() + new_accept_index = [] + unfinished_index = [] + finished_extend_len = {} # {rid:accept_length + 1} + accept_index_cpu = accept_index.tolist() + predict_cpu = predict.tolist() + # iterate every accepted token and check if req has finished after append the token + # should be checked BEFORE free kv cache slots + for i, (req, accept_index_row) in enumerate(zip(batch.reqs, accept_index_cpu)): + new_accept_index_ = [] + for j, idx in enumerate(accept_index_row): + if idx == -1: + break + id = predict_cpu[idx] + # if not found_finished: + req.output_ids.append(id) + finished_extend_len[req.rid] = j + 1 + req.check_finished() + if req.finished(): + draft_input.has_finished = True + # set all tokens after finished token to -1 and break + accept_index[i, j + 1 :] = -1 + break + else: + new_accept_index_.append(idx) + if not req.finished(): + new_accept_index.extend(new_accept_index_) + unfinished_index.append(i) + accept_length = (accept_index != -1).sum(dim=1) - 1 + accept_index = accept_index[accept_index != -1] accept_length_cpu = accept_length.tolist() verified_id = predict[accept_index] verified_id_cpu = verified_id.tolist() @@ -570,26 +599,9 @@ def verify(self, batch: ScheduleBatch, logits_output: torch.Tensor) -> torch.Ten triton.next_power_of_2(bs), ) batch.seq_lens.add_(accept_length + 1) - new_accept_index = [] - unfinished_index = [] - finished_extend_len = {} # {rid:accept_length + 1} - # retracted_reqs, new_token_ratio = batch.retract_decode() - - low = 0 - draft_input = EAGLEDraftInput() - for i, (req, verified_len) in enumerate(zip(batch.reqs, accept_length_cpu)): - req.output_ids.extend(verified_id_cpu[low : low + verified_len + 1]) - req.check_finished() - if req.finished(): - draft_input.has_finished = True - else: - new_accept_index.append(accept_index[low : low + verified_len + 1]) - unfinished_index.append(i) - low += verified_len + 1 - finished_extend_len[req.rid] = verified_len + 1 if len(new_accept_index) > 0: - new_accept_index = torch.cat(new_accept_index, dim=0) + new_accept_index = torch.tensor(new_accept_index, device="cuda") draft_input.verified_id = predict[new_accept_index] draft_input.hidden_states = batch.spec_info.hidden_states[new_accept_index] draft_input.accept_length = accept_length[unfinished_index] diff --git a/test/srt/test_eagle_infer.py b/test/srt/test_eagle_infer.py index 609d4411d77d..94ebc79ca743 100644 --- a/test/srt/test_eagle_infer.py +++ b/test/srt/test_eagle_infer.py @@ -1,5 +1,7 @@ import unittest +from transformers import AutoConfig, AutoTokenizer + import sglang as sgl @@ 
-34,6 +36,33 @@ def test_eagle_accuracy(self): print(out2) self.assertEqual(out1, out2) + def test_eagle_end_check(self): + prompt = "[INST] <>\\nYou are a helpful assistant.\\n<>\\nToday is a sunny day and I like [/INST]" + target_model_path = "meta-llama/Llama-2-7b-chat-hf" + tokenizer = AutoTokenizer.from_pretrained(target_model_path) + speculative_draft_model_path = "lmzheng/sglang-EAGLE-llama2-chat-7B" + + sampling_params = { + "temperature": 0, + "max_new_tokens": 1024, + "skip_special_tokens": False, + } + + engine = sgl.Engine( + model_path=target_model_path, + speculative_draft_model_path=speculative_draft_model_path, + speculative_algorithm="EAGLE", + speculative_num_steps=3, + speculative_eagle_topk=4, + speculative_num_draft_tokens=16, + ) + out1 = engine.generate(prompt, sampling_params)["text"] + engine.shutdown() + print("==== Answer 1 ====") + print(repr(out1)) + tokens = tokenizer.encode(out1, truncation=False) + assert tokenizer.eos_token_id not in tokens + if __name__ == "__main__": unittest.main() From 8a6906127a81421e06c904273f8e06dff85039a7 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Tue, 7 Jan 2025 23:29:10 -0800 Subject: [PATCH 003/248] Improve linear.py to load sharded weights & remove the dependency of Parameters from vllm (#2784) Co-authored-by: SangBin Cho rkooo567@gmail.com --- 3rdparty/amd/tuning/benchmark_moe_rocm.py | 5 +- .../sglang/srt/layers/attention/__init__.py | 9 +- .../layers/attention/flashinfer_backend.py | 6 +- python/sglang/srt/layers/linear.py | 222 ++++++--- .../srt/layers/moe/fused_moe_triton/layer.py | 5 +- python/sglang/srt/layers/parameter.py | 431 ++++++++++++++++++ python/sglang/srt/layers/quantization/fp8.py | 2 +- .../srt/layers/vocab_parallel_embedding.py | 2 +- .../sglang/srt/managers/session_controller.py | 2 +- .../srt/model_executor/forward_batch_info.py | 3 + .../sglang/srt/model_executor/model_runner.py | 3 +- python/sglang/srt/models/grok.py | 41 +- python/sglang/srt/server.py | 9 +- python/sglang/srt/speculative/eagle_utils.py | 2 +- scripts/killall_sglang.sh | 1 + 15 files changed, 655 insertions(+), 88 deletions(-) create mode 100644 python/sglang/srt/layers/parameter.py diff --git a/3rdparty/amd/tuning/benchmark_moe_rocm.py b/3rdparty/amd/tuning/benchmark_moe_rocm.py index a3f26e8e5028..5aff8c0d664e 100644 --- a/3rdparty/amd/tuning/benchmark_moe_rocm.py +++ b/3rdparty/amd/tuning/benchmark_moe_rocm.py @@ -10,7 +10,10 @@ from tqdm import tqdm from transformers import AutoConfig -from sglang.srt.layers.fused_moe_triton.fused_moe import fused_moe, get_config_file_name +from sglang.srt.layers.moe.fused_moe_triton.fused_moe import ( + fused_moe, + get_config_file_name, +) padding_size = 128 if bool(int(os.getenv("MOE_PADDING", "0"))) else 0 diff --git a/python/sglang/srt/layers/attention/__init__.py b/python/sglang/srt/layers/attention/__init__.py index 140755ff5e67..745598643028 100644 --- a/python/sglang/srt/layers/attention/__init__.py +++ b/python/sglang/srt/layers/attention/__init__.py @@ -66,7 +66,14 @@ def forward( if forward_batch.forward_mode.is_decode(): return self.forward_decode(q, k, v, layer, forward_batch, save_kv_cache) else: - return self.forward_extend(q, k, v, layer, forward_batch, save_kv_cache) + return self.forward_extend( + q, + k, + v, + layer, + forward_batch, + save_kv_cache, + ) def forward_decode( self, diff --git a/python/sglang/srt/layers/attention/flashinfer_backend.py b/python/sglang/srt/layers/attention/flashinfer_backend.py index 8b823cc5a5dd..fc3455b60774 100644 --- 
a/python/sglang/srt/layers/attention/flashinfer_backend.py +++ b/python/sglang/srt/layers/attention/flashinfer_backend.py @@ -347,6 +347,8 @@ def forward_extend( else forward_batch.encoder_out_cache_loc ) + logits_soft_cap = layer.logit_cap + if not self.forward_metadata.use_ragged: if k is not None: assert v is not None @@ -359,7 +361,7 @@ def forward_extend( causal=not layer.is_cross_attention, sm_scale=layer.scaling, window_left=layer.sliding_window_size, - logits_soft_cap=layer.logit_cap, + logits_soft_cap=logits_soft_cap, ) else: o1, s1 = self.prefill_wrapper_ragged.forward_return_lse( @@ -368,7 +370,7 @@ def forward_extend( v.contiguous().view(-1, layer.tp_v_head_num, layer.head_dim), causal=True, sm_scale=layer.scaling, - logits_soft_cap=layer.logit_cap, + logits_soft_cap=logits_soft_cap, ) if self.forward_metadata.extend_no_prefix: diff --git a/python/sglang/srt/layers/linear.py b/python/sglang/srt/layers/linear.py index b828c03911e8..9edfa739458b 100644 --- a/python/sglang/srt/layers/linear.py +++ b/python/sglang/srt/layers/linear.py @@ -18,14 +18,15 @@ # workaround from vllm.model_executor.layers.linear import LinearBase -from vllm.model_executor.parameter import ( + +from sglang.srt.layers.parameter import ( BasevLLMParameter, PackedColumnParameter, PackedvLLMParameter, PerTensorScaleParameter, RowvLLMParameter, + _ColumnvLLMParameter, ) - from sglang.srt.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase, @@ -94,6 +95,62 @@ def adjust_scalar_to_fused_array(param, loaded_weight, shard_id): return param[shard_id], loaded_weight +def load_column_qkv_weight( + self, loaded_weight, num_heads, shard_id, shard_offset, shard_size, tp_rank +): + if ( + isinstance(self, (PackedColumnParameter, PackedvLLMParameter)) + and self.output_dim == self.packed_dim + ): + shard_size, shard_offset = self.adjust_shard_indexes_for_packing( + shard_offset=shard_offset, shard_size=shard_size + ) + + param_data = self.data + shard_id = tp_rank if shard_id == "q" else tp_rank // num_heads + param_data = param_data.narrow(self.output_dim, shard_offset, shard_size) + loaded_weight = loaded_weight.narrow( + self.output_dim, shard_id * shard_size, shard_size + ) + + assert param_data.shape == loaded_weight.shape + param_data.copy_(loaded_weight) + + +def load_column_parallel_weight( + self, loaded_weight: torch.Tensor, tp_rank, use_presharded_weights: bool = False +): + if isinstance(self, _ColumnvLLMParameter): + if not use_presharded_weights: + shard_size = self.data.shape[self.output_dim] + loaded_weight = loaded_weight.narrow( + self.output_dim, tp_rank * shard_size, shard_size + ) + assert self.data.shape == loaded_weight.shape + self.data.copy_(loaded_weight) + else: + self.data.copy_(loaded_weight) + + +def load_row_parallel_weight( + self, loaded_weight: torch.Tensor, tp_rank, use_presharded_weights: bool = False +): + if isinstance(self, RowvLLMParameter): + if not use_presharded_weights: + shard_size = self.data.shape[self.input_dim] + loaded_weight = loaded_weight.narrow( + self.input_dim, tp_rank * shard_size, shard_size + ) + + if len(loaded_weight.shape) == 0: + loaded_weight = loaded_weight.reshape(1) + + assert self.data.shape == loaded_weight.shape + self.data.copy_(loaded_weight) + else: + self.data.copy_(loaded_weight) + + class LinearMethodBase(QuantizeMethodBase): """Base class for different (maybe quantized) linear methods.""" @@ -287,6 +344,8 @@ def __init__( quant_config: Optional[QuantizationConfig] = None, output_sizes: Optional[List[int]] = None, 
prefix: str = "", + tp_rank: Optional[int] = None, + tp_size: Optional[int] = None, ): super().__init__( input_size, output_size, skip_bias_add, params_dtype, quant_config, prefix @@ -295,7 +354,11 @@ def __init__( self.gather_output = gather_output # Divide the weight matrix along the last dimension. - tp_size = get_tensor_model_parallel_world_size() + if tp_rank is None: + tp_rank = get_tensor_model_parallel_rank() + if tp_size is None: + tp_size = get_tensor_model_parallel_world_size() + self.tp_rank, self.tp_size = tp_rank, tp_size assert self.quant_method is not None self.output_size_per_partition = divide(self.output_size, tp_size) self.output_partition_sizes = [self.output_size_per_partition] @@ -336,7 +399,6 @@ def __init__( self.register_parameter("bias", None) def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): - tp_rank = get_tensor_model_parallel_rank() output_dim = getattr(param, "output_dim", None) # Special case for GGUF @@ -356,7 +418,7 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): # no need to narrow here if output_dim is not None and not use_bitsandbytes_4bit: shard_size = param_data.shape[output_dim] - start_idx = tp_rank * shard_size + start_idx = self.tp_rank * shard_size loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) # Special case for loading scales off disk, which often do not @@ -364,7 +426,9 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): if len(loaded_weight.shape) == 0: loaded_weight = loaded_weight.reshape(1) - assert param_data.shape == loaded_weight.shape + assert ( + param_data.shape == loaded_weight.shape + ), f"{param_data.shape=}, {loaded_weight.shape=}" param_data.copy_(loaded_weight) def weight_loader_v2(self, param: Parameter, loaded_weight: torch.Tensor): @@ -373,7 +437,7 @@ def weight_loader_v2(self, param: Parameter, loaded_weight: torch.Tensor): if len(loaded_weight.shape) == 0: assert loaded_weight.numel() == 1 loaded_weight = loaded_weight.reshape(1) - param.load_column_parallel_weight(loaded_weight=loaded_weight) + load_column_parallel_weight(param, loaded_weight, self.tp_rank) def forward(self, input_): bias = self.bias if not self.skip_bias_add else None @@ -393,7 +457,7 @@ def extra_repr(self) -> str: s = f"in_features={self.input_size}" s += f", output_features={self.output_size_per_partition}" s += f", bias={self.bias is not None}" - s += f", tp_size={get_tensor_model_parallel_world_size()}" + s += f", tp_size={self.tp_size}" s += f", gather_output={self.gather_output}" return s @@ -431,10 +495,18 @@ def __init__( params_dtype: Optional[torch.dtype] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", + tp_rank: Optional[int] = None, + tp_size: Optional[int] = None, + use_presharded_weights: bool = False, ): self.output_sizes = output_sizes - tp_size = get_tensor_model_parallel_world_size() + if tp_rank is None: + tp_rank = get_tensor_model_parallel_rank() + if tp_size is None: + tp_size = get_tensor_model_parallel_world_size() + self.tp_rank, self.tp_size = tp_rank, tp_size assert all(output_size % tp_size == 0 for output_size in output_sizes) + self.use_presharded_weights = use_presharded_weights super().__init__( input_size=input_size, output_size=sum(output_sizes), @@ -444,6 +516,8 @@ def __init__( params_dtype=params_dtype, quant_config=quant_config, prefix=prefix, + tp_rank=tp_rank, + tp_size=tp_size, ) def weight_loader( @@ -463,12 +537,9 @@ def weight_loader( return if is_gguf_weight: - tp_size = 
get_tensor_model_parallel_world_size() - tp_rank = get_tensor_model_parallel_rank() - output_dim = getattr(param, "output_dim", None) - shard_size = loaded_weight.size(output_dim) // tp_size - start_idx = tp_rank * shard_size + shard_size = loaded_weight.size(output_dim) // self.tp_size + start_idx = self.tp_rank * shard_size loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) @@ -494,7 +565,9 @@ def weight_loader( param_data, loaded_weight, 0 ) - assert param_data.shape == loaded_weight.shape + assert ( + param_data.shape == loaded_weight.shape + ), f"{param_data.shape=}, {loaded_weight.shape=}" param_data.copy_(loaded_weight) return current_shard_offset = 0 @@ -522,11 +595,9 @@ def weight_loader( return assert loaded_shard_id < len(self.output_sizes) - tp_rank = get_tensor_model_parallel_rank() - tp_size = get_tensor_model_parallel_world_size() if output_dim is not None: - shard_offset = sum(self.output_sizes[:loaded_shard_id]) // tp_size - shard_size = self.output_sizes[loaded_shard_id] // tp_size + shard_offset = sum(self.output_sizes[:loaded_shard_id]) // self.tp_size + shard_size = self.output_sizes[loaded_shard_id] // self.tp_size # Special case for quantization. # If quantized, we need to adjust the offset and size to account # for the packing. @@ -545,10 +616,10 @@ def weight_loader( shard_offset = loaded_weight.shape[output_dim] * loaded_shard_id param_data = param_data.narrow(output_dim, shard_offset, shard_size) - start_idx = tp_rank * shard_size + start_idx = self.tp_rank * shard_size # bitsandbytes loads the weights of the specific portion # no need to narrow here - if not use_bitsandbytes_4bit: + if not use_bitsandbytes_4bit and not self.use_presharded_weights: loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) # Special case for AQLM codebooks. elif is_metadata: @@ -572,7 +643,9 @@ def weight_loader( "the same for all partitions." 
) - assert param_data.shape == loaded_weight.shape + assert ( + param_data.shape == loaded_weight.shape + ), f"{param_data.shape=}, {loaded_weight.shape=}" param_data.copy_(loaded_weight) def _load_fused_module_from_checkpoint( @@ -629,26 +702,27 @@ def weight_loader_v2( assert loaded_shard_id < len(self.output_sizes) - tp_size = get_tensor_model_parallel_world_size() - if isinstance(param, BlockQuantScaleParameter): weight_block_size = self.quant_method.quant_config.weight_block_size block_n, _ = weight_block_size[0], weight_block_size[1] shard_offset = ( (sum(self.output_sizes[:loaded_shard_id]) + block_n - 1) // block_n - ) // tp_size + ) // self.tp_size shard_size = ( - (self.output_sizes[loaded_shard_id] + block_n - 1) // block_n // tp_size + (self.output_sizes[loaded_shard_id] + block_n - 1) + // block_n + // self.tp_size ) else: - shard_offset = sum(self.output_sizes[:loaded_shard_id]) // tp_size - shard_size = self.output_sizes[loaded_shard_id] // tp_size + shard_offset = sum(self.output_sizes[:loaded_shard_id]) // self.tp_size + shard_size = self.output_sizes[loaded_shard_id] // self.tp_size param.load_merged_column_weight( loaded_weight=loaded_weight, shard_id=loaded_shard_id, shard_offset=shard_offset, shard_size=shard_size, + use_presharded_weights=self.use_presharded_weights, ) @@ -689,6 +763,8 @@ def __init__( params_dtype: Optional[torch.dtype] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", + tp_rank: Optional[int] = None, + tp_size: Optional[int] = None, ): self.hidden_size = hidden_size self.head_size = head_size @@ -697,7 +773,11 @@ def __init__( total_num_kv_heads = total_num_heads self.total_num_kv_heads = total_num_kv_heads # Divide the weight matrix along the last dimension. - tp_size = get_tensor_model_parallel_world_size() + if tp_rank is None: + tp_rank = get_tensor_model_parallel_rank() + if tp_size is None: + tp_size = get_tensor_model_parallel_world_size() + self.tp_rank, self.tp_size = tp_rank, tp_size self.num_heads = divide(self.total_num_heads, tp_size) if tp_size >= self.total_num_kv_heads: self.num_kv_heads = 1 @@ -724,6 +804,8 @@ def __init__( params_dtype=params_dtype, quant_config=quant_config, prefix=prefix, + tp_rank=tp_rank, + tp_size=tp_size, ) def _get_shard_offset_mapping(self, loaded_shard_id: str): @@ -814,13 +896,24 @@ def weight_loader_v2( shard_offset = (shard_offset + block_n - 1) // block_n shard_size = (shard_size + block_n - 1) // block_n - param.load_qkv_weight( - loaded_weight=loaded_weight, - num_heads=self.num_kv_head_replicas, - shard_id=loaded_shard_id, - shard_offset=shard_offset, - shard_size=shard_size, - ) + if isinstance(param, _ColumnvLLMParameter): + load_column_qkv_weight( + param, + loaded_weight, + num_heads=self.num_kv_head_replicas, + shard_id=loaded_shard_id, + shard_offset=shard_offset, + shard_size=shard_size, + tp_rank=self.tp_rank, + ) + else: + param.load_qkv_weight( + loaded_weight=loaded_weight, + num_heads=self.num_kv_head_replicas, + shard_id=loaded_shard_id, + shard_offset=shard_offset, + shard_size=shard_size, + ) def weight_loader( self, @@ -840,12 +933,9 @@ def weight_loader( return if is_gguf_weight: - tp_size = get_tensor_model_parallel_world_size() - tp_rank = get_tensor_model_parallel_rank() - output_dim = getattr(param, "output_dim", None) - shard_size = loaded_weight.size(output_dim) // tp_size - start_idx = tp_rank * shard_size + shard_size = loaded_weight.size(output_dim) // self.tp_size + start_idx = self.tp_rank * shard_size loaded_weight = 
loaded_weight.narrow(output_dim, start_idx, shard_size) @@ -872,7 +962,9 @@ def weight_loader( param_data, loaded_weight, 0 ) - assert param_data.shape == loaded_weight.shape + assert ( + param_data.shape == loaded_weight.shape + ), f"{param_data.shape=}, {loaded_weight.shape=}" param_data.copy_(loaded_weight) return shard_offsets = [ @@ -934,7 +1026,6 @@ def weight_loader( self.weight_loader(param, loaded_weight_shard, shard_id) return - tp_rank = get_tensor_model_parallel_rank() assert loaded_shard_id in ["q", "k", "v"] # If output dim is defined, use the default loading process. @@ -984,9 +1075,9 @@ def weight_loader( param_data = param_data.narrow(output_dim, shard_offset, shard_size) if loaded_shard_id == "q": - shard_id = tp_rank + shard_id = self.tp_rank else: - shard_id = tp_rank // self.num_kv_head_replicas + shard_id = self.tp_rank // self.num_kv_head_replicas start_idx = shard_id * shard_size # bitsandbytes loads the weights of the specific portion @@ -1014,7 +1105,9 @@ def weight_loader( "for all partitions." ) - assert param_data.shape == loaded_weight.shape + assert ( + param_data.shape == loaded_weight.shape + ), f"{param_data.shape=}, {loaded_weight.shape=}" param_data.copy_(loaded_weight) @@ -1055,6 +1148,9 @@ def __init__( reduce_results: bool = True, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", + tp_rank: Optional[int] = None, + tp_size: Optional[int] = None, + use_presharded_weights: bool = False, ): super().__init__( input_size, output_size, skip_bias_add, params_dtype, quant_config, prefix @@ -1064,10 +1160,14 @@ def __init__( self.reduce_results = reduce_results # Divide the weight matrix along the last dimension. - self.tp_rank = get_tensor_model_parallel_rank() - self.tp_size = get_tensor_model_parallel_world_size() + if tp_rank is None: + tp_rank = get_tensor_model_parallel_rank() + if tp_size is None: + tp_size = get_tensor_model_parallel_world_size() + self.tp_rank, self.tp_size = tp_rank, tp_size self.input_size_per_partition = divide(input_size, self.tp_size) assert self.quant_method is not None + self.use_presharded_weights = use_presharded_weights self.quant_method.create_weights( layer=self, @@ -1101,8 +1201,6 @@ def __init__( self.register_parameter("bias", None) def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): - tp_rank = get_tensor_model_parallel_rank() - tp_size = get_tensor_model_parallel_world_size() input_dim = getattr(param, "input_dim", None) use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False) @@ -1116,15 +1214,19 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): if is_gguf_weight and isinstance(param, UninitializedParameter): weight_shape = list(loaded_weight.shape) if input_dim: - weight_shape[input_dim] = weight_shape[input_dim] // tp_size + weight_shape[input_dim] = weight_shape[input_dim] // self.tp_size param.materialize(tuple(weight_shape), dtype=loaded_weight.dtype) param_data = param.data # bitsandbytes loads the weights of the specific portion # no need to narrow here - if input_dim is not None and not use_bitsandbytes_4bit: + if ( + input_dim is not None + and not use_bitsandbytes_4bit + and not self.use_presharded_weights + ): shard_size = param_data.shape[input_dim] - start_idx = tp_rank * shard_size + start_idx = self.tp_rank * shard_size loaded_weight = loaded_weight.narrow(input_dim, start_idx, shard_size) # Special case for loading scales off disk, which often do not @@ -1132,7 +1234,9 @@ def weight_loader(self, param: Parameter, 
loaded_weight: torch.Tensor): if len(loaded_weight.shape) == 0: loaded_weight = loaded_weight.reshape(1) - assert param_data.shape == loaded_weight.shape + assert ( + param_data.shape == loaded_weight.shape + ), f"{param_data.shape=}, {loaded_weight.shape=}" param_data.copy_(loaded_weight) def weight_loader_v2(self, param: BasevLLMParameter, loaded_weight: torch.Tensor): @@ -1143,17 +1247,21 @@ def weight_loader_v2(self, param: BasevLLMParameter, loaded_weight: torch.Tensor assert loaded_weight.numel() == 1 loaded_weight = loaded_weight.reshape(1) - param.load_row_parallel_weight(loaded_weight=loaded_weight) + load_row_parallel_weight( + param, + loaded_weight, + self.tp_rank, + use_presharded_weights=self.use_presharded_weights, + ) def forward(self, input_): if self.input_is_parallel: input_parallel = input_ else: - tp_rank = get_tensor_model_parallel_rank() splitted_input = split_tensor_along_last_dim( input_, num_partitions=self.tp_size ) - input_parallel = splitted_input[tp_rank].contiguous() + input_parallel = splitted_input[self.tp_rank].contiguous() # Matrix multiply. assert self.quant_method is not None diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/layer.py b/python/sglang/srt/layers/moe/fused_moe_triton/layer.py index 96eaf856616f..8d0b7035ee50 100644 --- a/python/sglang/srt/layers/moe/fused_moe_triton/layer.py +++ b/python/sglang/srt/layers/moe/fused_moe_triton/layer.py @@ -204,6 +204,7 @@ def __init__( prefix: str = "", custom_routing_function: Optional[Callable] = None, correction_bias: Optional[torch.Tensor] = None, + use_presharded_weights: bool = False, ): super().__init__() @@ -243,6 +244,7 @@ def __init__( params_dtype=params_dtype, weight_loader=self.weight_loader, ) + self.use_presharded_weights = use_presharded_weights def _load_per_tensor_weight_scale( self, @@ -395,10 +397,7 @@ def weight_loader( weight_name: str, shard_id: str, expert_id: int, - use_presharded_weights: bool = False, ) -> None: - self.use_presharded_weights = use_presharded_weights - # compressed-tensors checkpoints with packed weights are stored flipped # TODO (mgoin): check self.quant_method.quant_config.quant_format # against known CompressionFormat enum values that have this quality diff --git a/python/sglang/srt/layers/parameter.py b/python/sglang/srt/layers/parameter.py new file mode 100644 index 000000000000..435cc69bb51d --- /dev/null +++ b/python/sglang/srt/layers/parameter.py @@ -0,0 +1,431 @@ +""" +Adapted from vLLM (0.6.4.post1). +https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/parameter.py +""" + +import logging +from fractions import Fraction +from typing import Callable, Optional, Union + +import torch +from torch.nn import Parameter +from vllm.distributed import get_tensor_model_parallel_rank + +__all__ = [ + "BasevLLMParameter", + "PackedvLLMParameter", + "PerTensorScaleParameter", + "ModelWeightParameter", + "ChannelQuantScaleParameter", + "GroupQuantScaleParameter", + "PackedColumnParameter", + "RowvLLMParameter", +] + +logger = logging.getLogger(__name__) + + +class BasevLLMParameter(Parameter): + """ + Base parameter for vLLM linear layers. Extends the torch.nn.parameter + by taking in a linear weight loader. Will copy the loaded weight + into the parameter when the provided weight loader is called. 
+ """ + + def __new__(cls, data: torch.Tensor, **kwargs): + + return super().__new__(cls, data=data, requires_grad=False) + + def __init__(self, data: torch.Tensor, weight_loader: Callable): + """ + Initialize the BasevLLMParameter + + :param data: torch tensor with the parameter data + :param weight_loader: weight loader callable + + :returns: a torch.nn.parameter + """ + + self._weight_loader = weight_loader + + @property + def weight_loader(self): + return self._weight_loader + + def _assert_and_load(self, loaded_weight: torch.Tensor): + assert self.data.shape == loaded_weight.shape + self.data.copy_(loaded_weight) + + def load_column_parallel_weight(self, loaded_weight: torch.Tensor): + self._assert_and_load(loaded_weight) + + def load_row_parallel_weight(self, loaded_weight: torch.Tensor): + self._assert_and_load(loaded_weight) + + def load_merged_column_weight(self, loaded_weight: torch.Tensor, **kwargs): + self._assert_and_load(loaded_weight) + + def load_qkv_weight(self, loaded_weight: torch.Tensor, **kwargs): + self._assert_and_load(loaded_weight) + + +class _ColumnvLLMParameter(BasevLLMParameter): + """ + Private class defining weight loading functionality + (load_merged_column_weight, load_qkv_weight) + for parameters being loaded into linear layers with column + parallelism. This includes QKV and MLP layers which are + not already fused on disk. Requires an output dimension + to be defined. Called within the weight loader of + each of the column parallel linear layers. + """ + + def __init__(self, output_dim: int, **kwargs): + self._output_dim = output_dim + super().__init__(**kwargs) + + @property + def output_dim(self): + return self._output_dim + + def load_column_parallel_weight(self, loaded_weight: torch.Tensor): + tp_rank = get_tensor_model_parallel_rank() + shard_size = self.data.shape[self.output_dim] + loaded_weight = loaded_weight.narrow( + self.output_dim, tp_rank * shard_size, shard_size + ) + assert self.data.shape == loaded_weight.shape + self.data.copy_(loaded_weight) + + def load_merged_column_weight(self, loaded_weight: torch.Tensor, **kwargs): + + shard_offset = kwargs.get("shard_offset") + shard_size = kwargs.get("shard_size") + use_presharded_weights = kwargs.get("use_presharded_weights") + if ( + isinstance(self, (PackedColumnParameter, PackedvLLMParameter)) + and self.packed_dim == self.output_dim + ): + shard_size, shard_offset = self.adjust_shard_indexes_for_packing( + shard_offset=shard_offset, shard_size=shard_size + ) + + param_data = self.data + + tp_rank = get_tensor_model_parallel_rank() + param_data = param_data.narrow(self.output_dim, shard_offset, shard_size) + if not use_presharded_weights: + loaded_weight = loaded_weight.narrow( + self.output_dim, tp_rank * shard_size, shard_size + ) + assert param_data.shape == loaded_weight.shape + param_data.copy_(loaded_weight) + + def load_qkv_weight(self, loaded_weight: torch.Tensor, **kwargs): + + shard_offset = kwargs.get("shard_offset") + shard_size = kwargs.get("shard_size") + shard_id = kwargs.get("shard_id") + num_heads = kwargs.get("num_heads") + + if ( + isinstance(self, (PackedColumnParameter, PackedvLLMParameter)) + and self.output_dim == self.packed_dim + ): + shard_size, shard_offset = self.adjust_shard_indexes_for_packing( + shard_offset=shard_offset, shard_size=shard_size + ) + + param_data = self.data + tp_rank = get_tensor_model_parallel_rank() + shard_id = tp_rank if shard_id == "q" else tp_rank // num_heads + param_data = param_data.narrow(self.output_dim, shard_offset, shard_size) + 
loaded_weight = loaded_weight.narrow( + self.output_dim, shard_id * shard_size, shard_size + ) + + assert param_data.shape == loaded_weight.shape + param_data.copy_(loaded_weight) + + +class RowvLLMParameter(BasevLLMParameter): + """ + Parameter class defining weight_loading functionality + (load_row_parallel_weight) for parameters being loaded + into linear layers with row parallel functionality. + Requires an input_dim to be defined. + """ + + def __init__(self, input_dim: int, **kwargs): + self._input_dim = input_dim + super().__init__(**kwargs) + + @property + def input_dim(self): + return self._input_dim + + def load_row_parallel_weight(self, loaded_weight: torch.Tensor, **kwargs): + use_presharded_weights = kwargs.get("use_presharded_weights") + tp_rank = get_tensor_model_parallel_rank() + shard_size = self.data.shape[self.input_dim] + if not use_presharded_weights: + loaded_weight = loaded_weight.narrow( + self.input_dim, tp_rank * shard_size, shard_size + ) + + if len(loaded_weight.shape) == 0: + loaded_weight = loaded_weight.reshape(1) + + assert self.data.shape == loaded_weight.shape + self.data.copy_(loaded_weight) + + +class ModelWeightParameter(_ColumnvLLMParameter, RowvLLMParameter): + """ + Parameter class for linear layer weights. Uses both column and + row parallelism. + """ + + pass + + +class GroupQuantScaleParameter(_ColumnvLLMParameter, RowvLLMParameter): + """ + Parameter class for weight scales loaded for weights with + grouped quantization. Uses both column and row parallelism. + """ + + pass + + +class ChannelQuantScaleParameter(_ColumnvLLMParameter): + """ + Parameter class for weight scales loaded for weights with + channel-wise quantization. Equivalent to _ColumnvLLMParameter. + """ + + pass + + +class PerTensorScaleParameter(BasevLLMParameter): + """ + Parameter class for scales where the number of scales is + equivalent to the number of logical matrices in fused linear + layers (e.g. for QKV, there are 3 scales loaded from disk). + This is relevant to weights with per-tensor quantization. + Adds functionality to map the scalers to a shard during + weight loading. + + Note: additional parameter manipulation may be handled + for each quantization config specifically, within + process_weights_after_loading + """ + + def __init__(self, **kwargs): + self.qkv_idxs = {"q": 0, "k": 1, "v": 2} + super().__init__(**kwargs) + + def _shard_id_as_int(self, shard_id: Union[str, int]) -> int: + if isinstance(shard_id, int): + return shard_id + + # if not int, assume shard_id for qkv + # map to int and return + assert isinstance(shard_id, str) + assert shard_id in self.qkv_idxs + return self.qkv_idxs[shard_id] + + # For row parallel layers, no sharding needed + # load weight into parameter as is + def load_row_parallel_weight(self, *args, **kwargs): + super().load_row_parallel_weight(*args, **kwargs) + + def load_merged_column_weight(self, *args, **kwargs): + self._load_into_shard_id(*args, **kwargs) + + def load_qkv_weight(self, *args, **kwargs): + self._load_into_shard_id(*args, **kwargs) + + def load_column_parallel_weight(self, *args, **kwargs): + super().load_row_parallel_weight(*args, **kwargs) + + def _load_into_shard_id( + self, loaded_weight: torch.Tensor, shard_id: Union[str, int], **kwargs + ): + """ + Slice the parameter data based on the shard id for + loading. 
+ """ + + param_data = self.data + shard_id = self._shard_id_as_int(shard_id) + + # AutoFP8 scales do not have a shape + # compressed-tensors scales do have a shape + if len(loaded_weight.shape) != 0: + assert loaded_weight.shape[0] == 1 + loaded_weight = loaded_weight[0] + + param_data = param_data[shard_id] + assert param_data.shape == loaded_weight.shape + param_data.copy_(loaded_weight) + + +class PackedColumnParameter(_ColumnvLLMParameter): + """ + Parameter for model parameters which are packed on disk + and support column parallelism only. See PackedvLLMParameter + for more details on the packed properties. + """ + + def __init__( + self, + packed_factor: Union[int, Fraction], + packed_dim: int, + marlin_tile_size: Optional[int] = None, + **kwargs + ): + self._packed_factor = packed_factor + self._packed_dim = packed_dim + self._marlin_tile_size = marlin_tile_size + super().__init__(**kwargs) + + @property + def packed_dim(self): + return self._packed_dim + + @property + def packed_factor(self): + return self._packed_factor + + @property + def marlin_tile_size(self): + return self._marlin_tile_size + + def adjust_shard_indexes_for_packing(self, shard_size, shard_offset): + return _adjust_shard_indexes_for_packing( + shard_size=shard_size, + shard_offset=shard_offset, + packed_factor=self.packed_factor, + marlin_tile_size=self.marlin_tile_size, + ) + + +class PackedvLLMParameter(ModelWeightParameter): + """ + Parameter for model weights which are packed on disk. + Example: GPTQ Marlin weights are int4 or int8, packed into int32. + Extends the ModelWeightParameter to take in the + packed factor, the packed dimension, and optionally, marlin + tile size for marlin kernels. Adjusts the shard_size and + shard_offset for fused linear layers model weight loading + by accounting for packing and optionally, marlin tile size. 
+ """ + + def __init__( + self, + packed_factor: Union[int, Fraction], + packed_dim: int, + marlin_tile_size: Optional[int] = None, + **kwargs + ): + self._packed_factor = packed_factor + self._packed_dim = packed_dim + self._marlin_tile_size = marlin_tile_size + super().__init__(**kwargs) + + @property + def packed_dim(self): + return self._packed_dim + + @property + def packed_factor(self): + return self._packed_factor + + @property + def marlin_tile_size(self): + return self._marlin_tile_size + + def adjust_shard_indexes_for_packing(self, shard_size, shard_offset): + return _adjust_shard_indexes_for_packing( + shard_size=shard_size, + shard_offset=shard_offset, + packed_factor=self.packed_factor, + marlin_tile_size=self.marlin_tile_size, + ) + + +def permute_param_layout_( + param: BasevLLMParameter, input_dim: int, output_dim: int, **kwargs +) -> BasevLLMParameter: + """ + Permute a parameter's layout to the specified input and output dimensions, + useful for forcing the parameter into a known layout, for example, if I need + a packed (quantized) weight matrix to be in the layout + {input_dim = 0, output_dim = 1, packed_dim = 0} + then I can call: + permute_param_layout_(x, input_dim=0, output_dim=1, packed_dim=0) + to ensure x is in the correct layout (permuting it to the correct layout if + required, asserting if it cannot get it to the correct layout) + """ + + curr_input_dim = getattr(param, "input_dim", None) + curr_output_dim = getattr(param, "output_dim", None) + + if curr_input_dim is None or curr_output_dim is None: + assert param.data.dim() == 2, ( + "permute_param_layout_ only supports 2D parameters when either " + "input_dim or output_dim is not set" + ) + + # if one of the dimensions is not set, set it to the opposite of the other + # we can only do this since we asserted the parameter is 2D above + if curr_input_dim is None: + assert curr_output_dim is not None, "either input or output dim must be set" + curr_input_dim = (curr_output_dim + 1) % 2 + if curr_output_dim is None: + assert curr_input_dim is not None, "either input or output dim must be set" + curr_output_dim = (curr_input_dim + 1) % 2 + + # create permutation from the current layout to the layout with + # self.input_dim at input_dim and self.output_dim at output_dim preserving + # other dimensions + perm = [ + i for i in range(param.data.dim()) if i not in [curr_input_dim, curr_output_dim] + ] + perm.insert(input_dim, curr_input_dim) + perm.insert(output_dim, curr_output_dim) + + if "packed_dim" in kwargs: + assert ( + hasattr(param, "packed_dim") + and param.packed_dim == perm[kwargs["packed_dim"]] + ), "permute_param_layout_ currently doesn't support repacking" + + param.data = param.data.permute(*perm) + if hasattr(param, "_input_dim"): + param._input_dim = input_dim + if hasattr(param, "_output_dim"): + param._output_dim = output_dim + if "packed_dim" in kwargs and hasattr(param, "_packed_dim"): + param._packed_dim = kwargs["packed_dim"] + + return param + + +def _adjust_shard_indexes_for_marlin(shard_size, shard_offset, marlin_tile_size): + return shard_size * marlin_tile_size, shard_offset * marlin_tile_size + + +def _adjust_shard_indexes_for_packing( + shard_size, shard_offset, packed_factor, marlin_tile_size +): + shard_size = shard_size // packed_factor + shard_offset = shard_offset // packed_factor + if marlin_tile_size is not None: + return _adjust_shard_indexes_for_marlin( + shard_size=shard_size, + shard_offset=shard_offset, + marlin_tile_size=marlin_tile_size, + ) + return shard_size, 
shard_offset diff --git a/python/sglang/srt/layers/quantization/fp8.py b/python/sglang/srt/layers/quantization/fp8.py index a263cb2362a9..f9e4a8a4ff45 100644 --- a/python/sglang/srt/layers/quantization/fp8.py +++ b/python/sglang/srt/layers/quantization/fp8.py @@ -25,9 +25,9 @@ per_tensor_dequantize, requantize_with_max_scale, ) -from vllm.model_executor.parameter import ModelWeightParameter, PerTensorScaleParameter from sglang.srt.layers.linear import LinearMethodBase, UnquantizedLinearMethod +from sglang.srt.layers.parameter import ModelWeightParameter, PerTensorScaleParameter from sglang.srt.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase, diff --git a/python/sglang/srt/layers/vocab_parallel_embedding.py b/python/sglang/srt/layers/vocab_parallel_embedding.py index effea1c6c950..21d973918758 100644 --- a/python/sglang/srt/layers/vocab_parallel_embedding.py +++ b/python/sglang/srt/layers/vocab_parallel_embedding.py @@ -12,8 +12,8 @@ get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce, ) -from vllm.model_executor.parameter import BasevLLMParameter +from sglang.srt.layers.parameter import BasevLLMParameter from sglang.srt.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase, diff --git a/python/sglang/srt/managers/session_controller.py b/python/sglang/srt/managers/session_controller.py index e3e94ce6b655..e9c0c909d52c 100644 --- a/python/sglang/srt/managers/session_controller.py +++ b/python/sglang/srt/managers/session_controller.py @@ -99,7 +99,7 @@ def create_req(self, req: TokenizedGenerateReqInput, tokenizer): if last_req is not None: # trim bos token if it is an append - if req.input_ids[0] == tokenizer.bos_token_id: + if tokenizer is not None and req.input_ids[0] == tokenizer.bos_token_id: req.input_ids = req.input_ids[1:] input_ids = ( diff --git a/python/sglang/srt/model_executor/forward_batch_info.py b/python/sglang/srt/model_executor/forward_batch_info.py index fab8b15a3316..354408ab3433 100644 --- a/python/sglang/srt/model_executor/forward_batch_info.py +++ b/python/sglang/srt/model_executor/forward_batch_info.py @@ -106,6 +106,9 @@ def is_cuda_graph(self): def is_dummy_first(self): return self == ForwardMode.DUMMY_FIRST + def is_decode_or_idle(self): + return self == ForwardMode.DECODE or self == ForwardMode.IDLE + class CaptureHiddenMode(IntEnum): NULL = auto() diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index 7cd9e759a3dc..719db19cd765 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -205,7 +205,7 @@ def init_torch_distributed(self): if self.device == "cuda": backend = "nccl" elif self.device == "xpu": - # TODO(liangan1):Just use gloo to bypass the initilization fail + # TODO(liangan1): Just use gloo to bypass the initilization fail # Need to use xccl for xpu backend in the future backend = "gloo" elif self.device == "hpu": @@ -634,7 +634,6 @@ def init_attention_backend(self): ) def init_double_sparsity_channel_config(self, selected_channel): - selected_channel = "." 
+ selected_channel + "_proj" self.sorted_channels = [] # load channel config diff --git a/python/sglang/srt/models/grok.py b/python/sglang/srt/models/grok.py index 0485b80fc3a2..33a055a8fcb9 100644 --- a/python/sglang/srt/models/grok.py +++ b/python/sglang/srt/models/grok.py @@ -57,6 +57,7 @@ def __init__( quant_config: Optional[QuantizationConfig] = None, prefix: str = "", reduce_results=True, + use_presharded_weights: bool = False, ) -> None: super().__init__() self.gate_up_proj = MergedColumnParallelLinear( @@ -65,6 +66,7 @@ def __init__( bias=False, quant_config=quant_config, prefix=f"{prefix}.gate_up_proj", + use_presharded_weights=use_presharded_weights, ) self.down_proj = RowParallelLinear( intermediate_size, @@ -73,6 +75,7 @@ def __init__( quant_config=quant_config, prefix=f"{prefix}.down_proj", reduce_results=reduce_results, + use_presharded_weights=use_presharded_weights, ) self.act_fn = GeluAndMul(approximate="tanh") @@ -103,6 +106,7 @@ def __init__( quant_config: Optional[QuantizationConfig] = None, tp_size: Optional[int] = None, reduce_results=True, + use_presharded_weights: bool = False, ): super().__init__() self.hidden_size = hidden_size @@ -129,6 +133,7 @@ def __init__( renormalize=False, quant_config=quant_config, tp_size=tp_size, + use_presharded_weights=use_presharded_weights, ) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: @@ -156,6 +161,7 @@ def __init__( max_position: int = 4096 * 32, rope_theta: float = 10000, quant_config: Optional[QuantizationConfig] = None, + reduce_results: bool = True, ) -> None: super().__init__() self.config = config @@ -194,6 +200,7 @@ def __init__( hidden_size, bias=False, quant_config=quant_config, + reduce_results=reduce_results, ) self.rotary_emb = get_rope( self.head_dim, @@ -234,10 +241,12 @@ def __init__( config: PretrainedConfig, layer_id: int = 0, quant_config: Optional[QuantizationConfig] = None, + use_presharded_weights: bool = False, ) -> None: super().__init__() self.num_experts = config.num_local_experts self.hidden_size = config.hidden_size + self.layer_id = layer_id rope_theta = getattr(config, "rope_theta", 10000) self.self_attn = Grok1Attention( @@ -262,6 +271,7 @@ def __init__( ), quant_config=quant_config, reduce_results=True, + use_presharded_weights=use_presharded_weights, ) self.pre_attn_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.post_attn_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) @@ -299,6 +309,7 @@ def __init__( self, config: PretrainedConfig, quant_config: Optional[QuantizationConfig] = None, + use_presharded_weights: bool = False, ) -> None: super().__init__() self.config = config @@ -311,7 +322,12 @@ def __init__( ) self.layers = nn.ModuleList( [ - Grok1DecoderLayer(config, i, quant_config=quant_config) + Grok1DecoderLayer( + config, + i, + quant_config=quant_config, + use_presharded_weights=use_presharded_weights, + ) for i in range(config.num_hidden_layers) ] ) @@ -347,11 +363,7 @@ def __init__( super().__init__() self.config = config self.quant_config = quant_config - self.model = Grok1Model(config, quant_config=quant_config) - self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) - self.logits_processor = LogitsProcessor(config) - # Monkey patch _prepare_weights to load pre-sharded weights if ( self.config.num_local_experts > 0 and get_tensor_model_parallel_world_size() > 1 @@ -361,6 +373,14 @@ def __init__( else: self.use_presharded_weights = False + self.model = Grok1Model( + config, + quant_config=quant_config, + 
use_presharded_weights=self.use_presharded_weights, + ) + self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) + self.logits_processor = LogitsProcessor(config) + def forward( self, input_ids: torch.Tensor, @@ -376,10 +396,7 @@ def forward( def load_weights( self, weights: Iterable[Tuple[str, torch.Tensor]], - use_presharded_weights: Optional[bool] = None, ): - if use_presharded_weights is None: - use_presharded_weights = self.use_presharded_weights num_experts = self.config.num_local_experts stacked_params_mapping = [ @@ -435,20 +452,12 @@ def load_weight_wrapper(name, loaded_weight, *args, **kwargs): continue name = name.replace(weight_name, param_name) - if use_presharded_weights: - extra_kwargs = { - "use_presharded_weights": use_presharded_weights - } - else: - extra_kwargs = {} - load_weight_wrapper( name, loaded_weight, name, shard_id=shard_id, expert_id=expert_id, - **extra_kwargs, ) break else: diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py index f60af5d73153..8fd902818995 100644 --- a/python/sglang/srt/server.py +++ b/python/sglang/srt/server.py @@ -544,7 +544,12 @@ def launch_server( # Send a warmup request t = threading.Thread( - target=_wait_and_warmup, args=(server_args, pipe_finish_writer) + target=_wait_and_warmup, + args=( + server_args, + pipe_finish_writer, + tokenizer_manager.image_token_id, + ), ) t.start() @@ -614,7 +619,7 @@ def sigquit_handler(signum, frame): mp.set_start_method("spawn", force=True) -def _wait_and_warmup(server_args, pipe_finish_writer): +def _wait_and_warmup(server_args, pipe_finish_writer, image_token_text): headers = {} url = server_args.url() if server_args.api_key: diff --git a/python/sglang/srt/speculative/eagle_utils.py b/python/sglang/srt/speculative/eagle_utils.py index 88c88c0724f4..b804e7c6af2e 100644 --- a/python/sglang/srt/speculative/eagle_utils.py +++ b/python/sglang/srt/speculative/eagle_utils.py @@ -14,7 +14,7 @@ from sglang.srt.speculative.spec_info import SpecInfo if TYPE_CHECKING: - from python.sglang.srt.managers.schedule_batch import ScheduleBatch + from sglang.srt.managers.schedule_batch import ScheduleBatch from sglang.srt.server_args import ServerArgs diff --git a/scripts/killall_sglang.sh b/scripts/killall_sglang.sh index 4057d2be2fb4..53d08703e014 100755 --- a/scripts/killall_sglang.sh +++ b/scripts/killall_sglang.sh @@ -7,6 +7,7 @@ nvidia-smi kill -9 $(ps aux | grep 'sglang::' | grep -v 'grep' | awk '{print $2}') 2>/dev/null kill -9 $(ps aux | grep 'sglang.launch_server' | grep -v 'grep' | awk '{print $2}') 2>/dev/null kill -9 $(ps aux | grep 'sglang.bench' | grep -v 'grep' | awk '{print $2}') 2>/dev/null +kill -9 $(ps aux | grep 'sglang.data_parallel' | grep -v 'grep' | awk '{print $2}') 2>/dev/null # Clean all GPU processes if any argument is provided if [ $# -gt 0 ]; then From 977f785dad98540f01bca34abe6c6fd326fd6a7c Mon Sep 17 00:00:00 2001 From: mlmz <54172054+minleminzui@users.noreply.github.com> Date: Wed, 8 Jan 2025 16:02:59 +0800 Subject: [PATCH 004/248] Docs: Rewrite docs for LLama 405B and ModelSpace (#2773) Co-authored-by: Chayenne --- docs/backend/server_arguments.md | 43 -------------------------------- docs/index.rst | 2 ++ docs/references/llama_405B.md | 16 ++++++++++++ docs/references/modelscope.md | 28 +++++++++++++++++++++ 4 files changed, 46 insertions(+), 43 deletions(-) create mode 100644 docs/references/llama_405B.md create mode 100644 docs/references/modelscope.md diff --git a/docs/backend/server_arguments.md b/docs/backend/server_arguments.md index 
a4913b8af6b9..fcee7f88d52a 100644 --- a/docs/backend/server_arguments.md +++ b/docs/backend/server_arguments.md @@ -32,46 +32,3 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 1 ``` -## Use Models From ModelScope -
-<details>
-<summary>More</summary>
-
-To use a model from [ModelScope](https://www.modelscope.cn), set the environment variable SGLANG_USE_MODELSCOPE.
-```
-export SGLANG_USE_MODELSCOPE=true
-```
-Launch [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instruct) Server
-```
-SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --port 30000
-```
-
-Or start it by docker.
-```bash
-docker run --gpus all \
-    -p 30000:30000 \
-    -v ~/.cache/modelscope:/root/.cache/modelscope \
-    --env "SGLANG_USE_MODELSCOPE=true" \
-    --ipc=host \
-    lmsysorg/sglang:latest \
-    python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --host 0.0.0.0 --port 30000
-```
-
-</details>
-
-## Example: Run Llama 3.1 405B
-<details>
-<summary>More</summary>
-
-```bash
-# Run 405B (fp8) on a single node
-python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8
-
-# Run 405B (fp16) on two nodes
-## on the first node, replace the `172.16.4.52:20000` with your own first node ip address and port
-python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0
-
-## on the first node, replace the `172.16.4.52:20000` with your own first node ip address and port
-python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1
-```
-
diff --git a/docs/index.rst b/docs/index.rst index 80a53d1cb3bb..4141161894b4 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -60,3 +60,5 @@ The core features include: references/troubleshooting.md references/faq.md references/learn_more.md + references/llama_405B.md + references/modelscope.md diff --git a/docs/references/llama_405B.md b/docs/references/llama_405B.md new file mode 100644 index 000000000000..3383d1625c86 --- /dev/null +++ b/docs/references/llama_405B.md @@ -0,0 +1,16 @@ +# Example: Run Llama 3.1 405B + +```bash +# Run 405B (fp8) on a single node +python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8 +``` + +```bash +# Run 405B (fp16) on two nodes +## on the first node, replace the `172.16.4.52:20000` with your own first node ip address and port +python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 + +## on the first node, replace the `172.16.4.52:20000` with your own first node ip address and port +python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 +``` + diff --git a/docs/references/modelscope.md b/docs/references/modelscope.md new file mode 100644 index 000000000000..ad7b6151b435 --- /dev/null +++ b/docs/references/modelscope.md @@ -0,0 +1,28 @@ +# Use Models From ModelScope + +To use a model from [ModelScope](https://www.modelscope.cn), set the environment variable `SGLANG_USE_MODELSCOPE`. + +```bash +export SGLANG_USE_MODELSCOPE=true +``` + +We take [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instruct) as an example. Launch the Server: +--- + +```bash +python -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --port 30000 +``` + +Or start it by docker: + +```bash +docker run --gpus all \ + -p 30000:30000 \ + -v ~/.cache/modelscope:/root/.cache/modelscope \ + --env "SGLANG_USE_MODELSCOPE=true" \ + --ipc=host \ + lmsysorg/sglang:latest \ + python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --host 0.0.0.0 --port 30000 +``` + +Note that modelscope uses a different cache directory than huggingface. You may need to set it manually to avoid running out of disk space. 
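To make the cache-directory note above concrete, here is a minimal Python sketch that routes ModelScope downloads to a larger disk before launching the server with the same command shown in the new docs/references/modelscope.md. The `MODELSCOPE_CACHE` variable and the `/data/modelscope` path are assumptions for illustration and may differ in your environment.

```python
import os
import subprocess

# SGLANG_USE_MODELSCOPE is taken from the docs above; MODELSCOPE_CACHE and the
# /data/modelscope path are assumptions about where ModelScope should store
# downloads. Adjust them for your own setup.
env = os.environ.copy()
env["SGLANG_USE_MODELSCOPE"] = "true"
env["MODELSCOPE_CACHE"] = "/data/modelscope"  # hypothetical directory with enough free space

# Same launch command as in docs/references/modelscope.md, run as a subprocess.
subprocess.run(
    [
        "python3",
        "-m",
        "sglang.launch_server",
        "--model-path",
        "qwen/Qwen2-7B-Instruct",
        "--port",
        "30000",
    ],
    env=env,
    check=True,
)
```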
From 2e6346fc2ef9adecda3b71d7415f4d023dc22aff Mon Sep 17 00:00:00 2001 From: Chayenne Date: Wed, 8 Jan 2025 01:07:54 -0800 Subject: [PATCH 005/248] =?UTF-8?q?Docs=EF=BC=9AUpdate=20the=20style=20of?= =?UTF-8?q?=20llma=203.1=20405B=20docs=20(#2789)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/backend/server_arguments.md | 1 - docs/index.rst | 4 ++-- docs/references/llama_405B.md | 17 ++++++++++------- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/docs/backend/server_arguments.md b/docs/backend/server_arguments.md index fcee7f88d52a..90b36a0bdd91 100644 --- a/docs/backend/server_arguments.md +++ b/docs/backend/server_arguments.md @@ -31,4 +31,3 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct # Node 1 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 1 ``` - diff --git a/docs/index.rst b/docs/index.rst index 4141161894b4..ff104808ca99 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -56,9 +56,9 @@ The core features include: references/hyperparameter_tuning.md references/benchmark_and_profiling.md references/custom_chat_template.md + references/llama_405B.md + references/modelscope.md references/contribution_guide.md references/troubleshooting.md references/faq.md references/learn_more.md - references/llama_405B.md - references/modelscope.md diff --git a/docs/references/llama_405B.md b/docs/references/llama_405B.md index 3383d1625c86..4f70e89f6d9a 100644 --- a/docs/references/llama_405B.md +++ b/docs/references/llama_405B.md @@ -1,16 +1,19 @@ -# Example: Run Llama 3.1 405B +# Run Llama 3.1 405B + +## Run 405B (fp8) on a Single Node ```bash -# Run 405B (fp8) on a single node python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8 ``` +## Run 405B (fp16) on Two Nodes + ```bash -# Run 405B (fp16) on two nodes -## on the first node, replace the `172.16.4.52:20000` with your own first node ip address and port +# on the first node, replace 172.16.4.52:20000 with your own node ip address and port + python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 -## on the first node, replace the `172.16.4.52:20000` with your own first node ip address and port -python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 -``` +# on the second node, replace 172.18.45.52:20000 with your own node ip address and port +python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.18.45.52:20000 --nnodes 2 --node-rank 1 +``` From b5fb4ef58a6bbe6c105d533b69e8e8bc2bf4fc3c Mon Sep 17 00:00:00 2001 From: Ke Bao Date: Wed, 8 Jan 2025 18:04:30 +0800 Subject: [PATCH 006/248] Update modelopt config and fix running issue (#2792) --- python/sglang/srt/layers/quantization/__init__.py | 2 +- python/sglang/srt/layers/{ => quantization}/modelopt_quant.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) rename python/sglang/srt/layers/{ => quantization}/modelopt_quant.py (99%) diff --git a/python/sglang/srt/layers/quantization/__init__.py b/python/sglang/srt/layers/quantization/__init__.py index df20a7a4ba47..35b0c4d94edb 100644 --- a/python/sglang/srt/layers/quantization/__init__.py +++ b/python/sglang/srt/layers/quantization/__init__.py @@ -17,12 +17,12 
@@ from vllm.model_executor.layers.quantization.gptq_marlin import GPTQMarlinConfig from vllm.model_executor.layers.quantization.gptq_marlin_24 import GPTQMarlin24Config from vllm.model_executor.layers.quantization.marlin import MarlinConfig -from vllm.model_executor.layers.quantization.modelopt import ModelOptFp8Config from vllm.model_executor.layers.quantization.qqq import QQQConfig from vllm.model_executor.layers.quantization.tpu_int8 import Int8TpuConfig from sglang.srt.layers.quantization.base_config import QuantizationConfig from sglang.srt.layers.quantization.fp8 import Fp8Config +from sglang.srt.layers.quantization.modelopt_quant import ModelOptFp8Config QUANTIZATION_METHODS: Dict[str, Type[QuantizationConfig]] = { "aqlm": AQLMConfig, diff --git a/python/sglang/srt/layers/modelopt_quant.py b/python/sglang/srt/layers/quantization/modelopt_quant.py similarity index 99% rename from python/sglang/srt/layers/modelopt_quant.py rename to python/sglang/srt/layers/quantization/modelopt_quant.py index 2c0887df2391..8ce9d20d1911 100644 --- a/python/sglang/srt/layers/modelopt_quant.py +++ b/python/sglang/srt/layers/quantization/modelopt_quant.py @@ -142,6 +142,7 @@ def create_weights( data=torch.full( (len(output_partition_sizes),), torch.finfo(torch.float32).min, + dtype=torch.float32, ), weight_loader=weight_loader, ), From 656aed58c6622bb03887ce9d2a7f34ba18eaaff3 Mon Sep 17 00:00:00 2001 From: Yunmeng Date: Thu, 9 Jan 2025 17:51:56 +0800 Subject: [PATCH 007/248] Remove vllm dependency in model config (#2809) --- python/sglang/srt/configs/__init__.py | 4 + python/sglang/srt/configs/chatglm.py | 78 ++++++ python/sglang/srt/configs/dbrx.py | 279 +++++++++++++++++++++ python/sglang/srt/hf_transformers_utils.py | 23 +- python/sglang/srt/models/chatglm.py | 2 +- python/sglang/srt/models/dbrx.py | 2 +- 6 files changed, 372 insertions(+), 16 deletions(-) create mode 100644 python/sglang/srt/configs/chatglm.py create mode 100644 python/sglang/srt/configs/dbrx.py diff --git a/python/sglang/srt/configs/__init__.py b/python/sglang/srt/configs/__init__.py index 600b58e49377..3d81c5d4fd50 100644 --- a/python/sglang/srt/configs/__init__.py +++ b/python/sglang/srt/configs/__init__.py @@ -1,3 +1,5 @@ +from sglang.srt.configs.chatglm import ChatGLMConfig +from sglang.srt.configs.dbrx import DbrxConfig from sglang.srt.configs.exaone import ExaoneConfig from sglang.srt.configs.qwen2vl import Qwen2VLConfig, Qwen2VLVisionConfig @@ -5,4 +7,6 @@ "ExaoneConfig", "Qwen2VLConfig", "Qwen2VLVisionConfig", + "ChatGLMConfig", + "DbrxConfig", ] diff --git a/python/sglang/srt/configs/chatglm.py b/python/sglang/srt/configs/chatglm.py new file mode 100644 index 000000000000..9370c218aab8 --- /dev/null +++ b/python/sglang/srt/configs/chatglm.py @@ -0,0 +1,78 @@ +# Adapted from +# https://github.com/THUDM/ChatGLM2-6B +# https://github.com/vllm-project/vllm/blob/main/vllm/transformers_utils/configs/chatglm.py + +# ChatGLM2 and ChatGLM3 share the same config. 
+# ChatGLM4 is officially supported by Huggingface +# transformers >= 4.46.0 is required +# https://huggingface.co/docs/transformers/en/model_doc/glm +from transformers import PretrainedConfig + + +class ChatGLMConfig(PretrainedConfig): + model_type = "chatglm" + attribute_map = { + "num_hidden_layers": "num_layers", + "n_head_kv": "multi_query_group_num", + } + + def __init__( + self, + num_layers=28, + padded_vocab_size=65024, + hidden_size=4096, + ffn_hidden_size=13696, + kv_channels=128, + num_attention_heads=32, + seq_length=2048, + hidden_dropout=0.0, + attention_dropout=0.0, + layernorm_epsilon=1e-5, + rmsnorm=True, + apply_residual_connection_post_layernorm=False, + post_layer_norm=True, + add_bias_linear=False, + add_qkv_bias=False, + interleaved_qkv=False, + bias_dropout_fusion=True, + multi_query_attention=False, + multi_query_group_num=1, + apply_query_key_layer_scaling=True, + attention_softmax_in_fp32=True, + fp32_residual_connection=False, + quantization_bit=0, + pre_seq_len=None, + prefix_projection=False, + **kwargs + ): + self.num_layers = num_layers + self.vocab_size = padded_vocab_size + self.padded_vocab_size = padded_vocab_size + self.hidden_size = hidden_size + self.ffn_hidden_size = ffn_hidden_size + self.kv_channels = kv_channels + self.num_attention_heads = num_attention_heads + self.seq_length = seq_length + # It is to be compatible with long lora. + self.max_position_embeddings = seq_length + self.hidden_dropout = hidden_dropout + self.attention_dropout = attention_dropout + self.layernorm_epsilon = layernorm_epsilon + self.rmsnorm = rmsnorm + self.apply_residual_connection_post_layernorm = ( + apply_residual_connection_post_layernorm + ) + self.post_layer_norm = post_layer_norm + self.add_bias_linear = add_bias_linear + self.add_qkv_bias = add_qkv_bias + self.bias_dropout_fusion = bias_dropout_fusion + self.multi_query_attention = multi_query_attention + self.multi_query_group_num = multi_query_group_num + self.apply_query_key_layer_scaling = apply_query_key_layer_scaling + self.attention_softmax_in_fp32 = attention_softmax_in_fp32 + self.fp32_residual_connection = fp32_residual_connection + self.quantization_bit = quantization_bit + self.pre_seq_len = pre_seq_len + self.prefix_projection = prefix_projection + self.interleaved_qkv = interleaved_qkv + super().__init__(**kwargs) diff --git a/python/sglang/srt/configs/dbrx.py b/python/sglang/srt/configs/dbrx.py new file mode 100644 index 000000000000..75ccbde944ea --- /dev/null +++ b/python/sglang/srt/configs/dbrx.py @@ -0,0 +1,279 @@ +# Adapted from +# https://huggingface.co/databricks/dbrx-base/blob/main/configuration_dbrx.py +# https://github.com/vllm-project/vllm/blob/main/vllm/transformers_utils/configs/dbrx.py +"""Dbrx configuration.""" + +from typing import Any, Optional + +from transformers.configuration_utils import PretrainedConfig +from transformers.utils import logging + +logger = logging.get_logger(__name__) + +DBRX_PRETRAINED_CONFIG_ARCHIVE_MAP = {} # type: ignore + + +class DbrxAttentionConfig(PretrainedConfig): + """Configuration class for Dbrx Attention. + + [`DbrxAttention`] class. It is used to instantiate attention layers + according to the specified arguments, defining the layers architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + attn_pdrop (`float`, *optional*, defaults to 0.0): + The dropout probability for the attention layers. 
+ clip_qkv (`float`, *optional*, defaults to None): + If not `None`, clip the queries, keys, and values in the attention layer to this value. + kv_n_heads (Optional[int]): For grouped_query_attention only, allow user to specify number of kv heads. + rope_theta (float): The base frequency for rope. + """ + + def __init__( + self, + attn_pdrop: float = 0, + clip_qkv: Optional[float] = None, + kv_n_heads: int = 1, + rope_theta: float = 10000.0, + **kwargs: Any, + ): + super().__init__(**kwargs) + self.attn_pdrop = attn_pdrop + self.clip_qkv = clip_qkv + self.kv_n_heads = kv_n_heads + self.rope_theta = rope_theta + + for k in ["model_type"]: + if k in kwargs: + kwargs.pop(k) + if len(kwargs) != 0: + raise ValueError(f"Found unknown {kwargs=}") + + @classmethod + def from_pretrained( + cls, pretrained_model_name_or_path: str, **kwargs: Any + ) -> "PretrainedConfig": + cls._set_token_in_kwargs(kwargs) + + config_dict, kwargs = cls.get_config_dict( + pretrained_model_name_or_path, **kwargs + ) + + if config_dict.get("model_type") == "dbrx": + config_dict = config_dict["attn_config"] + + if ( + "model_type" in config_dict + and hasattr(cls, "model_type") + and config_dict["model_type"] != cls.model_type + ): + logger.warning( + "You are using a model of type %s to instantiate a model of " + "type %s. This is not supported for all configurations of " + "models and can yield errors.", + config_dict["model_type"], + cls.model_type, + ) + + return cls.from_dict(config_dict, **kwargs) + + +class DbrxFFNConfig(PretrainedConfig): + """Configuration class for Dbrx FFN. + + [`DbrxFFN`] class. It is used to instantiate feedforward layers according to + the specified arguments, defining the layers architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + ffn_act_fn (dict, optional): A dict specifying activation function for the FFN. + The dict should have a key 'name' with the value being the name of + the activation function along with any additional keyword arguments. + ffn_hidden_size (int, optional): The hidden size of the feedforward network. + moe_num_experts (int, optional): The number of experts in the mixture of experts layer. + moe_top_k (int, optional): The number of experts to use in the mixture of experts layer. + moe_jitter_eps (float, optional): The jitter epsilon for the mixture of experts layer. + moe_loss_weight (float, optional): The loss weight for the mixture of experts layer. + moe_normalize_expert_weights (float, optional): The normalization factor for the expert weights. + uniform_expert_assignment (bool, optional): Whether to use uniform expert assignment. + This should only be used for benchmarking purposes. 
+ """ + + def __init__( + self, + ffn_act_fn: Optional[dict] = None, + ffn_hidden_size: int = 3584, + moe_num_experts: int = 4, + moe_top_k: int = 1, + moe_jitter_eps: Optional[float] = None, + moe_loss_weight: float = 0.01, + moe_normalize_expert_weights: Optional[float] = 1, + uniform_expert_assignment: bool = False, + **kwargs: Any, + ): + super().__init__() + if ffn_act_fn is None: + ffn_act_fn = {"name": "silu"} + self.ffn_act_fn = ffn_act_fn + self.ffn_hidden_size = ffn_hidden_size + self.moe_num_experts = moe_num_experts + self.moe_top_k = moe_top_k + self.moe_jitter_eps = moe_jitter_eps + self.moe_loss_weight = moe_loss_weight + self.moe_normalize_expert_weights = moe_normalize_expert_weights + self.uniform_expert_assignment = uniform_expert_assignment + + for k in ["model_type"]: + if k in kwargs: + kwargs.pop(k) + if len(kwargs) != 0: + raise ValueError(f"Found unknown {kwargs=}") + + @classmethod + def from_pretrained( + cls, pretrained_model_name_or_path: str, **kwargs: Any + ) -> "PretrainedConfig": + cls._set_token_in_kwargs(kwargs) + + config_dict, kwargs = cls.get_config_dict( + pretrained_model_name_or_path, **kwargs + ) + + if config_dict.get("model_type") == "dbrx": + config_dict = config_dict["ffn_config"] + + if ( + "model_type" in config_dict + and hasattr(cls, "model_type") + and config_dict["model_type"] != cls.model_type + ): + logger.warning( + "You are using a model of type %s to instantiate a model of " + "type %s. This is not supported for all " + "configurations of models and can yield errors.", + config_dict["model_type"], + cls.model_type, + ) + + return cls.from_dict(config_dict, **kwargs) + + +class DbrxConfig(PretrainedConfig): + """Configuration class for Dbrx. + + [`DbrxModel`]. It is used to instantiate a Dbrx model according to the + specified arguments, defining the model architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + d_model (`int`, *optional*, defaults to 6144): + Dimensionality of the embeddings and hidden states. + n_heads (`int`, *optional*, defaults to 48): + Number of attention heads for each attention layer in the Transformer encoder. + n_layers (`int`, *optional*, defaults to 40): + Number of hidden layers in the Transformer encoder. + max_seq_len (`int`, *optional*, defaults to 32768): + The maximum sequence length of the model. + vocab_size (`int`, *optional*, defaults to 100352): + Vocabulary size of the Dbrx model. Defines the maximum number of different tokens that can be represented by + the `inputs_ids` passed when calling [`DbrxModel`]. + resid_pdrop (`float`, *optional*, defaults to 0.0): + The dropout probability applied to the attention output before combining with residual. + emb_pdrop (`float`, *optional*, defaults to 0.0): + The dropout probability for the embedding layer. + attn_config (`dict`, *optional*): + A dictionary used to configure the model's attention module. + ffn_config (`dict`, *optional*): + A dictionary used to configure the model's FFN module. + use_cache (`bool`, *optional*, defaults to `False`): + Whether or not the model should return the last key/values attentions (not used by all models). + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 
+ output_router_logits (`bool`, *optional*, defaults to `False`): + Whether or not the router logits should be returned by the model. Enabling this will also + allow the model to output the auxiliary loss. See [here]() for more details + router_aux_loss_coef (`float`, *optional*, defaults to 0.001): + The aux loss factor for the total loss. + + + Example: + ```python + >>> from transformers import DbrxConfig, DbrxModel + + >>> # Initializing a Dbrx configuration + >>> configuration = DbrxConfig() + + >>> # Initializing a model (with random weights) from the configuration + >>> model = DbrxModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ``` + """ + + model_type = "dbrx" + attribute_map = { + "num_attention_heads": "n_heads", + "hidden_size": "d_model", + "num_hidden_layers": "n_layers", + "max_position_embeddings": "max_seq_len", + } + + def __init__( + self, + d_model: int = 2048, + n_heads: int = 16, + n_layers: int = 24, + max_seq_len: int = 2048, + vocab_size: int = 32000, + resid_pdrop: float = 0.0, + emb_pdrop: float = 0.0, + attn_config: Optional[DbrxAttentionConfig] = None, + ffn_config: Optional[DbrxFFNConfig] = None, + use_cache: bool = True, + initializer_range: float = 0.02, + output_router_logits: bool = False, + router_aux_loss_coef: float = 0.05, + **kwargs: Any, + ): + if attn_config is None: + self.attn_config = DbrxAttentionConfig() + elif isinstance(attn_config, dict): + self.attn_config = DbrxAttentionConfig(**attn_config) + else: + self.attn_config = attn_config + + if ffn_config is None: + self.ffn_config = DbrxFFNConfig() + elif isinstance(ffn_config, dict): + self.ffn_config = DbrxFFNConfig(**ffn_config) + else: + self.ffn_config = ffn_config + + self.d_model = d_model + self.n_heads = n_heads + self.n_layers = n_layers + self.max_seq_len = max_seq_len + self.vocab_size = vocab_size + self.resid_pdrop = resid_pdrop + self.emb_pdrop = emb_pdrop + self.use_cache = use_cache + self.initializer_range = initializer_range + self.output_router_logits = output_router_logits + self.router_aux_loss_coef = router_aux_loss_coef + + tie_word_embeddings = kwargs.pop("tie_word_embeddings", False) + if tie_word_embeddings: + raise ValueError("tie_word_embeddings is not supported for Dbrx models.") + + super().__init__( + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) diff --git a/python/sglang/srt/hf_transformers_utils.py b/python/sglang/srt/hf_transformers_utils.py index 92b01d4524f8..ea39d73f2eea 100644 --- a/python/sglang/srt/hf_transformers_utils.py +++ b/python/sglang/srt/hf_transformers_utils.py @@ -30,20 +30,15 @@ ) from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES -try: - from vllm.transformers_utils.configs import ChatGLMConfig, DbrxConfig - - from sglang.srt.configs import ExaoneConfig, Qwen2VLConfig - - _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = { - ChatGLMConfig.model_type: ChatGLMConfig, - DbrxConfig.model_type: DbrxConfig, - ExaoneConfig.model_type: ExaoneConfig, - Qwen2VLConfig.model_type: Qwen2VLConfig, - } -except ImportError: - # We want this file to run without vllm dependency - _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {} +from sglang.srt.configs import ChatGLMConfig, DbrxConfig, ExaoneConfig, Qwen2VLConfig + +_CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = { + ChatGLMConfig.model_type: ChatGLMConfig, + DbrxConfig.model_type: DbrxConfig, + ExaoneConfig.model_type: ExaoneConfig, + Qwen2VLConfig.model_type: Qwen2VLConfig, +} + for name, 
cls in _CONFIG_REGISTRY.items(): with contextlib.suppress(ValueError): diff --git a/python/sglang/srt/models/chatglm.py b/python/sglang/srt/models/chatglm.py index 9c3bc2ee9e0a..b69a9e11639a 100644 --- a/python/sglang/srt/models/chatglm.py +++ b/python/sglang/srt/models/chatglm.py @@ -23,8 +23,8 @@ from torch.nn import LayerNorm from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.transformers_utils.configs import ChatGLMConfig +from sglang.srt.configs import ChatGLMConfig from sglang.srt.layers.activation import SiluAndMul from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.linear import ( diff --git a/python/sglang/srt/models/dbrx.py b/python/sglang/srt/models/dbrx.py index 852f58a710d6..f838cfa575bb 100644 --- a/python/sglang/srt/models/dbrx.py +++ b/python/sglang/srt/models/dbrx.py @@ -25,8 +25,8 @@ tensor_model_parallel_all_reduce, ) from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.transformers_utils.configs.dbrx import DbrxConfig +from sglang.srt.configs import DbrxConfig from sglang.srt.layers.linear import ( QKVParallelLinear, ReplicatedLinear, From 679c3bcacfd19eb852e8dbf42ad6b756eec56df4 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Thu, 9 Jan 2025 03:03:24 -0800 Subject: [PATCH 008/248] Fix typo in cuda_graph_bs (#2813) --- python/sglang/srt/model_executor/cuda_graph_runner.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/python/sglang/srt/model_executor/cuda_graph_runner.py b/python/sglang/srt/model_executor/cuda_graph_runner.py index deaea33129d1..e4580b5e2ba8 100644 --- a/python/sglang/srt/model_executor/cuda_graph_runner.py +++ b/python/sglang/srt/model_executor/cuda_graph_runner.py @@ -131,11 +131,6 @@ def __init__(self, model_runner: "ModelRunner"): else: self.capture_bs = [1, 2, 4] + [i * 8 for i in range(1, 21)] - if model_runner.server_args.disable_cuda_graph_padding: - self.capture_bs = list(range(1, 33)) + [64, 128] - else: - self.capture_bs = [1, 2, 4] + [i * 8 for i in range(1, 21)] - if max(self.capture_bs) > model_runner.req_to_token_pool.size: # In some case (e.g., with a small GPU or --max-running-requests), the #max-running-requests # is very samll. We add more values here to make sure we capture the maximum bs. 
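To make the batch-size selection that the patch above de-duplicates easier to follow, here is a small, self-contained Python sketch. The two candidate lists are copied from the diff context; the `capture_batch_sizes` helper name and the final clamp to `req_pool_size` are illustrative assumptions rather than the exact upstream behavior, which instead extends the list so the maximum batch size is still captured.

```python
# Sketch of the CUDA-graph capture batch sizes, mirroring the lists kept in
# cuda_graph_runner.py after the duplicate assignment was removed.
def capture_batch_sizes(disable_cuda_graph_padding: bool, req_pool_size: int) -> list:
    if disable_cuda_graph_padding:
        capture_bs = list(range(1, 33)) + [64, 128]
    else:
        capture_bs = [1, 2, 4] + [i * 8 for i in range(1, 21)]
    # Illustrative simplification: drop sizes the request-to-token pool cannot hold.
    return [bs for bs in capture_bs if bs <= req_pool_size]


if __name__ == "__main__":
    print(capture_batch_sizes(False, req_pool_size=48))  # [1, 2, 4, 8, 16, 24, 32, 40, 48]
    print(capture_batch_sizes(True, req_pool_size=48))   # 1 through 32; 64 and 128 dropped
```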
From 4f077c01b8cca17993df1c2c77285dce176742c3 Mon Sep 17 00:00:00 2001 From: sleepcoo <118525388+sleepcoo@users.noreply.github.com> Date: Thu, 9 Jan 2025 22:24:42 +0800 Subject: [PATCH 009/248] minor: support specifying local dataset path for gsm8k and hellaswag (#2816) --- benchmark/gsm8k/bench_sglang.py | 7 +++++-- benchmark/hellaswag/bench_sglang.py | 7 +++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/benchmark/gsm8k/bench_sglang.py b/benchmark/gsm8k/bench_sglang.py index 9fe9b79baaf8..f01734f0afb0 100644 --- a/benchmark/gsm8k/bench_sglang.py +++ b/benchmark/gsm8k/bench_sglang.py @@ -1,6 +1,7 @@ import argparse import ast import json +import os import re import time @@ -46,9 +47,11 @@ def main(args): set_default_backend(select_sglang_backend(args)) # Read data + data_path = args.data_path url = "https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl" - filename = download_and_cache_file(url) - lines = list(read_jsonl(filename)) + if not os.path.isfile(data_path): + data_path = download_and_cache_file(url) + lines = list(read_jsonl(data_path)) # Construct prompts num_questions = args.num_questions diff --git a/benchmark/hellaswag/bench_sglang.py b/benchmark/hellaswag/bench_sglang.py index f09d7256da93..798521f9766d 100644 --- a/benchmark/hellaswag/bench_sglang.py +++ b/benchmark/hellaswag/bench_sglang.py @@ -1,5 +1,6 @@ import argparse import json +import os import time import numpy as np @@ -31,9 +32,11 @@ def main(args): set_default_backend(select_sglang_backend(args)) # Read data + data_path = args.data_path url = "https://raw.githubusercontent.com/rowanz/hellaswag/master/data/hellaswag_val.jsonl" - filename = download_and_cache_file(url) - lines = list(read_jsonl(filename)) + if not os.path.isfile(data_path): + data_path = download_and_cache_file(url) + lines = list(read_jsonl(data_path)) # Construct prompts num_questions = args.num_questions From 11fffbc95a919a2446ae10fc33753d9951374fdf Mon Sep 17 00:00:00 2001 From: Xiaotong Jiang Date: Thu, 9 Jan 2025 13:43:12 -0800 Subject: [PATCH 010/248] [Doc]: Deepseek reference docs (#2787) --- docs/index.rst | 1 + docs/references/deepseek.md | 34 ++++++++++++++++++++++++++++++++++ docs/references/modelscope.md | 4 ++-- 3 files changed, 37 insertions(+), 2 deletions(-) create mode 100644 docs/references/deepseek.md diff --git a/docs/index.rst b/docs/index.rst index ff104808ca99..6ed313a3bd17 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -62,3 +62,4 @@ The core features include: references/troubleshooting.md references/faq.md references/learn_more.md + references/deepseek.md diff --git a/docs/references/deepseek.md b/docs/references/deepseek.md new file mode 100644 index 000000000000..6cf155f46211 --- /dev/null +++ b/docs/references/deepseek.md @@ -0,0 +1,34 @@ +# DeepSeek Model Optimizations in SGLang + +SGLang provides several optimizations specifically designed for the DeepSeek model to boost its inference speed. This document outlines current optimizations for DeepSeek. Additionally, the SGLang team is actively developing enhancements for [DeepSeek-V3](https://github.com/sgl-project/sglang/issues/2591). + + +## Multi-head Latent Attention (MLA) Throughput Optimizations + +**Description**: [MLA](https://arxiv.org/pdf/2405.04434) is an innovative attention mechanism introduced by the DeepSeek team, aimed at improving inference efficiency. 
SGLang has implemented specific optimizations for this, including: + +- **Weight Absorption**: By applying the associative law of matrix multiplication to reorder computation steps, this method balances computation and memory access and improves efficiency in the decoding phase. +- **Triton Decoding Kernel Optimization**: In the MLA decoding kernel, there is only one KV head. This optimization reduces memory access to the KV cache by processing multiple query heads within one block, accelerating the decoding process. +- **FP8 Quantization**: W8A8 FP8 and KV Cache FP8 quantization enables efficient FP8 inference. Additionally, we have implemented Batched Matrix Multiplication (BMM) operator to facilitate FP8 inference in MLA with weight absorption. +- **CUDA Graph & Torch.compile**: Both MLA and Mixture of Experts (MoE) are compatible with CUDA Graph and Torch.compile, which reduces latency and accelerates decoding speed for small batch sizes. + +Overall, with these optimizations, we have achieved up to a 7x acceleration in output throughput compared to the previous version. +![Data Parallelism Attention for DeepSeek Series Models](https://lmsys.org/images/blog/sglang_v0_3/deepseek_mla.svg) + +**Usage**: MLA optimization is enabled by defalut, to disable, use `--disable-mla`. + +**Reference**: Check [Blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/#deepseek-multi-head-latent-attention-mla-throughput-optimizations) and [Slides](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/lmsys_1st_meetup_deepseek_mla.pdf) for more details. + +## Data Parallelism Attention + +**Description**: This optimization involves data parallelism (DP) for the MLA attention mechanism of DeepSeek Series Models, which allows for a significant reduction in the KV cache size, enabling larger batch sizes. Each DP worker independently handles different types of batches (prefill, decode, idle), which are then synchronized before and after processing through the Mixture-of-Experts (MoE) layer. +![Data Parallelism Attention for DeepSeek Series Models](https://lmsys.org/images/blog/sglang_v0_4/dp_attention.svg). + +**Usage**: This optimization is aimed at improving throughput and should be used for scenarios with high QPS (Queries Per Second). Data Parallelism Attention optimization can be enabeld by `--enable-dp-attention` for DeepSeek Series Models. + +**Reference**: Check [Blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/#data-parallelism-attention-for-deepseek-models). + +## Multi Node Tensor Parallelism +**Description**: For users with limited memory on a single node, SGLang supports serving DeepSeek Series Models, including DeepSeek V3, across multiple nodes using tensor parallelism. This approach partitions the model parameters across multiple GPUs or nodes to handle models that are too large for one node's memory. + +**Usage**: Check [here](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3#example-serving-with-2-h208) for usage examples. diff --git a/docs/references/modelscope.md b/docs/references/modelscope.md index ad7b6151b435..4740c2770f9e 100644 --- a/docs/references/modelscope.md +++ b/docs/references/modelscope.md @@ -6,9 +6,9 @@ To use a model from [ModelScope](https://www.modelscope.cn), set the environment export SGLANG_USE_MODELSCOPE=true ``` -We take [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instruct) as an example. 
Launch the Server: ---- +We take [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instruct) as an example. +Launch the Server: ```bash python -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --port 30000 ``` From 5cc1170552bfe1f32d070e802331d1b4b7f699cf Mon Sep 17 00:00:00 2001 From: Chayenne Date: Fri, 10 Jan 2025 00:26:59 -0800 Subject: [PATCH 011/248] Doc: add block-wise FP8 in dpsk model reference (#2830) --- docs/references/deepseek.md | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/docs/references/deepseek.md b/docs/references/deepseek.md index 6cf155f46211..5a95fd9a9418 100644 --- a/docs/references/deepseek.md +++ b/docs/references/deepseek.md @@ -9,11 +9,14 @@ SGLang provides several optimizations specifically designed for the DeepSeek mod - **Weight Absorption**: By applying the associative law of matrix multiplication to reorder computation steps, this method balances computation and memory access and improves efficiency in the decoding phase. - **Triton Decoding Kernel Optimization**: In the MLA decoding kernel, there is only one KV head. This optimization reduces memory access to the KV cache by processing multiple query heads within one block, accelerating the decoding process. + - **FP8 Quantization**: W8A8 FP8 and KV Cache FP8 quantization enables efficient FP8 inference. Additionally, we have implemented Batched Matrix Multiplication (BMM) operator to facilitate FP8 inference in MLA with weight absorption. + - **CUDA Graph & Torch.compile**: Both MLA and Mixture of Experts (MoE) are compatible with CUDA Graph and Torch.compile, which reduces latency and accelerates decoding speed for small batch sizes. Overall, with these optimizations, we have achieved up to a 7x acceleration in output throughput compared to the previous version. -![Data Parallelism Attention for DeepSeek Series Models](https://lmsys.org/images/blog/sglang_v0_3/deepseek_mla.svg) + +![Multi-head Latent Attention for DeepSeek Series Models](https://lmsys.org/images/blog/sglang_v0_3/deepseek_mla.svg) **Usage**: MLA optimization is enabled by defalut, to disable, use `--disable-mla`. @@ -22,6 +25,7 @@ Overall, with these optimizations, we have achieved up to a 7x acceleration in o ## Data Parallelism Attention **Description**: This optimization involves data parallelism (DP) for the MLA attention mechanism of DeepSeek Series Models, which allows for a significant reduction in the KV cache size, enabling larger batch sizes. Each DP worker independently handles different types of batches (prefill, decode, idle), which are then synchronized before and after processing through the Mixture-of-Experts (MoE) layer. + ![Data Parallelism Attention for DeepSeek Series Models](https://lmsys.org/images/blog/sglang_v0_4/dp_attention.svg). **Usage**: This optimization is aimed at improving throughput and should be used for scenarios with high QPS (Queries Per Second). Data Parallelism Attention optimization can be enabeld by `--enable-dp-attention` for DeepSeek Series Models. @@ -29,6 +33,16 @@ Overall, with these optimizations, we have achieved up to a 7x acceleration in o **Reference**: Check [Blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/#data-parallelism-attention-for-deepseek-models). ## Multi Node Tensor Parallelism + **Description**: For users with limited memory on a single node, SGLang supports serving DeepSeek Series Models, including DeepSeek V3, across multiple nodes using tensor parallelism. 
This approach partitions the model parameters across multiple GPUs or nodes to handle models that are too large for one node's memory. **Usage**: Check [here](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3#example-serving-with-2-h208) for usage examples. + +## Block-wise FP8 + +**Description**: SGLang implements block-wise FP8 quantization with two key optimizations: + +- **Activation**: E4M3 format using per-token-per-128-channel sub-vector scales with online casting. +- **Weight**: Per-128x128-block quantization for better numerical stability. + +**Usage**: turn on by default for DeepSeek V3 models. From 2db03a04ca39dd85a5e419a28803bd483528fcc1 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Fri, 10 Jan 2025 03:49:04 -0800 Subject: [PATCH 012/248] Update README.md (#2833) Co-authored-by: Heiner --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index 97ad1e935c68..024fa2761270 100644 --- a/README.md +++ b/README.md @@ -61,5 +61,4 @@ Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s The project is supported by (alphabetically): AMD, Baseten, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, LMSYS.org, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, 01.AI. ## Acknowledgment and Citation -We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql). -Please cite the paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful. +We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql). Please cite the paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful. From 8f157893141ea24ebb581c9e48c27a8eeb9b81fb Mon Sep 17 00:00:00 2001 From: Pratyush Patel Date: Fri, 10 Jan 2025 07:30:44 -0800 Subject: [PATCH 013/248] Add more metrics to serving benchmark. 
(#2819) --- python/sglang/bench_serving.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/python/sglang/bench_serving.py b/python/sglang/bench_serving.py index 4744ad3386ba..941507705e36 100644 --- a/python/sglang/bench_serving.py +++ b/python/sglang/bench_serving.py @@ -514,6 +514,8 @@ class BenchmarkMetrics: p99_itl_ms: float mean_e2e_latency_ms: float median_e2e_latency_ms: float + std_e2e_latency_ms: float + p99_e2e_latency_ms: float SHAREGPT_URL = "https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json" @@ -873,6 +875,8 @@ def calculate_metrics( p99_itl_ms=np.percentile(itls or 0, 99) * 1000, mean_e2e_latency_ms=np.mean(e2e_latencies) * 1000, median_e2e_latency_ms=np.median(e2e_latencies) * 1000, + std_e2e_latency_ms=np.std(e2e_latencies) * 1000, + p99_e2e_latency_ms=np.percentile(e2e_latencies, 99) * 1000, ) return metrics, output_lens @@ -1064,10 +1068,20 @@ async def limited_request_func(request_func_input, pbar): "total_output_tokens_retokenized": metrics.total_output_retokenized, "mean_e2e_latency_ms": metrics.mean_e2e_latency_ms, "median_e2e_latency_ms": metrics.median_e2e_latency_ms, + "std_e2e_latency_ms": metrics.std_e2e_latency_ms, + "p99_e2e_latency_ms": metrics.p99_e2e_latency_ms, "mean_ttft_ms": metrics.mean_ttft_ms, "median_ttft_ms": metrics.median_ttft_ms, + "std_ttft_ms": metrics.std_ttft_ms, + "p99_ttft_ms": metrics.p99_ttft_ms, + "mean_tpot_ms": metrics.mean_tpot_ms, + "median_tpot_ms": metrics.median_tpot_ms, + "std_tpot_ms": metrics.std_tpot_ms, + "p99_tpot_ms": metrics.p99_tpot_ms, "mean_itl_ms": metrics.mean_itl_ms, "median_itl_ms": metrics.median_itl_ms, + "std_itl_ms": metrics.std_itl_ms, + "p99_itl_ms": metrics.p99_itl_ms, "input_throughput": metrics.input_throughput, "output_throughput": metrics.output_throughput, "sharegpt_output_len": args.sharegpt_output_len, From f290bd4332ce4ff4be97d59e82daa013f99c66ca Mon Sep 17 00:00:00 2001 From: Chang Su Date: Fri, 10 Jan 2025 13:14:51 -0800 Subject: [PATCH 014/248] [Bugfix] Fix embedding model hangs with `--enable-metrics` (#2822) --- python/sglang/srt/configs/model_config.py | 2 +- .../sglang/srt/managers/tokenizer_manager.py | 8 +++- .../sglang/srt/model_executor/model_runner.py | 2 +- test/srt/test_openai_server.py | 41 +++++++++++++++++++ 4 files changed, 49 insertions(+), 4 deletions(-) diff --git a/python/sglang/srt/configs/model_config.py b/python/sglang/srt/configs/model_config.py index a2f9b82844e8..072c88b04a78 100644 --- a/python/sglang/srt/configs/model_config.py +++ b/python/sglang/srt/configs/model_config.py @@ -128,7 +128,7 @@ def __init__( self.num_hidden_layers = self.hf_text_config.num_hidden_layers self.vocab_size = self.hf_text_config.vocab_size - # Veirfy quantization + # Verify quantization self._verify_quantization() # Cache attributes diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py index 08dbd02c5ba3..00ef8458ab82 100644 --- a/python/sglang/srt/managers/tokenizer_manager.py +++ b/python/sglang/srt/managers/tokenizer_manager.py @@ -688,7 +688,7 @@ async def handle_loop(self): if self.enable_metrics: completion_tokens = ( recv_obj.completion_tokens[i] - if recv_obj.completion_tokens + if getattr(recv_obj, "completion_tokens", None) else 0 ) @@ -716,7 +716,11 @@ async def handle_loop(self): time.time() - state.created_time ) # Compute time_per_output_token for the non-streaming case - if not state.obj.stream and completion_tokens >= 1: + if ( 
+ hasattr(state.obj, "stream") + and not state.obj.stream + and completion_tokens >= 1 + ): self.metrics_collector.observe_time_per_output_token( (time.time() - state.created_time) / completion_tokens diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index 719db19cd765..efba8c25b504 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -724,7 +724,7 @@ def forward(self, forward_batch: ForwardBatch) -> LogitsProcessorOutput: elif forward_batch.forward_mode.is_idle(): return self.forward_idle(forward_batch) else: - raise ValueError(f"Invaid forward mode: {forward_batch.forward_mode}") + raise ValueError(f"Invalid forward mode: {forward_batch.forward_mode}") def sample( self, logits_output: LogitsProcessorOutput, forward_batch: ForwardBatch diff --git a/test/srt/test_openai_server.py b/test/srt/test_openai_server.py index 379e57f356e9..4bedf7439663 100644 --- a/test/srt/test_openai_server.py +++ b/test/srt/test_openai_server.py @@ -14,6 +14,7 @@ from sglang.srt.hf_transformers_utils import get_tokenizer from sglang.srt.utils import kill_process_tree from sglang.test.test_utils import ( + DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST, DEFAULT_SMALL_MODEL_NAME_FOR_TEST, DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, DEFAULT_URL_FOR_TEST, @@ -675,5 +676,45 @@ def test_function_calling_format(self): ), "Function name should be add for the above response" +class TestOpenAIEmbedding(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.model = DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST + cls.base_url = DEFAULT_URL_FOR_TEST + cls.api_key = "sk-123456" + + # Configure embedding-specific args + other_args = ["--is-embedding", "--enable-metrics"] + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + api_key=cls.api_key, + other_args=other_args, + ) + cls.base_url += "/v1" + + @classmethod + def tearDownClass(cls): + kill_process_tree(cls.process.pid) + + def test_embedding_single(self): + """Test single embedding request""" + client = openai.Client(api_key=self.api_key, base_url=self.base_url) + response = client.embeddings.create(model=self.model, input="Hello world") + self.assertEqual(len(response.data), 1) + self.assertTrue(len(response.data[0].embedding) > 0) + + def test_embedding_batch(self): + """Test batch embedding request""" + client = openai.Client(api_key=self.api_key, base_url=self.base_url) + response = client.embeddings.create( + model=self.model, input=["Hello world", "Test text"] + ) + self.assertEqual(len(response.data), 2) + self.assertTrue(len(response.data[0].embedding) > 0) + self.assertTrue(len(response.data[1].embedding) > 0) + + if __name__ == "__main__": unittest.main() From 5413ec2bbe42de54d244e35c65bd7929b458fd22 Mon Sep 17 00:00:00 2001 From: Muqi Li <642733045@qq.com> Date: Sat, 11 Jan 2025 05:37:00 +0800 Subject: [PATCH 015/248] [Bugfix] Fix bug in fork logic caused by null text_ (#2835) --- python/sglang/lang/interpreter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/sglang/lang/interpreter.py b/python/sglang/lang/interpreter.py index 6d1ca71adab1..4c294781c20e 100644 --- a/python/sglang/lang/interpreter.py +++ b/python/sglang/lang/interpreter.py @@ -347,7 +347,7 @@ def fork( size: int = 1, position_ids_offset: Optional[List[int]] = None, ): - if size > 1: + if size > 1 and str(self.text_): self.submit(SglCommitLazy()) self.sync() From 
b170646991a06cb18b1bd4e74efcd095f5b00c18 Mon Sep 17 00:00:00 2001 From: TianYu GUO Date: Sat, 11 Jan 2025 05:44:32 +0800 Subject: [PATCH 016/248] Fix port number overflow (#2826) --- python/sglang/srt/server_args.py | 5 ++++- python/sglang/srt/utils.py | 2 ++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index ef4df60a5763..09d1a3edebc4 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -928,7 +928,10 @@ def init_new(server_args) -> "PortArgs": while True: if is_port_available(port): break - port += 42 + if port < 60000: + port += 42 + else: + port -= 43 return PortArgs( tokenizer_ipc_name=tempfile.NamedTemporaryFile(delete=False).name, diff --git a/python/sglang/srt/utils.py b/python/sglang/srt/utils.py index 44a5e41a41bd..b07f6f01d184 100644 --- a/python/sglang/srt/utils.py +++ b/python/sglang/srt/utils.py @@ -335,6 +335,8 @@ def is_port_available(port): return True except socket.error: return False + except OverflowError: + return False def decode_video_base64(video_base64): From a47bf39123c4f5bffcf96a80640f234e3f637c4c Mon Sep 17 00:00:00 2001 From: justdoit <24875266+coolhok@users.noreply.github.com> Date: Sat, 11 Jan 2025 06:00:43 +0800 Subject: [PATCH 017/248] [Eagle2] Fix multiple concurrent request crashes (#2730) --- python/sglang/srt/speculative/eagle_utils.py | 17 ++- python/sglang/srt/speculative/eagle_worker.py | 2 + test/srt/test_eagle_infer.py | 119 ++++++++++++++++++ 3 files changed, 134 insertions(+), 4 deletions(-) diff --git a/python/sglang/srt/speculative/eagle_utils.py b/python/sglang/srt/speculative/eagle_utils.py index b804e7c6af2e..1a324000cb28 100644 --- a/python/sglang/srt/speculative/eagle_utils.py +++ b/python/sglang/srt/speculative/eagle_utils.py @@ -245,9 +245,10 @@ def prepare_for_decode(self, batch: ScheduleBatch): ) # (b, topk) topk_cs_index, topk_cs_p = topk_cs.indices, topk_cs.values - selected_input_index = ( - topk_cs_index.flatten() // self.topk - ) # shape: (b * topk) + selected_input_index = topk_cs_index.flatten() // self.topk + torch.arange( + 0, batch.batch_size() * self.topk, step=self.topk, device="cuda" + ).repeat_interleave(self.topk) + batch.spec_info.hidden_states = batch.spec_info.hidden_states[ selected_input_index, : ] @@ -336,6 +337,7 @@ def prepare_extend_after_decode(self, batch: ScheduleBatch): triton.next_power_of_2(self.spec_steps + 1), ) + batch.seq_lens_sum = sum(batch.seq_lens) batch.input_ids = self.verified_id self.verified_id = new_verified_id @@ -439,7 +441,14 @@ def generate_attn_arg_prefill( return kv_indices, cum_kv_seq_len, qo_indptr, None def merge_batch(self, spec_info: EAGLEDraftInput): - + if self.hidden_states is None: + self.hidden_states = spec_info.hidden_states + self.verified_id = spec_info.verified_id + self.sample_output = spec_info.sample_output + self.prev_mode = spec_info.prev_mode + return + if spec_info.hidden_states is None: + return self.hidden_states = torch.cat( [self.hidden_states, spec_info.hidden_states], axis=0 ) diff --git a/python/sglang/srt/speculative/eagle_worker.py b/python/sglang/srt/speculative/eagle_worker.py index 16d54c43bafb..0e53506a8840 100644 --- a/python/sglang/srt/speculative/eagle_worker.py +++ b/python/sglang/srt/speculative/eagle_worker.py @@ -169,6 +169,8 @@ def finish_request(self, reqs: Union[Req, List[Req]]): if not isinstance(reqs, List): reqs = [reqs] for req in reqs: + if req.rid not in self.finish_extend_len: + continue req_len = ( 
len(req.origin_input_ids) + len(req.output_ids) diff --git a/test/srt/test_eagle_infer.py b/test/srt/test_eagle_infer.py index 94ebc79ca743..92127b8ef591 100644 --- a/test/srt/test_eagle_infer.py +++ b/test/srt/test_eagle_infer.py @@ -1,8 +1,18 @@ +import multiprocessing +import random +import time import unittest +import requests from transformers import AutoConfig, AutoTokenizer import sglang as sgl +from sglang.srt.utils import kill_process_tree +from sglang.test.test_utils import ( + DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + DEFAULT_URL_FOR_TEST, + popen_launch_server, +) class TestEAGLEEngine(unittest.TestCase): @@ -64,5 +74,114 @@ def test_eagle_end_check(self): assert tokenizer.eos_token_id not in tokens +prompts = [ + "[INST] <>\\nYou are a helpful assistant.\\n<>\\nToday is a sunny day and I like[/INST]" + '[INST] <>\\nYou are a helpful assistant.\\n<>\\nWhat are the mental triggers in Jeff Walker\'s Product Launch Formula and "Launch" book?[/INST]', + "[INST] <>\\nYou are a helpful assistant.\\n<>\\nSummarize Russell Brunson's Perfect Webinar Script...[/INST]", + "[INST] <>\\nYou are a helpful assistant.\\n<>\\nwho are you?[/INST]", + "[INST] <>\\nYou are a helpful assistant.\\n<>\\nwhere are you from?[/INST]", +] + + +def process(server_url: str): + time.sleep(random.uniform(0, 2)) + for prompt in prompts: + url = server_url + data = { + "model": "base", + "text": prompt, + "sampling_params": { + "temperature": 0, + "max_new_tokens": 1024, + }, + } + response = requests.post(url, json=data) + assert response.status_code == 200 + + +def abort_process(server_url: str): + for prompt in prompts: + try: + time.sleep(1) + url = server_url + data = { + "model": "base", + "text": prompt, + "sampling_params": { + "temperature": 0, + "max_new_tokens": 1024, + }, + } + # set timeout = 1s,mock disconnected + requests.post(url, json=data, timeout=1) + except: + pass + + +class TestEAGLELaunchServer(unittest.TestCase): + @classmethod + def setUpClass(cls): + speculative_draft_model_path = "lmzheng/sglang-EAGLE-llama2-chat-7B" + cls.model = "meta-llama/Llama-2-7b-chat-hf" + cls.base_url = DEFAULT_URL_FOR_TEST + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=[ + "--speculative-algorithm", + "EAGLE", + "--speculative-draft-model-path", + speculative_draft_model_path, + "--speculative-num-steps", + "3", + "--speculative-eagle-topk", + "4", + "--speculative-num-draft-tokens", + "16", + "--served-model-name", + "base", + ], + ) + + @classmethod + def tearDownClass(cls): + kill_process_tree(cls.process.pid) + + def test_eagle_server_concurrency(self): + concurrency = 4 + processes = [ + multiprocessing.Process( + target=process, + kwargs={"server_url": self.base_url + "/generate"}, + ) + for _ in range(concurrency) + ] + for worker in processes: + worker.start() + for p in processes: + p.join() + + def test_eagle_server_request_abort(self): + concurrency = 4 + processes = [ + multiprocessing.Process( + target=process, + kwargs={"server_url": self.base_url + "/generate"}, + ) + for _ in range(concurrency) + ] + [ + multiprocessing.Process( + target=abort_process, + kwargs={"server_url": self.base_url + "/generate"}, + ) + for _ in range(concurrency) + ] + for worker in processes: + worker.start() + for p in processes: + p.join() + + if __name__ == "__main__": unittest.main() From 5d6e9467d4624a66ca64b0714042cb032df72695 Mon Sep 17 00:00:00 2001 From: Zhiqiang Xie Date: Fri, 10 Jan 2025 20:22:01 -0800 Subject: [PATCH 018/248] Cache 
controller for hierarchical caching (#2804) --- .../sglang/srt/managers/cache_controller.py | 307 ++++++++++++++++++ 1 file changed, 307 insertions(+) create mode 100644 python/sglang/srt/managers/cache_controller.py diff --git a/python/sglang/srt/managers/cache_controller.py b/python/sglang/srt/managers/cache_controller.py new file mode 100644 index 000000000000..4560a270870f --- /dev/null +++ b/python/sglang/srt/managers/cache_controller.py @@ -0,0 +1,307 @@ +from __future__ import annotations + +""" +Copyright 2023-2025 SGLang Team +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import logging +import threading +from queue import PriorityQueue, Queue +from typing import Optional + +import torch + +from sglang.srt.mem_cache.memory_pool import MHATokenToKVPool, MLATokenToKVPoolHost + +logger = logging.getLogger(__name__) + + +class CacheOperation: + + counter = 0 + + def __init__( + self, + host_indices: torch.Tensor, + device_indices: torch.Tensor, + node_id: int, + priority: Optional[int] = None, + ): + self.host_indices = host_indices + self.device_indices = device_indices + self.node_ids = [node_id] + self.data = None + + self.id = CacheOperation.counter + CacheOperation.counter += 1 + # default priority is the order of creation + self.priority = priority if priority is not None else self.id + + def merge(self, other: "CacheOperation") -> None: + # multiple operations can be merged into a single operation for batch processing + self.host_indices = torch.cat([self.host_indices, other.host_indices]) + self.device_indices = torch.cat([self.device_indices, other.device_indices]) + self.priority = min(self.priority, other.priority) + self.node_ids.extend(other.node_ids) + + def __lt__(self, other: "CacheOperation"): + return self.priority < other.priority + + +class TransferBuffer: + """ + Overlapping buffer preparation and transfer operations to improve throughput. 
+ """ + + def __init__(self, buffer_count: int = 3, max_buffer_size: int = 1000) -> None: + self.buffers = Queue(maxsize=buffer_count) + # todo: adjust the buffer size based on throughput profile of the system + self.max_buffer_size = max_buffer_size + + def full(self) -> bool: + return self.buffers.full() + + def empty(self) -> bool: + return self.buffers.empty() + + def put(self, item, block=True) -> None: + self.buffers.put(item, block=block) + + def get(self, block=True) -> Optional[CacheOperation]: + try: + return self.buffers.get(block=block) + except Exception as e: + logger.error(e) + + +class HiCacheController: + + def __init__( + self, + mem_pool_device: MHATokenToKVPool, + mem_pool_host: MLATokenToKVPoolHost, + write_policy: str = "write_through_selective", + ): + + self.mem_pool_device = mem_pool_device + self.mem_pool_host = mem_pool_host + self.write_policy = write_policy + + if write_policy not in [ + "write_through", + "write_through_selective", + "write_back", + ]: + raise ValueError(f"Invalid write policy: {write_policy}") + + self.write_queue = PriorityQueue() + self.load_queue = PriorityQueue() + + self.ack_write_queue = Queue() + self.ack_load_queue = Queue() + + self.write_buffer = TransferBuffer() + self.load_buffer = TransferBuffer() + + self.write_stream = torch.cuda.Stream() + self.load_stream = torch.cuda.Stream() + + self.write_thread = threading.Thread( + target=self.write_thread_func_buffer, daemon=True + ) + self.load_thread = threading.Thread( + target=self.load_thread_func_buffer, daemon=True + ) + self.write_thread.start() + self.load_thread.start() + + def write( + self, + device_indices: torch.Tensor, + priority: Optional[int] = None, + node_id: int = 0, + ) -> Optional[torch.Tensor]: + """ + Back up KV caches from device memory to host memory. + """ + host_indices = self.mem_pool_host.alloc(len(device_indices)) + if host_indices is None: + return None + self.write_queue.put( + CacheOperation(host_indices, device_indices, node_id, priority) + ) + self.mem_pool_host.protect_write(host_indices) + return host_indices + + def load( + self, + host_indices: torch.Tensor, + priority: Optional[int] = None, + node_id: int = 0, + ) -> Optional[torch.Tensor]: + """ + Load KV caches from host memory to device memory. + """ + device_indices = self.mem_pool_device.alloc(len(host_indices)) + if device_indices is None: + return None + self.load_queue.put( + CacheOperation(host_indices, device_indices, node_id, priority) + ) + self.mem_pool_host.protect_load(host_indices) + return device_indices + + def write_thread_func_direct(self): + """ + Directly write through KV caches to host memory without buffering. + """ + with torch.cuda.stream(self.write_stream): + while True: + try: + operation = self.write_queue.get(block=True) + operation.data = self.mem_pool_device.get_flat_data( + operation.device_indices + ) + self.mem_pool_host.transfer(operation.host_indices, operation.data) + self.mem_pool_host.complete_io(operation.host_indices) + for node_id in operation.node_ids: + self.ack_write_queue.put(node_id) + except Exception as e: + logger.error(e) + + def load_thread_func_direct(self): + """ + Directly load KV caches from host memory to device memory without buffering. 
+ """ + with torch.cuda.stream(self.load_stream): + while True: + try: + operation = self.load_queue.get(block=True) + operation.data = self.mem_pool_host.get_flat_data( + operation.host_indices + ) + self.mem_pool_device.transfer( + operation.device_indices, operation.data + ) + self.mem_pool_host.complete_io(operation.host_indices) + for node_id in operation.node_ids: + self.ack_load_queue.put(node_id) + except Exception as e: + logger.error(e) + + def write_aux_func(self, no_wait=False): + """ + Auxiliary function to prepare the buffer for write operations. + """ + buffer = None + while True: + try: + operation = self.write_queue.get(block=True) + if buffer is None: + buffer = operation + else: + buffer.merge(operation) + if ( + no_wait + or len(buffer.host_indices) >= self.write_buffer.max_buffer_size + or self.write_queue.empty() + or self.write_buffer.empty() + ): + assert ( + buffer.device_indices.is_cuda + ), "Device indices should be on GPU" + buffer.data = self.mem_pool_device.get_flat_data( + buffer.device_indices + ).contiguous() + self.write_buffer.put(buffer, block=True) + buffer = None + except Exception as e: + logger.error(e) + + def load_aux_func(self): + """ + Auxiliary function to prepare the buffer for load operations. + """ + buffer = None + while True: + try: + operation = self.load_queue.get(block=True) + if buffer is None: + buffer = operation + else: + buffer.merge(operation) + if ( + len(buffer.host_indices) >= self.load_buffer.max_buffer_size + or self.load_queue.empty() + or self.load_buffer.empty() + ): + buffer.data = ( + self.mem_pool_host.get_flat_data(buffer.host_indices) + .contiguous() + .pin_memory() + ) + self.load_buffer.put(buffer, block=True) + buffer = None + except Exception as e: + logger.error(e) + + def write_thread_func_buffer(self): + aux_thread = threading.Thread(target=self.write_aux_func, daemon=True) + aux_thread.start() + with torch.cuda.stream(self.write_stream): + while True: + operation = self.write_buffer.get() + if operation is None: + continue + self.mem_pool_host.transfer(operation.host_indices, operation.data) + self.mem_pool_host.complete_io(operation.host_indices) + for node_id in operation.node_ids: + self.ack_write_queue.put(node_id) + + def load_thread_func_buffer(self): + aux_thread = threading.Thread(target=self.load_aux_func, daemon=True) + aux_thread.start() + with torch.cuda.stream(self.load_stream): + while True: + operation = self.load_buffer.get() + if operation is None: + continue + self.mem_pool_device.transfer(operation.device_indices, operation.data) + self.mem_pool_host.complete_io(operation.host_indices) + for node_id in operation.node_ids: + self.ack_load_queue.put(node_id) + + def evict_device( + self, device_indices: torch.Tensor, host_indices: torch.Tensor + ) -> int: + if self.mem_pool_host.is_synced(host_indices): + self.mem_pool_device.free(device_indices) + self.mem_pool_host.update_backup(host_indices) + return len(device_indices) + else: + raise ValueError( + f"Inconsistent states: {self.mem_pool_host.get_state(host_indices)}" + ) + + def evict_host(self, host_indices: torch.Tensor, backup_only: bool = True) -> int: + if not backup_only: + raise ValueError("Other eviction policies are not supported yet.") + + if self.mem_pool_host.is_backup(host_indices): + self.mem_pool_host.free(host_indices) + return len(host_indices) + else: + raise ValueError( + f"Inconsistent states: {self.mem_pool_host.get_state(host_indices)}" + ) From f1769586d651c701bc5f5b6f3a39d5b0f478eb02 Mon Sep 17 00:00:00 2001 From: 
Lianmin Zheng Date: Fri, 10 Jan 2025 20:37:34 -0800 Subject: [PATCH 019/248] Update threshold in test_nightly_gsm8k_eval.py (#2836) --- test/srt/test_nightly_gsm8k_eval.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/srt/test_nightly_gsm8k_eval.py b/test/srt/test_nightly_gsm8k_eval.py index 7820f6825a9c..2e379c111799 100644 --- a/test/srt/test_nightly_gsm8k_eval.py +++ b/test/srt/test_nightly_gsm8k_eval.py @@ -26,8 +26,8 @@ "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": 0.85, "google/gemma-2-27b-it": 0.92, "meta-llama/Llama-3.1-70B-Instruct": 0.95, - "mistralai/Mixtral-8x7B-Instruct-v0.1": 0.64, - "Qwen/Qwen2-57B-A14B-Instruct": 0.88, + "mistralai/Mixtral-8x7B-Instruct-v0.1": 0.63, + "Qwen/Qwen2-57B-A14B-Instruct": 0.87, "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8": 0.83, "neuralmagic/Mistral-7B-Instruct-v0.3-FP8": 0.54, "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8": 0.84, From f0e15dc6ab6766a8fcdeedb5432b92a18e14979f Mon Sep 17 00:00:00 2001 From: Xiaoyu Zhang <35585791+BBuf@users.noreply.github.com> Date: Sat, 11 Jan 2025 14:34:26 +0800 Subject: [PATCH 020/248] [HotFix] fix fp8 scale load failed in tp>1 (#2837) --- python/sglang/srt/layers/linear.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/python/sglang/srt/layers/linear.py b/python/sglang/srt/layers/linear.py index 9edfa739458b..b839deeb3251 100644 --- a/python/sglang/srt/layers/linear.py +++ b/python/sglang/srt/layers/linear.py @@ -437,7 +437,7 @@ def weight_loader_v2(self, param: Parameter, loaded_weight: torch.Tensor): if len(loaded_weight.shape) == 0: assert loaded_weight.numel() == 1 loaded_weight = loaded_weight.reshape(1) - load_column_parallel_weight(param, loaded_weight, self.tp_rank) + param.load_column_parallel_weight(loaded_weight=loaded_weight) def forward(self, input_): bias = self.bias if not self.skip_bias_add else None @@ -1247,12 +1247,7 @@ def weight_loader_v2(self, param: BasevLLMParameter, loaded_weight: torch.Tensor assert loaded_weight.numel() == 1 loaded_weight = loaded_weight.reshape(1) - load_row_parallel_weight( - param, - loaded_weight, - self.tp_rank, - use_presharded_weights=self.use_presharded_weights, - ) + param.load_row_parallel_weight(loaded_weight=loaded_weight) def forward(self, input_): if self.input_is_parallel: From f624901cdd5da4ad6ffa20a5c29561dcbac0eb4a Mon Sep 17 00:00:00 2001 From: Yineng Zhang Date: Sat, 11 Jan 2025 23:10:02 +0800 Subject: [PATCH 021/248] chore: bump v0.4.1.post5 (#2840) --- benchmark/deepseek_v3/README.md | 4 +++- docker/Dockerfile.rocm | 2 +- docs/developer/setup_github_runner.md | 4 ++-- docs/start/install.md | 10 +++++----- python/pyproject.toml | 2 +- python/sglang/version.py | 2 +- 6 files changed, 13 insertions(+), 11 deletions(-) diff --git a/benchmark/deepseek_v3/README.md b/benchmark/deepseek_v3/README.md index 15cf0b26a244..a4f5bf854bde 100644 --- a/benchmark/deepseek_v3/README.md +++ b/benchmark/deepseek_v3/README.md @@ -4,6 +4,8 @@ The SGLang and DeepSeek teams collaborated to get DeepSeek V3 FP8 running on NVI Special thanks to Meituan's Search & Recommend Platform Team and Baseten's Model Performance Team for implementing the model, and DataCrunch for providing GPU resources. 
+For optimizations made on the DeepSeek series models regarding SGLang, please refer to https://sgl-project.github.io/references/deepseek.html + ## Hardware Recommendation - 8 x NVIDIA H200 GPUs @@ -29,7 +31,7 @@ For high QPS scenarios, add the `--enable-dp-attention` argument to boost throug ### Using pip ```bash # Installation -pip install "sglang[all]>=0.4.1.post3" --find-links https://flashinfer.ai/whl/cu124/torch2.4/flashinfer +pip install "sglang[all]>=0.4.1.post5" --find-links https://flashinfer.ai/whl/cu124/torch2.4/flashinfer # Launch python3 -m sglang.launch_server --model deepseek-ai/DeepSeek-V3 --tp 8 --trust-remote-code diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index 44b3f85b3516..9b1d67b5e4f7 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -1,5 +1,5 @@ # Usage (to build SGLang ROCm docker image): -# docker build --build-arg SGL_BRANCH=v0.4.1.post4 -t v0.4.1.post4-rocm620 -f Dockerfile.rocm . +# docker build --build-arg SGL_BRANCH=v0.4.1.post5 -t v0.4.1.post5-rocm620 -f Dockerfile.rocm . # default base image ARG BASE_IMAGE="rocmshared/vllm-rocm:20241031-tuned" diff --git a/docs/developer/setup_github_runner.md b/docs/developer/setup_github_runner.md index 7b510d72305e..fe856e9d659b 100644 --- a/docs/developer/setup_github_runner.md +++ b/docs/developer/setup_github_runner.md @@ -11,9 +11,9 @@ docker pull nvidia/cuda:12.1.1-devel-ubuntu22.04 # Nvidia docker run --shm-size 128g -it -v /tmp/huggingface:/hf_home --gpus all nvidia/cuda:12.1.1-devel-ubuntu22.04 /bin/bash # AMD -docker run --rm --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 128g -it -v /tmp/huggingface:/hf_home lmsysorg/sglang:v0.4.1.post4-rocm620 /bin/bash +docker run --rm --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 128g -it -v /tmp/huggingface:/hf_home lmsysorg/sglang:v0.4.1.post5-rocm620 /bin/bash # AMD just the last 2 GPUs -docker run --rm --device=/dev/kfd --device=/dev/dri/renderD176 --device=/dev/dri/renderD184 --group-add video --shm-size 128g -it -v /tmp/huggingface:/hf_home lmsysorg/sglang:v0.4.1.post4-rocm620 /bin/bash +docker run --rm --device=/dev/kfd --device=/dev/dri/renderD176 --device=/dev/dri/renderD184 --group-add video --shm-size 128g -it -v /tmp/huggingface:/hf_home lmsysorg/sglang:v0.4.1.post5-rocm620 /bin/bash ``` ### Step 2: Configure the runner by `config.sh` diff --git a/docs/start/install.md b/docs/start/install.md index 8a81bb177974..26b09dfe319f 100644 --- a/docs/start/install.md +++ b/docs/start/install.md @@ -13,7 +13,7 @@ Note: Please check the [FlashInfer installation doc](https://docs.flashinfer.ai/ ## Method 2: From source ``` # Use the last release branch -git clone -b v0.4.1.post4 https://github.com/sgl-project/sglang.git +git clone -b v0.4.1.post5 https://github.com/sgl-project/sglang.git cd sglang pip install --upgrade pip @@ -26,7 +26,7 @@ Note: To AMD ROCm system with Instinct/MI GPUs, do following instead: ``` # Use the last release branch -git clone -b v0.4.1.post4 https://github.com/sgl-project/sglang.git +git clone -b v0.4.1.post5 https://github.com/sgl-project/sglang.git cd sglang pip install --upgrade pip @@ -51,7 +51,7 @@ docker run --gpus all \ Note: To AMD ROCm system with Instinct/MI GPUs, it is recommended to use `docker/Dockerfile.rocm` to build images, example and usage as below: ```bash -docker build --build-arg SGL_BRANCH=v0.4.1.post4 -t v0.4.1.post4-rocm620 -f Dockerfile.rocm . +docker build --build-arg SGL_BRANCH=v0.4.1.post5 -t v0.4.1.post5-rocm620 -f Dockerfile.rocm . 
alias drun='docker run -it --rm --network=host --device=/dev/kfd --device=/dev/dri --ipc=host \ --shm-size 16G --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \ @@ -60,11 +60,11 @@ alias drun='docker run -it --rm --network=host --device=/dev/kfd --device=/dev/d drun -p 30000:30000 \ -v ~/.cache/huggingface:/root/.cache/huggingface \ --env "HF_TOKEN=" \ - v0.4.1.post4-rocm620 \ + v0.4.1.post5-rocm620 \ python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000 # Till flashinfer backend available, --attention-backend triton --sampling-backend pytorch are set by default -drun v0.4.1.post4-rocm620 python3 -m sglang.bench_one_batch --batch-size 32 --input 1024 --output 128 --model amd/Meta-Llama-3.1-8B-Instruct-FP8-KV --tp 8 --quantization fp8 +drun v0.4.1.post5-rocm620 python3 -m sglang.bench_one_batch --batch-size 32 --input 1024 --output 128 --model amd/Meta-Llama-3.1-8B-Instruct-FP8-KV --tp 8 --quantization fp8 ``` ## Method 4: Using docker compose diff --git a/python/pyproject.toml b/python/pyproject.toml index d536f8832e1d..a236469a17c8 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "sglang" -version = "0.4.1.post4" +version = "0.4.1.post5" description = "SGLang is yet another fast serving framework for large language models and vision language models." readme = "README.md" requires-python = ">=3.8" diff --git a/python/sglang/version.py b/python/sglang/version.py index 24e54e5c95d5..51eb3167fae5 100644 --- a/python/sglang/version.py +++ b/python/sglang/version.py @@ -1 +1 @@ -__version__ = "0.4.1.post4" +__version__ = "0.4.1.post5" From 197cbf9bab6aa4d75d7da392bbb8ac9c58ba7c5d Mon Sep 17 00:00:00 2001 From: Yineng Zhang Date: Sat, 11 Jan 2025 23:11:38 +0800 Subject: [PATCH 022/248] docs: update README (#2841) --- benchmark/deepseek_v3/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark/deepseek_v3/README.md b/benchmark/deepseek_v3/README.md index a4f5bf854bde..d14a8d55630c 100644 --- a/benchmark/deepseek_v3/README.md +++ b/benchmark/deepseek_v3/README.md @@ -4,7 +4,7 @@ The SGLang and DeepSeek teams collaborated to get DeepSeek V3 FP8 running on NVI Special thanks to Meituan's Search & Recommend Platform Team and Baseten's Model Performance Team for implementing the model, and DataCrunch for providing GPU resources. -For optimizations made on the DeepSeek series models regarding SGLang, please refer to https://sgl-project.github.io/references/deepseek.html +For optimizations made on the DeepSeek series models regarding SGLang, please refer to [DeepSeek Model Optimizations in SGLang](https://sgl-project.github.io/references/deepseek.html). 
## Hardware Recommendation - 8 x NVIDIA H200 GPUs From c4f9707e16146d7bc85d2744693aa78642e75e18 Mon Sep 17 00:00:00 2001 From: Shi Shuai <126407087+shuaills@users.noreply.github.com> Date: Sat, 11 Jan 2025 23:14:26 +0000 Subject: [PATCH 023/248] Improve: Token-In Token-Out Usage for RLHF (#2843) --- docs/backend/native_api.ipynb | 70 +++++++++++ docs/backend/structured_outputs.ipynb | 6 +- docs/index.rst | 2 +- docs/references/deepseek.md | 10 +- .../srt/managers/detokenizer_manager.py | 2 - python/sglang/srt/managers/io_struct.py | 8 +- python/sglang/srt/managers/scheduler.py | 10 +- .../sglang/srt/managers/tokenizer_manager.py | 7 -- python/sglang/srt/server_args.py | 18 +-- test/srt/run_suite.py | 1 - test/srt/test_engine_token_ids.py | 45 ------- test/srt/test_skip_tokenizer_init.py | 119 ++++++++++++------ 12 files changed, 168 insertions(+), 130 deletions(-) delete mode 100644 test/srt/test_engine_token_ids.py diff --git a/docs/backend/native_api.ipynb b/docs/backend/native_api.ipynb index 26758f7f9759..f6c10d745c5e 100644 --- a/docs/backend/native_api.ipynb +++ b/docs/backend/native_api.ipynb @@ -348,6 +348,76 @@ "source": [ "terminate_process(reward_process)" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Skip Tokenizer and Detokenizer\n", + "\n", + "SGLang Runtime also supports skip tokenizer and detokenizer. This is useful in cases like integrating with RLHF workflow." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tokenizer_free_server_process = execute_shell_command(\n", + " \"\"\"\n", + "python3 -m sglang.launch_server --model-path meta-llama/Llama-3.2-1B-Instruct --port=30010 --skip-tokenizer-init\n", + "\"\"\"\n", + ")\n", + "\n", + "wait_for_server(\"http://localhost:30010\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import AutoTokenizer\n", + "\n", + "tokenizer = AutoTokenizer.from_pretrained(\"meta-llama/Llama-3.2-1B-Instruct\")\n", + "\n", + "input_text = \"What is the capital of France?\"\n", + "\n", + "input_tokens = tokenizer.encode(input_text)\n", + "print_highlight(f\"Input Text: {input_text}\")\n", + "print_highlight(f\"Tokenized Input: {input_tokens}\")\n", + "\n", + "response = requests.post(\n", + " \"http://localhost:30010/generate\",\n", + " json={\n", + " \"input_ids\": input_tokens,\n", + " \"sampling_params\": {\n", + " \"temperature\": 0,\n", + " \"max_new_tokens\": 256,\n", + " \"stop_token_ids\": [tokenizer.eos_token_id],\n", + " },\n", + " \"stream\": False,\n", + " },\n", + ")\n", + "output = response.json()\n", + "output_tokens = output[\"token_ids\"]\n", + "\n", + "output_text = tokenizer.decode(output_tokens, skip_special_tokens=False)\n", + "print_highlight(f\"Tokenized Output: {output_tokens}\")\n", + "print_highlight(f\"Decoded Output: {output_text}\")\n", + "print_highlight(f\"Output Text: {output['meta_info']['finish_reason']}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "terminate_process(tokenizer_free_server_process)" + ] } ], "metadata": { diff --git a/docs/backend/structured_outputs.ipynb b/docs/backend/structured_outputs.ipynb index f017ef863035..55ca0b627f9c 100644 --- a/docs/backend/structured_outputs.ipynb +++ b/docs/backend/structured_outputs.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Structured Outputs (JSON, Regex, EBNF)" + "# Structured 
Outputs" ] }, { @@ -43,6 +43,10 @@ " print_highlight,\n", ")\n", "import openai\n", + "import os\n", + "\n", + "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n", + "\n", "\n", "server_process = execute_shell_command(\n", " \"python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --port 30000 --host 0.0.0.0 --grammar-backend xgrammar\"\n", diff --git a/docs/index.rst b/docs/index.rst index 6ed313a3bd17..51796d4a1071 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -56,10 +56,10 @@ The core features include: references/hyperparameter_tuning.md references/benchmark_and_profiling.md references/custom_chat_template.md + references/deepseek.md references/llama_405B.md references/modelscope.md references/contribution_guide.md references/troubleshooting.md references/faq.md references/learn_more.md - references/deepseek.md diff --git a/docs/references/deepseek.md b/docs/references/deepseek.md index 5a95fd9a9418..913395357e1d 100644 --- a/docs/references/deepseek.md +++ b/docs/references/deepseek.md @@ -1,4 +1,4 @@ -# DeepSeek Model Optimizations in SGLang +# DeepSeek Model Optimizations SGLang provides several optimizations specifically designed for the DeepSeek model to boost its inference speed. This document outlines current optimizations for DeepSeek. Additionally, the SGLang team is actively developing enhancements for [DeepSeek-V3](https://github.com/sgl-project/sglang/issues/2591). @@ -16,7 +16,9 @@ SGLang provides several optimizations specifically designed for the DeepSeek mod Overall, with these optimizations, we have achieved up to a 7x acceleration in output throughput compared to the previous version. -![Multi-head Latent Attention for DeepSeek Series Models](https://lmsys.org/images/blog/sglang_v0_3/deepseek_mla.svg) +

+ Multi-head Latent Attention for DeepSeek Series Models
+
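
A minimal launch sketch for the usage note that follows, assuming an illustrative DeepSeek checkpoint and ports (MLA is on by default; the second process disables it with `--disable-mla` for comparison):

```python
# Minimal sketch, assuming sglang is installed; the checkpoint below is a
# placeholder. MLA is enabled by default, so only the second launch passes
# --disable-mla.
import subprocess

model = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"  # illustrative checkpoint

with_mla = subprocess.Popen(
    ["python3", "-m", "sglang.launch_server",
     "--model-path", model, "--trust-remote-code", "--port", "30000"]
)

without_mla = subprocess.Popen(
    ["python3", "-m", "sglang.launch_server",
     "--model-path", model, "--trust-remote-code", "--port", "30001",
     "--disable-mla"]
)
```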
**Usage**: MLA optimization is enabled by defalut, to disable, use `--disable-mla`. @@ -26,7 +28,9 @@ Overall, with these optimizations, we have achieved up to a 7x acceleration in o **Description**: This optimization involves data parallelism (DP) for the MLA attention mechanism of DeepSeek Series Models, which allows for a significant reduction in the KV cache size, enabling larger batch sizes. Each DP worker independently handles different types of batches (prefill, decode, idle), which are then synchronized before and after processing through the Mixture-of-Experts (MoE) layer. -![Data Parallelism Attention for DeepSeek Series Models](https://lmsys.org/images/blog/sglang_v0_4/dp_attention.svg). +

+ Data Parallelism Attention for DeepSeek Series Models
+
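
A hedged sketch of the high-QPS setup described in the usage note below, assuming the offline engine forwards keyword arguments to the server arguments (the model path and parallelism size are illustrative, not recommendations):

```python
# Minimal sketch: data parallelism attention via the Python engine API.
# Assumes sgl.Engine keyword arguments mirror the CLI flags; model path and
# tp_size are placeholders.
import sglang as sgl

llm = sgl.Engine(
    model_path="deepseek-ai/DeepSeek-V2-Lite",  # illustrative checkpoint
    trust_remote_code=True,
    tp_size=2,                  # size tensor parallelism to the available GPUs
    enable_dp_attention=True,   # the flag documented in this section
)

outputs = llm.generate(
    ["The capital of France is"],
    {"temperature": 0, "max_new_tokens": 8},
)
print(outputs[0]["text"])
llm.shutdown()
```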
**Usage**: This optimization is aimed at improving throughput and should be used for scenarios with high QPS (Queries Per Second). Data Parallelism Attention optimization can be enabeld by `--enable-dp-attention` for DeepSeek Series Models. diff --git a/python/sglang/srt/managers/detokenizer_manager.py b/python/sglang/srt/managers/detokenizer_manager.py index fd77d338edce..b4bc1e7a448d 100644 --- a/python/sglang/srt/managers/detokenizer_manager.py +++ b/python/sglang/srt/managers/detokenizer_manager.py @@ -181,8 +181,6 @@ def event_loop(self): finished_reasons=recv_obj.finished_reasons, output_strs=output_strs, prompt_tokens=recv_obj.prompt_tokens, - origin_input_ids=recv_obj.origin_input_ids, - output_ids=recv_obj.output_ids, completion_tokens=recv_obj.completion_tokens, cached_tokens=recv_obj.cached_tokens, input_token_logprobs_val=recv_obj.input_token_logprobs_val, diff --git a/python/sglang/srt/managers/io_struct.py b/python/sglang/srt/managers/io_struct.py index 1aae28b00b76..6ddc0993f9d7 100644 --- a/python/sglang/srt/managers/io_struct.py +++ b/python/sglang/srt/managers/io_struct.py @@ -323,9 +323,7 @@ class BatchTokenIDOut: decoded_texts: List[str] decode_ids: List[int] read_offsets: List[int] - # Only used when --return-token-ids` is set - origin_input_ids: Optional[List[int]] - # Only used when `--skip-tokenizer-init` or `--return-token-ids` is set + # Only used when `--skip-tokenizer-init` is on output_ids: Optional[List[int]] # Detokenization configs skip_special_tokens: List[bool] @@ -356,10 +354,6 @@ class BatchStrOut: # The output decoded strings output_strs: List[str] - # The token ids - origin_input_ids: Optional[List[int]] - output_ids: Optional[List[int]] - # Token counts # real input and output tokens can be get from # origin_input_ids and output_ids by enabling --return_token_ids diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index 6022a2567343..31c8018e2581 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -1253,7 +1253,6 @@ def stream_output( decode_ids_list = [] read_offsets = [] output_ids = [] - origin_input_ids = [] skip_special_tokens = [] spaces_between_special_tokens = [] @@ -1305,14 +1304,8 @@ def stream_output( decode_ids, read_offset = req.init_incremental_detokenize() decode_ids_list.append(decode_ids) read_offsets.append(read_offset) - if self.skip_tokenizer_init or self.server_args.return_token_ids: + if self.skip_tokenizer_init: output_ids.append(req.output_ids) - else: - output_ids = None - if self.server_args.return_token_ids: - origin_input_ids.append(req.origin_input_ids) - else: - origin_input_ids = None skip_special_tokens.append(req.sampling_params.skip_special_tokens) spaces_between_special_tokens.append( req.sampling_params.spaces_between_special_tokens @@ -1344,7 +1337,6 @@ def stream_output( decoded_texts, decode_ids_list, read_offsets, - origin_input_ids, output_ids, skip_special_tokens, spaces_between_special_tokens, diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py index 00ef8458ab82..9f9c53eaa8ec 100644 --- a/python/sglang/srt/managers/tokenizer_manager.py +++ b/python/sglang/srt/managers/tokenizer_manager.py @@ -663,13 +663,6 @@ async def handle_loop(self): "text": recv_obj.output_strs[i], "meta_info": meta_info, } - if self.server_args.return_token_ids: - out_dict.update( - { - "input_ids": recv_obj.origin_input_ids[i], - "output_ids": recv_obj.output_ids[i], - } - ) elif 
isinstance(recv_obj, BatchTokenIDOut): out_dict = { "token_ids": recv_obj.output_ids[i], diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index 09d1a3edebc4..66739652aa9d 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -55,7 +55,6 @@ class ServerArgs: is_embedding: bool = False revision: Optional[str] = None skip_tokenizer_init: bool = False - return_token_ids: bool = False # Port for the HTTP server host: str = "127.0.0.1" @@ -296,6 +295,11 @@ def add_cli_args(parser: argparse.ArgumentParser): "tokenizer if available, and 'slow' will " "always use the slow tokenizer.", ) + parser.add_argument( + "--skip-tokenizer-init", + action="store_true", + help="If set, skip init tokenizer and pass input_ids in generate request", + ) parser.add_argument( "--load-format", type=str, @@ -404,18 +408,6 @@ def add_cli_args(parser: argparse.ArgumentParser): "name, a tag name, or a commit id. If unspecified, will use " "the default version.", ) - parser.add_argument( - "--skip-tokenizer-init", - action="store_true", - help="If set, skip init tokenizer and pass input_ids in generate request", - ) - parser.add_argument( - "--return-token-ids", - action="store_true", - default=ServerArgs.return_token_ids, - help="Whether to return token IDs in the output, this may introduce additional overhead.", - ) - # Memory and scheduling parser.add_argument( "--mem-fraction-static", diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index 2c1750d363ce..320fea7294e5 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -45,7 +45,6 @@ "test_vision_chunked_prefill.py", "test_vision_openai_server.py", "test_session_control.py", - "test_engine_token_ids.py", ], "nightly": [ "test_nightly_gsm8k_eval.py", diff --git a/test/srt/test_engine_token_ids.py b/test/srt/test_engine_token_ids.py deleted file mode 100644 index 4dee24edc9de..000000000000 --- a/test/srt/test_engine_token_ids.py +++ /dev/null @@ -1,45 +0,0 @@ -import unittest - -from transformers import AutoTokenizer - -import sglang as sgl -from sglang.test.test_utils import DEFAULT_SMALL_MODEL_NAME_FOR_TEST - - -class TestEngineTokenIds(unittest.TestCase): - def test_token_ids_in_generate(self): - llm = sgl.Engine( - model_path=DEFAULT_SMALL_MODEL_NAME_FOR_TEST, return_token_ids=True - ) - tokenizer = AutoTokenizer.from_pretrained(DEFAULT_SMALL_MODEL_NAME_FOR_TEST) - - prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", - ] - - sampling_params = {"temperature": 0, "top_p": 0.95} - outputs = llm.generate(prompts, sampling_params) - - for prompt, output in zip(prompts, outputs): - deocode_input = tokenizer.decode( - output["input_ids"], skip_special_tokens=True - ) - assert (deocode_input in prompt) or ( - prompt in deocode_input - ), f"Decode input: {deocode_input} mismatch for: {prompt}" - - deocode_output = tokenizer.decode( - output["output_ids"], skip_special_tokens=True - ) - assert (deocode_output in output["text"]) or ( - output["text"] in deocode_output - ), f"Decode output: {deocode_output} mismatch for: {output['text']}" - - llm.shutdown() - - -if __name__ == "__main__": - unittest.main() diff --git a/test/srt/test_skip_tokenizer_init.py b/test/srt/test_skip_tokenizer_init.py index eef033ea98cb..db70944091f2 100644 --- a/test/srt/test_skip_tokenizer_init.py +++ b/test/srt/test_skip_tokenizer_init.py @@ -1,11 +1,8 @@ -""" -python3 -m unittest 
test_skip_tokenizer_init.TestSkipTokenizerInit.test_parallel_sample -""" - import json import unittest import requests +from transformers import AutoTokenizer from sglang.srt.utils import kill_process_tree from sglang.test.test_utils import ( @@ -15,35 +12,63 @@ popen_launch_server, ) +_server_process = None +_base_url = None +_tokenizer = None + + +def setUpModule(): + """ + Launch the server once before all tests and initialize the tokenizer. + """ + global _server_process, _base_url, _tokenizer + _server_process = popen_launch_server( + DEFAULT_SMALL_MODEL_NAME_FOR_TEST, + DEFAULT_URL_FOR_TEST, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=["--skip-tokenizer-init"], + ) + _base_url = DEFAULT_URL_FOR_TEST + + _tokenizer = AutoTokenizer.from_pretrained( + DEFAULT_SMALL_MODEL_NAME_FOR_TEST, use_fast=False + ) + print(">>> setUpModule: Server launched, tokenizer ready") + + +def tearDownModule(): + """ + Terminate the server once after all tests have completed. + """ + global _server_process + if _server_process is not None: + kill_process_tree(_server_process.pid) + _server_process = None + print(">>> tearDownModule: Server terminated") -class TestSkipTokenizerInit(unittest.TestCase): - @classmethod - def setUpClass(cls): - cls.model = DEFAULT_SMALL_MODEL_NAME_FOR_TEST - cls.base_url = DEFAULT_URL_FOR_TEST - cls.process = popen_launch_server( - cls.model, - cls.base_url, - timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - other_args=["--skip-tokenizer-init"], - ) - @classmethod - def tearDownClass(cls): - kill_process_tree(cls.process.pid) +class TestSkipTokenizerInit(unittest.TestCase): + def run_decode( + self, + prompt_text="The capital of France is", + max_new_tokens=32, + return_logprob=False, + top_logprobs_num=0, + n=1, + ): + input_ids = _tokenizer(prompt_text, return_tensors="pt")["input_ids"][ + 0 + ].tolist() - def run_decode(self, return_logprob=False, top_logprobs_num=0, n=1): - max_new_tokens = 32 - input_ids = [128000, 791, 6864, 315, 9822, 374] # The capital of France is response = requests.post( - self.base_url + "/generate", + _base_url + "/generate", json={ "input_ids": input_ids, "sampling_params": { "temperature": 0 if n == 1 else 0.5, "max_new_tokens": max_new_tokens, "n": n, - "stop_token_ids": [119690], + "stop_token_ids": [_tokenizer.eos_token_id], }, "stream": False, "return_logprob": return_logprob, @@ -52,25 +77,37 @@ def run_decode(self, return_logprob=False, top_logprobs_num=0, n=1): }, ) ret = response.json() - print(json.dumps(ret)) + print(json.dumps(ret, indent=2)) def assert_one_item(item): - self.assertEqual( - len(item["token_ids"]), item["meta_info"]["completion_tokens"] - ) - self.assertEqual(len(item["token_ids"]), max_new_tokens) - assert item["meta_info"]["prompt_tokens"] == len(input_ids) - - if return_logprob: - assert len(item["meta_info"]["input_token_logprobs"]) == len( - input_ids - ), f'{len(item["meta_info"]["input_token_logprobs"])} vs. 
f{len(input_ids)}' - assert len(item["meta_info"]["output_token_logprobs"]) == max_new_tokens - + if item["meta_info"]["finish_reason"]["type"] == "stop": + self.assertEqual( + item["meta_info"]["finish_reason"]["matched"], + _tokenizer.eos_token_id, + ) + elif item["meta_info"]["finish_reason"]["type"] == "length": + self.assertEqual( + len(item["token_ids"]), item["meta_info"]["completion_tokens"] + ) + self.assertEqual(len(item["token_ids"]), max_new_tokens) + self.assertEqual(item["meta_info"]["prompt_tokens"], len(input_ids)) + + if return_logprob: + self.assertEqual( + len(item["meta_info"]["input_token_logprobs"]), + len(input_ids), + f'{len(item["meta_info"]["input_token_logprobs"])} mismatch with {len(input_ids)}', + ) + self.assertEqual( + len(item["meta_info"]["output_token_logprobs"]), + max_new_tokens, + ) + + # Determine whether to assert a single item or multiple items based on n if n == 1: assert_one_item(ret) else: - assert len(ret) == n + self.assertEqual(len(ret), n) for i in range(n): assert_one_item(ret[i]) @@ -84,10 +121,10 @@ def test_parallel_sample(self): def test_logprob(self): for top_logprobs_num in [0, 3]: - self.run_decode( - return_logprob=True, - top_logprobs_num=top_logprobs_num, - ) + self.run_decode(return_logprob=True, top_logprobs_num=top_logprobs_num) + + def test_eos_behavior(self): + self.run_decode(max_new_tokens=256) if __name__ == "__main__": From e2b16c4716f220a0469cdb424c508c95767fb924 Mon Sep 17 00:00:00 2001 From: Xiaoyu Zhang <35585791+BBuf@users.noreply.github.com> Date: Mon, 13 Jan 2025 11:38:17 +0800 Subject: [PATCH 024/248] add sampling_scaling_penalties kernel (#2846) --- sgl-kernel/CMakeLists.txt | 1 + sgl-kernel/pyproject.toml | 2 +- sgl-kernel/setup.py | 1 + sgl-kernel/src/sgl-kernel/__init__.py | 2 + .../csrc/sampling_scaling_penalties.cu | 64 +++++++++++++++++++ .../src/sgl-kernel/csrc/sgl_kernel_ops.cu | 5 ++ .../src/sgl-kernel/csrc/vectorization.cuh | 30 +++++++++ sgl-kernel/src/sgl-kernel/ops/__init__.py | 7 ++ .../tests/test_sampling_scaling_penalties.py | 39 +++++++++++ 9 files changed, 150 insertions(+), 1 deletion(-) create mode 100644 sgl-kernel/src/sgl-kernel/csrc/sampling_scaling_penalties.cu create mode 100644 sgl-kernel/src/sgl-kernel/csrc/vectorization.cuh create mode 100644 sgl-kernel/tests/test_sampling_scaling_penalties.py diff --git a/sgl-kernel/CMakeLists.txt b/sgl-kernel/CMakeLists.txt index 3c267a4de504..15818d289eae 100644 --- a/sgl-kernel/CMakeLists.txt +++ b/sgl-kernel/CMakeLists.txt @@ -32,6 +32,7 @@ add_library(_kernels SHARED src/sgl-kernel/csrc/trt_reduce_kernel.cu src/sgl-kernel/csrc/moe_align_kernel.cu src/sgl-kernel/csrc/int8_gemm_kernel.cu + src/sgl-kernel/csrc/sampling_scaling_penalties.cu src/sgl-kernel/csrc/sgl_kernel_ops.cu ) diff --git a/sgl-kernel/pyproject.toml b/sgl-kernel/pyproject.toml index 359ffafd70d2..b03b4c02b5e1 100644 --- a/sgl-kernel/pyproject.toml +++ b/sgl-kernel/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "sgl-kernel" -version = "0.0.2.post11" +version = "0.0.2.post12" description = "Kernel Library for SGLang" readme = "README.md" requires-python = ">=3.8" diff --git a/sgl-kernel/setup.py b/sgl-kernel/setup.py index c93e87f6bad3..83025d6d6c6f 100644 --- a/sgl-kernel/setup.py +++ b/sgl-kernel/setup.py @@ -50,6 +50,7 @@ def update_wheel_platform_tag(): "src/sgl-kernel/csrc/trt_reduce_kernel.cu", "src/sgl-kernel/csrc/moe_align_kernel.cu", "src/sgl-kernel/csrc/int8_gemm_kernel.cu", + "src/sgl-kernel/csrc/sampling_scaling_penalties.cu", 
"src/sgl-kernel/csrc/sgl_kernel_ops.cu", ], include_dirs=include_dirs, diff --git a/sgl-kernel/src/sgl-kernel/__init__.py b/sgl-kernel/src/sgl-kernel/__init__.py index 892808f1ee15..62c366731e55 100644 --- a/sgl-kernel/src/sgl-kernel/__init__.py +++ b/sgl-kernel/src/sgl-kernel/__init__.py @@ -4,6 +4,7 @@ init_custom_reduce, int8_scaled_mm, moe_align_block_size, + sampling_scaling_penalties, ) __all__ = [ @@ -12,4 +13,5 @@ "custom_dispose", "custom_reduce", "int8_scaled_mm", + "sampling_scaling_penalties", ] diff --git a/sgl-kernel/src/sgl-kernel/csrc/sampling_scaling_penalties.cu b/sgl-kernel/src/sgl-kernel/csrc/sampling_scaling_penalties.cu new file mode 100644 index 000000000000..30264caa3666 --- /dev/null +++ b/sgl-kernel/src/sgl-kernel/csrc/sampling_scaling_penalties.cu @@ -0,0 +1,64 @@ +#include +#include +#include +#include +#include "utils.hpp" +#include "vectorization.cuh" + +template +__global__ void sampling_scaling_penalties_kernel( + const scalar_t* logits, + const scalar_t* scaling_penalties, + scalar_t* output, + const int32_t numel) { + + const int32_t tid = blockIdx.x * blockDim.x + threadIdx.x; + const int32_t stride = blockDim.x * gridDim.x; + + auto const* vectorized_logits = reinterpret_cast const*>(logits); + auto const* vectorized_penalties = reinterpret_cast const*>(scaling_penalties); + auto* vectorized_output = reinterpret_cast*>(output); + + const int32_t num_vec_elems = numel >> 2; + +#pragma unroll 4 + for (int32_t i = tid; i < num_vec_elems; i += stride) { + vec4_t logits_vec = vectorized_logits[i]; + vec4_t penalties_vec = vectorized_penalties[i]; + vec4_t out_vec; + + out_vec.x = logits_vec.x > 0 ? logits_vec.x / penalties_vec.x : logits_vec.x * penalties_vec.x; + out_vec.y = logits_vec.y > 0 ? logits_vec.y / penalties_vec.y : logits_vec.y * penalties_vec.y; + out_vec.z = logits_vec.z > 0 ? logits_vec.z / penalties_vec.z : logits_vec.z * penalties_vec.z; + out_vec.w = logits_vec.w > 0 ? logits_vec.w / penalties_vec.w : logits_vec.w * penalties_vec.w; + + vectorized_output[i] = out_vec; + } + + const int32_t start_idx = num_vec_elems * 4; + for (int32_t i = start_idx + tid; i < numel; i += stride) { + scalar_t logit = logits[i]; + scalar_t penalty = scaling_penalties[i]; + output[i] = logit > 0 ? 
logit / penalty : logit * penalty; + } +} + +torch::Tensor sampling_scaling_penalties(const torch::Tensor& logits, const torch::Tensor& scaling_penalties) { + auto output = torch::empty_like(logits); + const auto numel = logits.numel(); + const int threads = 512; + + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, + logits.scalar_type(), "sampling_scaling_penalties_kernel", ([&] { + const int blocks = (numel + threads * 4 - 1) / (threads * 4); + sampling_scaling_penalties_kernel<<>>( + logits.data_ptr(), + scaling_penalties.data_ptr(), + output.data_ptr(), + numel); + })); + + return output; +} diff --git a/sgl-kernel/src/sgl-kernel/csrc/sgl_kernel_ops.cu b/sgl-kernel/src/sgl-kernel/csrc/sgl_kernel_ops.cu index 6ed543e6c542..fbfe51442a35 100644 --- a/sgl-kernel/src/sgl-kernel/csrc/sgl_kernel_ops.cu +++ b/sgl-kernel/src/sgl-kernel/csrc/sgl_kernel_ops.cu @@ -12,6 +12,9 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, int64_t b torch::Tensor sorted_token_ids, torch::Tensor experts_ids, torch::Tensor num_tokens_post_pad, torch::Tensor token_cnts_buffer, torch::Tensor cumsum_buffer); +// sampling_scaling_penalties +torch::Tensor sampling_scaling_penalties(const torch::Tensor& logits, const torch::Tensor& scaling_penalties); + // int8_scaled_mm torch::Tensor int8_scaled_mm(const torch::Tensor& mat_a, const torch::Tensor& mat_b, const torch::Tensor& scales_a, const torch::Tensor& scales_b, const torch::Dtype& out_dtype, @@ -24,6 +27,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("all_reduce", &all_reduce, "custom all reduce (CUDA)"); // moe_align_block_size m.def("moe_align_block_size", &moe_align_block_size, "MOE Align Block Size (CUDA)"); + // sampling_scaling_penalties + m.def("sampling_scaling_penalties", &sampling_scaling_penalties, "Sampling scaling penalties (CUDA)"); // int8_scaled_mm m.def("int8_scaled_mm", &int8_scaled_mm, "INT8 scaled matmul (CUDA)"); } diff --git a/sgl-kernel/src/sgl-kernel/csrc/vectorization.cuh b/sgl-kernel/src/sgl-kernel/csrc/vectorization.cuh new file mode 100644 index 000000000000..cb36d0e7a456 --- /dev/null +++ b/sgl-kernel/src/sgl-kernel/csrc/vectorization.cuh @@ -0,0 +1,30 @@ +// Adapted from https://github.com/vllm-project/vllm/blob/main/csrc/quantization/vectorization.cuh +#pragma once +/** + * __device__ datatypes vectorized by 4 + */ + +// Include both AMD and NVIDIA fp8 types to avoid circular import +// TODO(luka/varun) use FP8_TYPE instead after refactoring +#include +#include + +// Vectorization containers +template +struct __align__(8) vec4_t { + scalar_t x; + scalar_t y; + scalar_t z; + scalar_t w; +}; + +template +struct __align__(4) q8x4_t { + static_assert(std::is_same_v || + std::is_same_v || + std::is_same_v); + quant_type_t x; + quant_type_t y; + quant_type_t z; + quant_type_t w; +}; diff --git a/sgl-kernel/src/sgl-kernel/ops/__init__.py b/sgl-kernel/src/sgl-kernel/ops/__init__.py index e388ae35653b..03a8db80fd37 100644 --- a/sgl-kernel/src/sgl-kernel/ops/__init__.py +++ b/sgl-kernel/src/sgl-kernel/ops/__init__.py @@ -3,6 +3,9 @@ from sgl_kernel.ops._kernels import init_custom_ar as _init_custom_ar from sgl_kernel.ops._kernels import int8_scaled_mm as _int8_scaled_mm from sgl_kernel.ops._kernels import moe_align_block_size as _moe_align_block_size +from sgl_kernel.ops._kernels import ( + sampling_scaling_penalties as _sampling_scaling_penalties, +) def init_custom_reduce(rank_id, num_devices, buffers, barrier_in, 
barrier_out): @@ -39,6 +42,10 @@ def moe_align_block_size( ) +def sampling_scaling_penalties(logits, scaling_penalties): + return _sampling_scaling_penalties(logits, scaling_penalties) + + def int8_scaled_mm(mat_a, mat_b, scales_a, scales_b, out_dtype, bias=None): return _int8_scaled_mm( mat_a, diff --git a/sgl-kernel/tests/test_sampling_scaling_penalties.py b/sgl-kernel/tests/test_sampling_scaling_penalties.py new file mode 100644 index 000000000000..4b9746fd7934 --- /dev/null +++ b/sgl-kernel/tests/test_sampling_scaling_penalties.py @@ -0,0 +1,39 @@ +import torch +from sgl_kernel import sampling_scaling_penalties + + +def test_sampling_scaling_penalties(): + batch_sizes = [1, 2, 4, 8, 16, 32, 64, 65] + vocab_sizes = [2048, 4096, 8192, 16384, 32768, 32767] + dtypes = [torch.float32, torch.half, torch.bfloat16] + device = torch.device("cuda") + + for dtype in dtypes: + rtol = 1e-3 + atol = 1e-3 + + for bs in batch_sizes: + for vocab_size in vocab_sizes: + logits = torch.randn(bs, vocab_size, device=device, dtype=dtype) + scaling_penalties = ( + torch.rand(bs, vocab_size, device=device, dtype=dtype) + 0.5 + ) + + ref_output = torch.where( + logits > 0, logits / scaling_penalties, logits * scaling_penalties + ) + + kernel_output = sampling_scaling_penalties(logits, scaling_penalties) + + torch.testing.assert_close( + kernel_output, + ref_output, + rtol=rtol, + atol=atol, + msg=f"Failed for batch_size={bs}, vocab_size={vocab_size}, dtype={dtype}", + ) + + +if __name__ == "__main__": + test_sampling_scaling_penalties() + print("All tests passed!") From a879c2fb4cf5e976f24a4dc95d21b2af99a7624b Mon Sep 17 00:00:00 2001 From: Yineng Zhang Date: Mon, 13 Jan 2025 12:27:17 +0800 Subject: [PATCH 025/248] fix sgl-kernel build (#2850) --- .github/workflows/release-pypi-kernel.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/release-pypi-kernel.yml b/.github/workflows/release-pypi-kernel.yml index f046538a6fad..362088c47fd1 100644 --- a/.github/workflows/release-pypi-kernel.yml +++ b/.github/workflows/release-pypi-kernel.yml @@ -22,6 +22,8 @@ jobs: steps: - uses: actions/checkout@v4 + with: + submodules: 'recursive' - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v5 From 85b2e05770ea453bf619d20b5e41679e4b86efb6 Mon Sep 17 00:00:00 2001 From: Ke Bao Date: Mon, 13 Jan 2025 13:16:58 +0800 Subject: [PATCH 026/248] Add int8 quant kernel (#2848) --- .../kernels/quantization/bench_int8_quant.py | 94 +++++++++++++++++++ .../srt/layers/quantization/int8_kernel.py | 53 +++++++++++ 2 files changed, 147 insertions(+) create mode 100644 benchmark/kernels/quantization/bench_int8_quant.py create mode 100644 python/sglang/srt/layers/quantization/int8_kernel.py diff --git a/benchmark/kernels/quantization/bench_int8_quant.py b/benchmark/kernels/quantization/bench_int8_quant.py new file mode 100644 index 000000000000..94b795690bfc --- /dev/null +++ b/benchmark/kernels/quantization/bench_int8_quant.py @@ -0,0 +1,94 @@ +import argparse + +import torch +import triton +from vllm._custom_ops import scaled_int8_quant as vllm_scaled_int8_quant + +from sglang.srt.layers.quantization.int8_kernel import per_token_quant_int8 + + +@torch.compile(backend="inductor") +def torch_int8_quant(x): + int8_max = torch.iinfo(torch.int8).max + + abs_max = x.abs().max(dim=-1, keepdim=True).values + scales = abs_max.to(torch.float32) / float(int8_max) + + q_x = (x / scales).round().to(torch.int8) + + return q_x, scales + + +def _test_accuracy_once(M, K, input_dtype, device): + x = 
torch.randn(M, K, dtype=input_dtype, device=device) * 5000 + out, scales, _ = vllm_scaled_int8_quant(x, symmetric=True) + out1, scales1 = per_token_quant_int8(x) + out2, scales2 = torch_int8_quant(x) + torch.testing.assert_close(out, out2, atol=1, rtol=0) + torch.testing.assert_close(out, out1, atol=1, rtol=0) + torch.testing.assert_close(scales, scales2) + torch.testing.assert_close(scales1, scales2) + print(f"M: {M}, K: {K}, type: {input_dtype} OK") + + +def test_accuracy(): + Ms = [1, 13, 128, 1024, 2048, 4096] + Ks = [512, 1024, 2048, 8192] + input_dtypes = [torch.float16, torch.bfloat16] + for M in Ms: + for K in Ks: + for input_dtype in input_dtypes: + _test_accuracy_once(M, K, input_dtype, "cuda") + + +@triton.testing.perf_report( + triton.testing.Benchmark( + x_names=["batch_size"], + x_vals=[1, 16, 32, 64, 128, 256, 512, 1024, 2048], + x_log=False, + line_arg="provider", + line_vals=["vllm op", "triton", "torch.compile"], + line_names=["vllm op", "triton", "torch.compile"], + styles=[("blue", "-"), ("orange", "-"), ("red", "-")], + ylabel="ms", + plot_name="int8 per token quant", + args={}, + ) +) +def benchmark(batch_size, provider): + M, K = batch_size, 16384 + x = torch.randn(M, K, dtype=torch.float16, device="cuda") * 1000 + + quantiles = [0.5, 0.2, 0.8] + if provider == "vllm op": + ms, min_ms, max_ms = triton.testing.do_bench( + lambda: vllm_scaled_int8_quant(x, symmetric=True), + quantiles=quantiles, + ) + if provider == "triton": + ms, min_ms, max_ms = triton.testing.do_bench( + lambda: per_token_quant_int8(x), + quantiles=quantiles, + ) + if provider == "torch.compile": + ms, min_ms, max_ms = triton.testing.do_bench( + lambda: torch_int8_quant(x), + quantiles=quantiles, + ) + + return ms, min_ms, max_ms + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--save_path", + type=str, + default="./bench_int8_quant_res", + help="Path to save int8 quant benchmark results", + ) + args = parser.parse_args() + + test_accuracy() + + benchmark.run(print_data=True, show_plots=True, save_path=args.save_path) diff --git a/python/sglang/srt/layers/quantization/int8_kernel.py b/python/sglang/srt/layers/quantization/int8_kernel.py new file mode 100644 index 000000000000..d1e74c6044de --- /dev/null +++ b/python/sglang/srt/layers/quantization/int8_kernel.py @@ -0,0 +1,53 @@ +import torch +import triton +import triton.language as tl + + +@triton.jit +def _per_token_quant_int8( + x_ptr, + xq_ptr, + scale_ptr, + stride_x, + stride_xq, + N, + BLOCK: tl.constexpr, +): + # Adapted from https://github.com/InternLM/lmdeploy/blob/086481ed84b59bee3b8e4274e5fc69620040c048/lmdeploy/pytorch/kernels/cuda/w8a8_triton_kernels.py#L282 + row_id = tl.program_id(0) + + cols = tl.arange(0, BLOCK) + mask = cols < N + + x = tl.load(x_ptr + row_id * stride_x + cols, mask=mask, other=0.0).to(tl.float32) + absmax = tl.maximum(tl.max(tl.abs(x)), 1e-10) + scale_x = absmax / 127 + x_q = tl.extra.cuda.libdevice.round(x / scale_x).to(tl.int8) + + tl.store(xq_ptr + row_id * stride_xq + cols, x_q, mask=mask) + tl.store(scale_ptr + row_id, scale_x) + + +def per_token_quant_int8(x): + M = x.numel() // x.shape[-1] + N = x.shape[-1] + x_q = torch.empty_like(x, device=x.device, dtype=torch.int8) + scales = torch.empty(x.shape[:-1] + (1,), device=x.device, dtype=torch.float32) + BLOCK = triton.next_power_of_2(N) + # heuristics for number of warps + num_warps = min(max(BLOCK // 256, 1), 8) + + assert x.is_contiguous() + _per_token_quant_int8[(M,)]( + x, + x_q, + scales, + 
stride_x=x.stride(-2), + stride_xq=x_q.stride(-2), + N=N, + BLOCK=BLOCK, + num_warps=num_warps, + num_stages=1, + ) + + return x_q, scales From 0bb0f7631114b8a4b614ec8f197327ea7fce645d Mon Sep 17 00:00:00 2001 From: bjmsong Date: Mon, 13 Jan 2025 13:17:11 +0800 Subject: [PATCH 027/248] Support FP8 E4M3 KV Cache (#2786) Co-authored-by: root --- .../layers/attention/flashinfer_backend.py | 16 ++++- python/sglang/srt/layers/radix_attention.py | 2 + python/sglang/srt/mem_cache/memory_pool.py | 10 +-- .../sglang/srt/model_executor/model_runner.py | 27 ++++++++ python/sglang/srt/models/llama.py | 33 +++++++++- python/sglang/srt/server_args.py | 15 ++++- python/sglang/srt/utils.py | 6 ++ test/srt/kv_cache_scales_llama3_1_8b.json | 42 ++++++++++++ test/srt/test_fp8_kvcache.py | 64 +++++++++++++++++++ 9 files changed, 205 insertions(+), 10 deletions(-) create mode 100644 test/srt/kv_cache_scales_llama3_1_8b.json create mode 100644 test/srt/test_fp8_kvcache.py diff --git a/python/sglang/srt/layers/attention/flashinfer_backend.py b/python/sglang/srt/layers/attention/flashinfer_backend.py index fc3455b60774..f038394628fd 100644 --- a/python/sglang/srt/layers/attention/flashinfer_backend.py +++ b/python/sglang/srt/layers/attention/flashinfer_backend.py @@ -353,7 +353,9 @@ def forward_extend( if k is not None: assert v is not None if save_kv_cache: - forward_batch.token_to_kv_pool.set_kv_buffer(layer, cache_loc, k, v) + forward_batch.token_to_kv_pool.set_kv_buffer( + layer, cache_loc, k, v, layer.k_scale, layer.v_scale + ) o = prefill_wrapper_paged.forward( q.contiguous().view(-1, layer.tp_q_head_num, layer.head_dim), @@ -362,6 +364,8 @@ def forward_extend( sm_scale=layer.scaling, window_left=layer.sliding_window_size, logits_soft_cap=logits_soft_cap, + k_scale=layer.k_scale, + v_scale=layer.v_scale, ) else: o1, s1 = self.prefill_wrapper_ragged.forward_return_lse( @@ -387,7 +391,9 @@ def forward_extend( o, _ = merge_state(o1, s1, o2, s2) if save_kv_cache: - forward_batch.token_to_kv_pool.set_kv_buffer(layer, cache_loc, k, v) + forward_batch.token_to_kv_pool.set_kv_buffer( + layer, cache_loc, k, v, layer.k_scale, layer.v_scale + ) return o.view(-1, layer.tp_q_head_num * layer.head_dim) @@ -412,13 +418,17 @@ def forward_decode( if k is not None: assert v is not None if save_kv_cache: - forward_batch.token_to_kv_pool.set_kv_buffer(layer, cache_loc, k, v) + forward_batch.token_to_kv_pool.set_kv_buffer( + layer, cache_loc, k, v, layer.k_scale, layer.v_scale + ) o = decode_wrapper.forward( q.contiguous().view(-1, layer.tp_q_head_num, layer.head_dim), forward_batch.token_to_kv_pool.get_kv_buffer(layer.layer_id), sm_scale=layer.scaling, logits_soft_cap=layer.logit_cap, + k_scale=layer.k_scale, + v_scale=layer.v_scale, ) return o.view(-1, layer.tp_q_head_num * layer.head_dim) diff --git a/python/sglang/srt/layers/radix_attention.py b/python/sglang/srt/layers/radix_attention.py index 4b762c00ba55..a449d7188a46 100644 --- a/python/sglang/srt/layers/radix_attention.py +++ b/python/sglang/srt/layers/radix_attention.py @@ -47,6 +47,8 @@ def __init__( self.logit_cap = logit_cap self.sliding_window_size = sliding_window_size or -1 self.is_cross_attention = is_cross_attention + self.k_scale = 1.0 + self.v_scale = 1.0 def forward( self, diff --git a/python/sglang/srt/mem_cache/memory_pool.py b/python/sglang/srt/mem_cache/memory_pool.py index b67f085b204b..6cb186577238 100644 --- a/python/sglang/srt/mem_cache/memory_pool.py +++ b/python/sglang/srt/mem_cache/memory_pool.py @@ -109,8 +109,8 @@ def __init__( ): self.size = 
size self.dtype = dtype - if dtype == torch.float8_e5m2: - # NOTE: Store as torch.uint8 because Tensor index_put is not implemented for torch.float8_e5m2 + if dtype in (torch.float8_e5m2, torch.float8_e4m3fn): + # NOTE: Store as torch.uint8 because Tensor.index_put is not implemented for torch.float8_e5m2 self.store_dtype = torch.uint8 else: self.store_dtype = dtype @@ -256,11 +256,13 @@ def set_kv_buffer( loc: torch.Tensor, cache_k: torch.Tensor, cache_v: torch.Tensor, + k_scale: float = 1.0, + v_scale: float = 1.0, ): layer_id = layer.layer_id if cache_k.dtype != self.dtype: - cache_k = cache_k.to(self.dtype) - cache_v = cache_v.to(self.dtype) + cache_k = (cache_k / k_scale).to(self.dtype) + cache_v = (cache_v / v_scale).to(self.dtype) if self.store_dtype != self.dtype: self.k_buffer[layer_id][loc] = cache_k.view(self.store_dtype) self.v_buffer[layer_id][loc] = cache_v.view(self.store_dtype) diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index efba8c25b504..d46a2c0dc725 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -54,6 +54,7 @@ enable_show_time_cost, get_available_gpu_memory, init_custom_process_group, + is_cuda, is_hip, monkey_patch_vllm_gguf_config, monkey_patch_vllm_p2p_access_check, @@ -277,6 +278,29 @@ def load_model(self): device_config=DeviceConfig(self.device), ) + if self.server_args.kv_cache_dtype == "fp8_e4m3": + if self.server_args.quantization_param_path is not None: + if callable(getattr(self.model, "load_kv_cache_scales", None)): + self.model.load_kv_cache_scales( + self.server_args.quantization_param_path + ) + logger.info( + "Loaded KV cache scaling factors from %s", + self.server_args.quantization_param_path, + ) + else: + raise RuntimeError( + "Using FP8 KV cache and scaling factors provided but " + "model %s does not support loading scaling factors.", + self.model.__class__, + ) + else: + logger.warning( + "Using FP8 KV cache but no scaling factors " + "provided. Defaulting to scaling factors of 1.0. " + "This may lead to less accurate results!" + ) + # Parse other args self.sliding_window_size = ( self.model.get_attention_sliding_window_size() @@ -516,6 +540,9 @@ def init_memory_pool( self.kv_cache_dtype = torch.float8_e5m2fnuz else: self.kv_cache_dtype = torch.float8_e5m2 + elif self.server_args.kv_cache_dtype == "fp8_e4m3": + if is_cuda(): + self.kv_cache_dtype = torch.float8_e4m3fn else: raise ValueError( f"Unsupported kv_cache_dtype: {self.server_args.kv_cache_dtype}." diff --git a/python/sglang/srt/models/llama.py b/python/sglang/srt/models/llama.py index e1688df01a8c..d606e52f8b8d 100644 --- a/python/sglang/srt/models/llama.py +++ b/python/sglang/srt/models/llama.py @@ -22,8 +22,12 @@ import torch from torch import nn from transformers import LlamaConfig -from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.distributed import ( + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, +) from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.model_loader.weight_utils import kv_cache_scales_loader from sglang.srt.layers.activation import SiluAndMul from sglang.srt.layers.layernorm import RMSNorm @@ -299,6 +303,30 @@ def forward( hidden_states, _ = self.norm(hidden_states, residual) return hidden_states + # If this function is called, it should always initialize KV cache scale + # factors (or else raise an exception). 
Thus, handled exceptions should + # make sure to leave KV cache scale factors in a known good (dummy) state + def load_kv_cache_scales(self, quantization_param_path: str) -> None: + tp_size = get_tensor_model_parallel_world_size() + tp_rank = get_tensor_model_parallel_rank() + for layer_idx, scaling_factor in kv_cache_scales_loader( + quantization_param_path, + tp_rank, + tp_size, + self.config.num_hidden_layers, + self.config.__class__.model_type, + ): + if not isinstance(self.layers[layer_idx], nn.Identity): + layer_self_attn = self.layers[layer_idx].self_attn + + if hasattr(layer_self_attn.attn, "k_scale"): + layer_self_attn.attn.k_scale = scaling_factor + layer_self_attn.attn.v_scale = scaling_factor + else: + raise RuntimeError( + "Self attention has no KV cache scaling " "factor attribute!" + ) + class LlamaForCausalLM(nn.Module): @@ -534,6 +562,9 @@ def set_embed_and_head(self, embed, head): torch.cuda.empty_cache() torch.cuda.synchronize() + def load_kv_cache_scales(self, quantization_param_path: str) -> None: + self.model.load_kv_cache_scales(quantization_param_path) + class Phi3ForCausalLM(LlamaForCausalLM): pass diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index 66739652aa9d..be85a3670d40 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -32,6 +32,7 @@ is_hip, is_ipv6, is_port_available, + nullable_str, ) logger = logging.getLogger(__name__) @@ -47,6 +48,7 @@ class ServerArgs: trust_remote_code: bool = True dtype: str = "auto" kv_cache_dtype: str = "auto" + quantization_param_path: nullable_str = None quantization: Optional[str] = None context_length: Optional[int] = None device: str = "cuda" @@ -350,8 +352,17 @@ def add_cli_args(parser: argparse.ArgumentParser): "--kv-cache-dtype", type=str, default=ServerArgs.kv_cache_dtype, - choices=["auto", "fp8_e5m2"], - help='Data type for kv cache storage. "auto" will use model data type. "fp8_e5m2" is supported for CUDA 11.8+.', + choices=["auto", "fp8_e5m2", "fp8_e4m3"], + help='Data type for kv cache storage. "auto" will use model data type. "fp8_e5m2" and "fp8_e4m3" is supported for CUDA 11.8+.', + ) + parser.add_argument( + "--quantization-param-path", + type=nullable_str, + default=None, + help="Path to the JSON file containing the KV cache " + "scaling factors. This should generally be supplied, when " + "KV cache dtype is FP8. Otherwise, KV cache scaling factors " + "default to 1.0, which may cause accuracy issues. 
", ) parser.add_argument( "--quantization", diff --git a/python/sglang/srt/utils.py b/python/sglang/srt/utils.py index b07f6f01d184..af9bdd60b66f 100644 --- a/python/sglang/srt/utils.py +++ b/python/sglang/srt/utils.py @@ -1375,3 +1375,9 @@ def wrapper(*args, **kwargs): return func(*args, **kwargs) return wrapper + + +def nullable_str(val: str): + if not val or val == "None": + return None + return val diff --git a/test/srt/kv_cache_scales_llama3_1_8b.json b/test/srt/kv_cache_scales_llama3_1_8b.json new file mode 100644 index 000000000000..3e890e50e4af --- /dev/null +++ b/test/srt/kv_cache_scales_llama3_1_8b.json @@ -0,0 +1,42 @@ +{ + "model_type": "llama", + "kv_cache": { + "dtype": "float8_e4m3fn", + "scaling_factor": { + "0": { + "0": 1, + "1": 1, + "2": 1, + "3": 1, + "4": 1, + "5": 1, + "6": 1, + "7": 1, + "8": 1, + "9": 1, + "10": 1, + "11": 1, + "12": 1, + "13": 1, + "14": 1, + "15": 1, + "16": 1, + "17": 1, + "18": 1, + "19": 1, + "20": 1, + "21": 1, + "22": 1, + "23": 1, + "24": 1, + "25": 1, + "26": 1, + "27": 1, + "28": 1, + "29": 1, + "30": 1, + "31": 1 + } + } + } +} diff --git a/test/srt/test_fp8_kvcache.py b/test/srt/test_fp8_kvcache.py new file mode 100644 index 000000000000..0d6602997de5 --- /dev/null +++ b/test/srt/test_fp8_kvcache.py @@ -0,0 +1,64 @@ +import os +import unittest +from types import SimpleNamespace + +from sglang.srt.utils import kill_process_tree +from sglang.test.run_eval import run_eval +from sglang.test.test_utils import ( + DEFAULT_MODEL_NAME_FOR_TEST, + DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + DEFAULT_URL_FOR_TEST, + popen_launch_server, +) + + +class TestFp8Kvcache(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.model = DEFAULT_MODEL_NAME_FOR_TEST + cls.base_url = DEFAULT_URL_FOR_TEST + dirpath = os.path.dirname(__file__) + config_file = os.path.join(dirpath, "kv_cache_scales_llama3_8b_chat.json") + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=[ + "--kv-cache-dtype", + "fp8_e4m3", + "--quantization-param-path", + config_file, + ], + ) + + @classmethod + def tearDownClass(cls): + kill_process_tree(cls.process.pid) + + def test_mgsm_en(self): + args = SimpleNamespace( + base_url=self.base_url, + model=self.model, + eval_name="mgsm_en", + num_examples=None, + num_threads=1024, + ) + + metrics = run_eval(args) + self.assertGreater(metrics["score"], 0.835) + + def test_mmlu(self): + args = SimpleNamespace( + base_url=self.base_url, + model=self.model, + eval_name="mmlu", + num_examples=64, + num_threads=32, + ) + + metrics = run_eval(args) + self.assertGreaterEqual(metrics["score"], 0.65) + + +if __name__ == "__main__": + unittest.main() From a18ab81ddd505fed4b663c1e3b6df81e6613484a Mon Sep 17 00:00:00 2001 From: sogalin <39478626+sogalin@users.noreply.github.com> Date: Mon, 13 Jan 2025 14:39:44 +0800 Subject: [PATCH 028/248] Update base image for ROCm (#2852) Co-authored-by: HAI --- docker/Dockerfile.rocm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index 9b1d67b5e4f7..7e6ae193aae5 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -2,7 +2,7 @@ # docker build --build-arg SGL_BRANCH=v0.4.1.post5 -t v0.4.1.post5-rocm620 -f Dockerfile.rocm . 
# default base image -ARG BASE_IMAGE="rocmshared/vllm-rocm:20241031-tuned" +ARG BASE_IMAGE="rocmshared/vllm-rocm:20250113-tuned-elementwise" FROM $BASE_IMAGE AS base USER root From e808c1df3e046d2c590efa32a22ebcb8741593ed Mon Sep 17 00:00:00 2001 From: kk <43161300+kkHuang-amd@users.noreply.github.com> Date: Mon, 13 Jan 2025 16:23:07 +0800 Subject: [PATCH 029/248] Integrate ROCm ater package for ck moe function feasibility (#2854) Co-authored-by: wunhuang Co-authored-by: Lin, Soga --- docker/Dockerfile.rocm | 9 ++ .../srt/layers/moe/fused_moe_triton/layer.py | 45 ++++-- python/sglang/srt/layers/quantization/fp8.py | 143 ++++++++++++------ python/sglang/srt/utils.py | 19 +++ 4 files changed, 162 insertions(+), 54 deletions(-) diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index 7e6ae193aae5..2ad62d2d493d 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -16,6 +16,10 @@ ARG SGL_BRANCH=${SGL_DEFAULT} ARG TRITON_REPO="https://github.com/triton-lang/triton.git" ARG TRITON_COMMIT="845d75a" + +ARG ATER_REPO="https://github.com/HaiShaw/ater" +ARG CK_COMMITS="fa05ae" + RUN git clone ${SGL_REPO} \ && cd sglang \ && if [ "${SGL_BRANCH}" = ${SGL_DEFAULT} ]; then \ @@ -46,6 +50,11 @@ RUN git clone ${TRITON_REPO} \ && cd python \ && python3 setup.py install +RUN git clone ${ATER_REPO} \ + && cd ater \ + && git submodule update --init --recursive \ + && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py develop + # Performance environment variable. ENV HIP_FORCE_DEV_KERNARG=1 diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/layer.py b/python/sglang/srt/layers/moe/fused_moe_triton/layer.py index 8d0b7035ee50..e1064bcdabd1 100644 --- a/python/sglang/srt/layers/moe/fused_moe_triton/layer.py +++ b/python/sglang/srt/layers/moe/fused_moe_triton/layer.py @@ -1,5 +1,6 @@ # Adapted from https://github.com/vllm-project/vllm/blob/a6221a144af772fd1a68fe7e627935dc53e81738/vllm/model_executor/layers/fused_moe/layer.py +import os from abc import abstractmethod from enum import Enum from typing import Callable, List, Optional, Tuple @@ -18,7 +19,7 @@ QuantizationConfig, QuantizeMethodBase, ) -from sglang.srt.utils import set_weight_attrs +from sglang.srt.utils import is_hip, permute_weight, set_weight_attrs if torch.cuda.is_available(): from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_experts @@ -97,6 +98,20 @@ def create_weights( layer.register_parameter("w2_weight", w2_weight) set_weight_attrs(w2_weight, extra_weight_attrs) + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + if is_hip() and bool(int(os.getenv("CK_MOE", "0"))): + layer.w13_weight = torch.nn.Parameter( + permute_weight(layer.w13_weight.data), + requires_grad=False, + ) + torch.cuda.empty_cache() + layer.w2_weight = torch.nn.Parameter( + permute_weight(layer.w2_weight.data), + requires_grad=False, + ) + torch.cuda.empty_cache() + return + def apply( self, layer: torch.nn.Module, @@ -148,14 +163,26 @@ def forward_cuda( correction_bias=correction_bias, ) - return fused_experts( - hidden_states=x, - w1=layer.w13_weight, - w2=layer.w2_weight, - topk_weights=topk_weights, - topk_ids=topk_ids, - inplace=True, - ) + if is_hip() and bool(int(os.getenv("CK_MOE", "0"))): + import ater + from ater.fused_moe import fused_experts_ck + + return fused_experts_ck( + hidden_states=x, + w1=layer.w13_weight, + w2=layer.w2_weight, + topk_weights=topk_weights, + topk_ids=topk_ids, + ) + else: + return fused_experts( + hidden_states=x, + w1=layer.w13_weight, + w2=layer.w2_weight, + 
topk_weights=topk_weights, + topk_ids=topk_ids, + inplace=True, + ) def forward_cpu(self, *args, **kwargs): raise NotImplementedError("The CPU backend currently does not support MoE.") diff --git a/python/sglang/srt/layers/quantization/fp8.py b/python/sglang/srt/layers/quantization/fp8.py index f9e4a8a4ff45..22a43675bf85 100644 --- a/python/sglang/srt/layers/quantization/fp8.py +++ b/python/sglang/srt/layers/quantization/fp8.py @@ -40,6 +40,7 @@ from sglang.srt.utils import ( get_bool_env_var, is_hip, + permute_weight, print_warning_once, set_weight_attrs, ) @@ -616,18 +617,30 @@ def process_weights_after_loading(self, layer: Module) -> None: layer.w13_weight = torch.nn.Parameter(w13_weight, requires_grad=False) layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False) - # If ROCm, apply weight padding (min. Mem channel contention) only if set - if is_hip() and bool(int(os.getenv("MOE_PADDING", "0"))): - layer.w13_weight = torch.nn.Parameter( - F.pad(layer.w13_weight.data, (0, padding_size), "constant", 0), - requires_grad=False, - ) - torch.cuda.empty_cache() - layer.w2_weight = torch.nn.Parameter( - F.pad(layer.w2_weight.data, (0, padding_size), "constant", 0), - requires_grad=False, - ) - torch.cuda.empty_cache() + if is_hip(): + if bool(int(os.getenv("CK_MOE", "0"))): + layer.w13_weight = torch.nn.Parameter( + permute_weight(layer.w13_weight.data), + requires_grad=False, + ) + torch.cuda.empty_cache() + layer.w2_weight = torch.nn.Parameter( + permute_weight(layer.w2_weight.data), + requires_grad=False, + ) + torch.cuda.empty_cache() + elif bool(int(os.getenv("MOE_PADDING", "0"))): + # If ROCm, apply weight padding (min. Mem channel contention) only if set + layer.w13_weight = torch.nn.Parameter( + F.pad(layer.w13_weight.data, (0, padding_size), "constant", 0), + requires_grad=False, + ) + torch.cuda.empty_cache() + layer.w2_weight = torch.nn.Parameter( + F.pad(layer.w2_weight.data, (0, padding_size), "constant", 0), + requires_grad=False, + ) + torch.cuda.empty_cache() return # If checkpoint is fp8, we need to handle that the @@ -708,18 +721,30 @@ def process_weights_after_loading(self, layer: Module) -> None: max_w13_scales, requires_grad=False ) - # If ROCm, apply weight padding (min. Mem channel contention) only if set - if is_hip() and bool(int(os.getenv("MOE_PADDING", "0"))): - layer.w13_weight = torch.nn.Parameter( - F.pad(layer.w13_weight.data, (0, padding_size), "constant", 0), - requires_grad=False, - ) - torch.cuda.empty_cache() - layer.w2_weight = torch.nn.Parameter( - F.pad(layer.w2_weight.data, (0, padding_size), "constant", 0), - requires_grad=False, - ) - torch.cuda.empty_cache() + if is_hip(): + if bool(int(os.getenv("CK_MOE", "0"))): + layer.w13_weight = torch.nn.Parameter( + permute_weight(layer.w13_weight.data), + requires_grad=False, + ) + torch.cuda.empty_cache() + layer.w2_weight = torch.nn.Parameter( + permute_weight(layer.w2_weight.data), + requires_grad=False, + ) + torch.cuda.empty_cache() + elif bool(int(os.getenv("MOE_PADDING", "0"))): + # If ROCm, apply weight padding (min. 
Mem channel contention) only if set + layer.w13_weight = torch.nn.Parameter( + F.pad(layer.w13_weight.data, (0, padding_size), "constant", 0), + requires_grad=False, + ) + torch.cuda.empty_cache() + layer.w2_weight = torch.nn.Parameter( + F.pad(layer.w2_weight.data, (0, padding_size), "constant", 0), + requires_grad=False, + ) + torch.cuda.empty_cache() return def apply( @@ -752,27 +777,55 @@ def apply( correction_bias=correction_bias, ) - # Expert fusion with FP8 quantization - return fused_experts( - x, - layer.w13_weight, - layer.w2_weight, - topk_weights=topk_weights, - topk_ids=topk_ids, - inplace=True, - use_fp8_w8a8=True, - w1_scale=( - layer.w13_weight_scale_inv - if self.block_quant - else layer.w13_weight_scale - ), - w2_scale=( - layer.w2_weight_scale_inv if self.block_quant else layer.w2_weight_scale - ), - a1_scale=layer.w13_input_scale, - a2_scale=layer.w2_input_scale, - block_shape=self.quant_config.weight_block_size, - ) + if is_hip() and bool(int(os.getenv("CK_MOE", "0"))): + import ater + from ater.fused_moe import fused_experts_ck + + return fused_experts_ck( + x, + layer.w13_weight, + layer.w2_weight, + topk_weights=topk_weights, + topk_ids=topk_ids, + use_fp8_w8a8=True, + w1_scale=( + layer.w13_weight_scale_inv + if self.block_quant + else layer.w13_weight_scale + ), + w2_scale=( + layer.w2_weight_scale_inv + if self.block_quant + else layer.w2_weight_scale + ), + a1_scale=layer.w13_input_scale, + a2_scale=layer.w2_input_scale, + ) + + else: + # Expert fusion with FP8 quantization + return fused_experts( + x, + layer.w13_weight, + layer.w2_weight, + topk_weights=topk_weights, + topk_ids=topk_ids, + inplace=True, + use_fp8_w8a8=True, + w1_scale=( + layer.w13_weight_scale_inv + if self.block_quant + else layer.w13_weight_scale + ), + w2_scale=( + layer.w2_weight_scale_inv + if self.block_quant + else layer.w2_weight_scale + ), + a1_scale=layer.w13_input_scale, + a2_scale=layer.w2_input_scale, + block_shape=self.quant_config.weight_block_size, + ) class Fp8KVCacheMethod(BaseKVCacheMethod): diff --git a/python/sglang/srt/utils.py b/python/sglang/srt/utils.py index af9bdd60b66f..51ca91a96b0d 100644 --- a/python/sglang/srt/utils.py +++ b/python/sglang/srt/utils.py @@ -1340,6 +1340,25 @@ def parse_tool_response(text, tools, **kwargs): return text, call_info_list +def permute_weight(x: torch.Tensor) -> torch.Tensor: + b_ = x.shape[0] + n_ = x.shape[1] + k_ = x.shape[2] + + x_ = x + if x.dtype == torch.bfloat16 or x.dtype == torch.float16: + x_ = x_.view(int(b_), int(n_ / 16), 16, int(k_ / 32), 4, 8) + elif x.dtype == torch.float8_e4m3fnuz or x.dtype == torch.int8: + x_ = x_.view(int(b_), int(n_ / 16), 16, int(k_ / 64), 4, 16) + else: + return x_ + + x_ = x_.permute(0, 1, 3, 4, 2, 5) + x_ = x_.contiguous() + x_ = x_.view(*x.shape) + return x_ + + class MultiprocessingSerializer: @staticmethod def serialize(obj): From 4093aa4660838c42a51f860989450b7d4c480436 Mon Sep 17 00:00:00 2001 From: justdoit <24875266+coolhok@users.noreply.github.com> Date: Mon, 13 Jan 2025 17:01:21 +0800 Subject: [PATCH 030/248] [Fix]eagle2 health_generate is first request,apiserver will core (#2853) --- python/sglang/srt/speculative/eagle_worker.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/sglang/srt/speculative/eagle_worker.py b/python/sglang/srt/speculative/eagle_worker.py index 0e53506a8840..2a6ec96048bb 100644 --- a/python/sglang/srt/speculative/eagle_worker.py +++ b/python/sglang/srt/speculative/eagle_worker.py @@ -40,6 +40,7 @@ def __init__( ) self.target_worker = target_worker 
self.server_args = server_args + self.finish_extend_len = [] # Share the embedding and lm_head embed, head = self.target_worker.model_runner.model.get_embed_and_head() From 72c77763559317b2c8bddfd67e173b67aa1facb0 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Mon, 13 Jan 2025 01:39:14 -0800 Subject: [PATCH 031/248] Fix linear.py and improve weight loading (#2851) Co-authored-by: SangBin Cho --- benchmark/deepseek_v3/README.md | 7 +- docs/references/supported_models.md | 2 +- python/sglang/srt/layers/linear.py | 134 +++++------------- python/sglang/srt/layers/moe/topk.py | 6 +- python/sglang/srt/layers/parameter.py | 40 +++--- .../srt/layers/quantization/fp8_utils.py | 2 +- .../srt/layers/quantization/modelopt_quant.py | 2 +- .../srt/layers/vocab_parallel_embedding.py | 17 ++- python/sglang/srt/managers/scheduler.py | 4 + python/sglang/srt/mem_cache/memory_pool.py | 19 +++ python/sglang/srt/server.py | 3 + test/srt/test_moe_eval_accuracy_large.py | 2 +- 12 files changed, 113 insertions(+), 125 deletions(-) diff --git a/benchmark/deepseek_v3/README.md b/benchmark/deepseek_v3/README.md index d14a8d55630c..5c353bca5c79 100644 --- a/benchmark/deepseek_v3/README.md +++ b/benchmark/deepseek_v3/README.md @@ -39,7 +39,7 @@ python3 -m sglang.launch_server --model deepseek-ai/DeepSeek-V3 --tp 8 --trust-r For high QPS scenarios, add the `--enable-dp-attention` argument to boost throughput. -### Example with OpenAI API +### Example: Sending requests with OpenAI API ```python3 import openai @@ -58,7 +58,8 @@ response = client.chat.completions.create( ) print(response) ``` -### Example serving with 2 H20*8 + +### Example: Serving with two H20*8 nodes For example, there are two H20 nodes, each with 8 GPUs. The first node's IP is `10.0.0.1`, and the second node's IP is `10.0.0.2`. ```bash @@ -71,7 +72,7 @@ python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3 --tp 16 --di If you have two H100 nodes, the usage is similar to the aforementioned H20. -### Example serving with Docker two H200*8 nodes +### Example: Serving with two H200*8 nodes and docker There are two H200 nodes, each with 8 GPUs. The first node's IP is `192.168.114.10`, and the second node's IP is `192.168.114.11`. Configure the endpoint to expose it to another Docker container using `--host 0.0.0.0` and `--port 40000`, and set up communications with `--dist-init-addr 192.168.114.10:20000`. A single H200 with 8 devices can run DeepSeek V3, the dual H200 setup is just to demonstrate multi-node usage. 
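A minimal sketch of the per-node launch commands for this H200 setup, assuming the same multi-node flags as the H20 example above (`--dist-init-addr`, `--nnodes`, `--node-rank`) with the addresses and ports named in this paragraph substituted in; the Docker invocation itself is omitted here and is not taken from this patch.

```bash
# node 1 (192.168.114.10): also exposes the HTTP endpoint on port 40000
python3 -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3 --tp 16 \
    --dist-init-addr 192.168.114.10:20000 --nnodes 2 --node-rank 0 \
    --trust-remote-code --host 0.0.0.0 --port 40000

# node 2 (192.168.114.11)
python3 -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3 --tp 16 \
    --dist-init-addr 192.168.114.10:20000 --nnodes 2 --node-rank 1 \
    --trust-remote-code
```

Requests can then be sent to `http://192.168.114.10:40000` from another container using the OpenAI client example shown earlier in this README.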
diff --git a/docs/references/supported_models.md b/docs/references/supported_models.md index 9dafc3d2a3d7..1cc7b874732d 100644 --- a/docs/references/supported_models.md +++ b/docs/references/supported_models.md @@ -5,7 +5,7 @@ - Mistral / Mixtral / Mistral NeMo - Gemma / Gemma 2 - Qwen / Qwen 2 / Qwen 2 MoE / Qwen 2 VL -- DeepSeek / DeepSeek 2 +- DeepSeek / DeepSeek 2 / [DeepSeek 3](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3) - OLMoE - [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/) - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-7b-ov --port=30000 --chat-template=chatml-llava` diff --git a/python/sglang/srt/layers/linear.py b/python/sglang/srt/layers/linear.py index b839deeb3251..ee9386c13fa3 100644 --- a/python/sglang/srt/layers/linear.py +++ b/python/sglang/srt/layers/linear.py @@ -1,4 +1,4 @@ -# Adapted from https://raw.githubusercontent.com/vllm-project/vllm/v0.5.5/vllm/model_executor/layers/linear.py +"""Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/model_executor/layers/linear.py""" import logging from abc import abstractmethod @@ -16,7 +16,7 @@ tensor_model_parallel_all_reduce, ) -# workaround +# Workaround: many QuantizationConfig still depends on this, so we have to use vLLM's LinearBase now. from vllm.model_executor.layers.linear import LinearBase from sglang.srt.layers.parameter import ( @@ -25,7 +25,6 @@ PackedvLLMParameter, PerTensorScaleParameter, RowvLLMParameter, - _ColumnvLLMParameter, ) from sglang.srt.layers.quantization.base_config import ( QuantizationConfig, @@ -43,9 +42,13 @@ "GPTQMarlinLinearMethod", "Fp8LinearMethod", "MarlinLinearMethod", - "GPTQLinearMethod", "QQQLinearMethod", + "GPTQMarlin24LinearMethod", + "TPUInt8LinearMethod", + "GPTQLinearMethod", + "FBGEMMFp8LinearMethod", "ModelOptFp8LinearMethod", + "IPEXAWQLinearMethod", ] @@ -95,62 +98,6 @@ def adjust_scalar_to_fused_array(param, loaded_weight, shard_id): return param[shard_id], loaded_weight -def load_column_qkv_weight( - self, loaded_weight, num_heads, shard_id, shard_offset, shard_size, tp_rank -): - if ( - isinstance(self, (PackedColumnParameter, PackedvLLMParameter)) - and self.output_dim == self.packed_dim - ): - shard_size, shard_offset = self.adjust_shard_indexes_for_packing( - shard_offset=shard_offset, shard_size=shard_size - ) - - param_data = self.data - shard_id = tp_rank if shard_id == "q" else tp_rank // num_heads - param_data = param_data.narrow(self.output_dim, shard_offset, shard_size) - loaded_weight = loaded_weight.narrow( - self.output_dim, shard_id * shard_size, shard_size - ) - - assert param_data.shape == loaded_weight.shape - param_data.copy_(loaded_weight) - - -def load_column_parallel_weight( - self, loaded_weight: torch.Tensor, tp_rank, use_presharded_weights: bool = False -): - if isinstance(self, _ColumnvLLMParameter): - if not use_presharded_weights: - shard_size = self.data.shape[self.output_dim] - loaded_weight = loaded_weight.narrow( - self.output_dim, tp_rank * shard_size, shard_size - ) - assert self.data.shape == loaded_weight.shape - self.data.copy_(loaded_weight) - else: - self.data.copy_(loaded_weight) - - -def load_row_parallel_weight( - self, loaded_weight: torch.Tensor, tp_rank, use_presharded_weights: bool = False -): - if isinstance(self, RowvLLMParameter): - if not use_presharded_weights: - shard_size = self.data.shape[self.input_dim] - loaded_weight = loaded_weight.narrow( - self.input_dim, tp_rank * shard_size, shard_size - ) - - if 
len(loaded_weight.shape) == 0: - loaded_weight = loaded_weight.reshape(1) - - assert self.data.shape == loaded_weight.shape - self.data.copy_(loaded_weight) - else: - self.data.copy_(loaded_weight) - - class LinearMethodBase(QuantizeMethodBase): """Base class for different (maybe quantized) linear methods.""" @@ -426,9 +373,7 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): if len(loaded_weight.shape) == 0: loaded_weight = loaded_weight.reshape(1) - assert ( - param_data.shape == loaded_weight.shape - ), f"{param_data.shape=}, {loaded_weight.shape=}" + assert param_data.shape == loaded_weight.shape param_data.copy_(loaded_weight) def weight_loader_v2(self, param: Parameter, loaded_weight: torch.Tensor): @@ -437,7 +382,7 @@ def weight_loader_v2(self, param: Parameter, loaded_weight: torch.Tensor): if len(loaded_weight.shape) == 0: assert loaded_weight.numel() == 1 loaded_weight = loaded_weight.reshape(1) - param.load_column_parallel_weight(loaded_weight=loaded_weight) + param.load_column_parallel_weight(loaded_weight, tp_rank=self.tp_rank) def forward(self, input_): bias = self.bias if not self.skip_bias_add else None @@ -565,9 +510,7 @@ def weight_loader( param_data, loaded_weight, 0 ) - assert ( - param_data.shape == loaded_weight.shape - ), f"{param_data.shape=}, {loaded_weight.shape=}" + assert param_data.shape == loaded_weight.shape param_data.copy_(loaded_weight) return current_shard_offset = 0 @@ -643,9 +586,7 @@ def weight_loader( "the same for all partitions." ) - assert ( - param_data.shape == loaded_weight.shape - ), f"{param_data.shape=}, {loaded_weight.shape=}" + assert param_data.shape == loaded_weight.shape param_data.copy_(loaded_weight) def _load_fused_module_from_checkpoint( @@ -697,6 +638,7 @@ def weight_loader_v2( elif type(param) in (RowvLLMParameter, BasevLLMParameter): param.load_merged_column_weight(loaded_weight=loaded_weight) return + # TODO: @dsikka - move to parameter.py self._load_fused_module_from_checkpoint(param, loaded_weight) return @@ -882,6 +824,7 @@ def weight_loader_v2( elif type(param) in (RowvLLMParameter, BasevLLMParameter): param.load_qkv_weight(loaded_weight=loaded_weight) return + # TODO: @dsikka - move to parameter.py self._load_fused_module_from_checkpoint(param, loaded_weight) return @@ -896,24 +839,14 @@ def weight_loader_v2( shard_offset = (shard_offset + block_n - 1) // block_n shard_size = (shard_size + block_n - 1) // block_n - if isinstance(param, _ColumnvLLMParameter): - load_column_qkv_weight( - param, - loaded_weight, - num_heads=self.num_kv_head_replicas, - shard_id=loaded_shard_id, - shard_offset=shard_offset, - shard_size=shard_size, - tp_rank=self.tp_rank, - ) - else: - param.load_qkv_weight( - loaded_weight=loaded_weight, - num_heads=self.num_kv_head_replicas, - shard_id=loaded_shard_id, - shard_offset=shard_offset, - shard_size=shard_size, - ) + param.load_qkv_weight( + loaded_weight=loaded_weight, + num_heads=self.num_kv_head_replicas, + shard_id=loaded_shard_id, + shard_offset=shard_offset, + shard_size=shard_size, + tp_rank=self.tp_rank, + ) def weight_loader( self, @@ -962,9 +895,7 @@ def weight_loader( param_data, loaded_weight, 0 ) - assert ( - param_data.shape == loaded_weight.shape - ), f"{param_data.shape=}, {loaded_weight.shape=}" + assert param_data.shape == loaded_weight.shape param_data.copy_(loaded_weight) return shard_offsets = [ @@ -1105,9 +1036,7 @@ def weight_loader( "for all partitions." 
) - assert ( - param_data.shape == loaded_weight.shape - ), f"{param_data.shape=}, {loaded_weight.shape=}" + assert param_data.shape == loaded_weight.shape param_data.copy_(loaded_weight) @@ -1234,9 +1163,7 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): if len(loaded_weight.shape) == 0: loaded_weight = loaded_weight.reshape(1) - assert ( - param_data.shape == loaded_weight.shape - ), f"{param_data.shape=}, {loaded_weight.shape=}" + assert param_data.shape == loaded_weight.shape param_data.copy_(loaded_weight) def weight_loader_v2(self, param: BasevLLMParameter, loaded_weight: torch.Tensor): @@ -1247,7 +1174,18 @@ def weight_loader_v2(self, param: BasevLLMParameter, loaded_weight: torch.Tensor assert loaded_weight.numel() == 1 loaded_weight = loaded_weight.reshape(1) - param.load_row_parallel_weight(loaded_weight=loaded_weight) + if isinstance(param, BasevLLMParameter): + # This `BasevLLMParameter` is defined in sglang/srt/layers/parameter.py, + # It supports additional parameters like tp_rank and use_presharded_weights. + param.load_row_parallel_weight( + loaded_weight, + tp_rank=self.tp_rank, + use_presharded_weights=self.use_presharded_weights, + ) + else: + # `params` is defined in `vllm/model_executor/parameter.py`, + # It does not support additional parameters. + param.load_row_parallel_weight(loaded_weight) def forward(self, input_): if self.input_is_parallel: diff --git a/python/sglang/srt/layers/moe/topk.py b/python/sglang/srt/layers/moe/topk.py index 8190321988dc..527a7d499b6a 100644 --- a/python/sglang/srt/layers/moe/topk.py +++ b/python/sglang/srt/layers/moe/topk.py @@ -24,7 +24,9 @@ def fused_topk_native( topk: int, renormalize: bool, ): - assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch" + assert ( + hidden_states.shape[0] == gating_output.shape[0] + ), f"Number of tokens mismatch, {hidden_states.shape=} vs {gating_output.shape=}" M, _ = hidden_states.shape topk_weights = torch.empty( M, topk, dtype=torch.float32, device=hidden_states.device @@ -180,7 +182,7 @@ def select_experts( num_expert_group=num_expert_group, topk_group=topk_group, ) - elif torch_native: + elif torch_native and custom_routing_function is None: topk_weights, topk_ids = fused_topk_native( hidden_states=hidden_states, gating_output=router_logits, diff --git a/python/sglang/srt/layers/parameter.py b/python/sglang/srt/layers/parameter.py index 435cc69bb51d..fe999baa2660 100644 --- a/python/sglang/srt/layers/parameter.py +++ b/python/sglang/srt/layers/parameter.py @@ -1,7 +1,4 @@ -""" -Adapted from vLLM (0.6.4.post1). 
-https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/parameter.py -""" +"""Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/model_executor/parameter.py""" import logging from fractions import Fraction @@ -88,12 +85,17 @@ def __init__(self, output_dim: int, **kwargs): def output_dim(self): return self._output_dim - def load_column_parallel_weight(self, loaded_weight: torch.Tensor): - tp_rank = get_tensor_model_parallel_rank() - shard_size = self.data.shape[self.output_dim] - loaded_weight = loaded_weight.narrow( - self.output_dim, tp_rank * shard_size, shard_size - ) + def load_column_parallel_weight( + self, + loaded_weight: torch.Tensor, + tp_rank: int, + use_presharded_weights: bool = False, + ): + if not use_presharded_weights: + shard_size = self.data.shape[self.output_dim] + loaded_weight = loaded_weight.narrow( + self.output_dim, tp_rank * shard_size, shard_size + ) assert self.data.shape == loaded_weight.shape self.data.copy_(loaded_weight) @@ -121,7 +123,7 @@ def load_merged_column_weight(self, loaded_weight: torch.Tensor, **kwargs): assert param_data.shape == loaded_weight.shape param_data.copy_(loaded_weight) - def load_qkv_weight(self, loaded_weight: torch.Tensor, **kwargs): + def load_qkv_weight(self, loaded_weight: torch.Tensor, tp_rank: int, **kwargs): shard_offset = kwargs.get("shard_offset") shard_size = kwargs.get("shard_size") @@ -137,7 +139,6 @@ def load_qkv_weight(self, loaded_weight: torch.Tensor, **kwargs): ) param_data = self.data - tp_rank = get_tensor_model_parallel_rank() shard_id = tp_rank if shard_id == "q" else tp_rank // num_heads param_data = param_data.narrow(self.output_dim, shard_offset, shard_size) loaded_weight = loaded_weight.narrow( @@ -164,11 +165,14 @@ def __init__(self, input_dim: int, **kwargs): def input_dim(self): return self._input_dim - def load_row_parallel_weight(self, loaded_weight: torch.Tensor, **kwargs): - use_presharded_weights = kwargs.get("use_presharded_weights") - tp_rank = get_tensor_model_parallel_rank() - shard_size = self.data.shape[self.input_dim] + def load_row_parallel_weight( + self, + loaded_weight: torch.Tensor, + tp_rank: int, + use_presharded_weights: bool = False, + ): if not use_presharded_weights: + shard_size = self.data.shape[self.input_dim] loaded_weight = loaded_weight.narrow( self.input_dim, tp_rank * shard_size, shard_size ) @@ -238,6 +242,8 @@ def _shard_id_as_int(self, shard_id: Union[str, int]) -> int: # For row parallel layers, no sharding needed # load weight into parameter as is def load_row_parallel_weight(self, *args, **kwargs): + kwargs.pop("tp_rank", None) + kwargs.pop("use_presharded_weights", None) super().load_row_parallel_weight(*args, **kwargs) def load_merged_column_weight(self, *args, **kwargs): @@ -247,6 +253,8 @@ def load_qkv_weight(self, *args, **kwargs): self._load_into_shard_id(*args, **kwargs) def load_column_parallel_weight(self, *args, **kwargs): + kwargs.pop("tp_rank", None) + kwargs.pop("use_presharded_weights", None) super().load_row_parallel_weight(*args, **kwargs) def _load_into_shard_id( diff --git a/python/sglang/srt/layers/quantization/fp8_utils.py b/python/sglang/srt/layers/quantization/fp8_utils.py index 140e70dd9d20..d6ff12ee1635 100644 --- a/python/sglang/srt/layers/quantization/fp8_utils.py +++ b/python/sglang/srt/layers/quantization/fp8_utils.py @@ -1,8 +1,8 @@ from typing import List, Optional, Tuple import torch -from vllm.model_executor.parameter import RowvLLMParameter, _ColumnvLLMParameter +from sglang.srt.layers.parameter import 
RowvLLMParameter, _ColumnvLLMParameter from sglang.srt.layers.quantization.fp8_kernel import ( per_token_group_quant_fp8, w8a8_block_fp8_matmul, diff --git a/python/sglang/srt/layers/quantization/modelopt_quant.py b/python/sglang/srt/layers/quantization/modelopt_quant.py index 8ce9d20d1911..5d65899d6349 100644 --- a/python/sglang/srt/layers/quantization/modelopt_quant.py +++ b/python/sglang/srt/layers/quantization/modelopt_quant.py @@ -11,9 +11,9 @@ cutlass_fp8_supported, requantize_with_max_scale, ) -from vllm.model_executor.parameter import ModelWeightParameter, PerTensorScaleParameter from sglang.srt.layers.linear import LinearMethodBase +from sglang.srt.layers.parameter import ModelWeightParameter, PerTensorScaleParameter from sglang.srt.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase, diff --git a/python/sglang/srt/layers/vocab_parallel_embedding.py b/python/sglang/srt/layers/vocab_parallel_embedding.py index 21d973918758..a346a2cbd1c9 100644 --- a/python/sglang/srt/layers/vocab_parallel_embedding.py +++ b/python/sglang/srt/layers/vocab_parallel_embedding.py @@ -220,6 +220,7 @@ def __init__( quant_config: Optional[QuantizationConfig] = None, prefix: str = "", enable_tp: bool = True, + use_presharded_weights: bool = False, ): super().__init__() self.quant_config = quant_config @@ -236,6 +237,12 @@ def __init__( self.padding_size = padding_size self.org_vocab_size = org_num_embeddings or num_embeddings num_added_embeddings = num_embeddings - self.org_vocab_size + self.use_presharded_weights = use_presharded_weights + if use_presharded_weights: + assert ( + num_added_embeddings == 0 + ), "Lora is not supported with presharded weights." + self.org_vocab_size_padded = pad_vocab_size( self.org_vocab_size, self.padding_size ) @@ -447,10 +454,14 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): start_idx = start_idx // packed_factor shard_size = shard_size // packed_factor else: - assert loaded_weight.shape[output_dim] == self.org_vocab_size + assert loaded_weight.shape[output_dim] == ( + self.org_vocab_size + // (self.tp_size if self.use_presharded_weights else 1) + ) # Copy the data. - loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) + if not self.use_presharded_weights: + loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) param[: loaded_weight.shape[0]].data.copy_(loaded_weight) param[loaded_weight.shape[0] :].data.fill_(0) @@ -514,6 +525,7 @@ def __init__( padding_size: int = DEFAULT_VOCAB_PADDING_SIZE, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", + use_presharded_weights: bool = False, ): super().__init__( num_embeddings, @@ -523,6 +535,7 @@ def __init__( padding_size, quant_config, prefix, + use_presharded_weights=use_presharded_weights, ) self.quant_config = quant_config if bias: diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index 31c8018e2581..1c07ea6adb75 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -13,6 +13,7 @@ # ============================================================================== """A scheduler that manages a tensor parallel GPU worker.""" +import faulthandler import logging import os import signal @@ -399,6 +400,8 @@ def watchdog_thread(self): self.watchdog_last_time = time.time() time.sleep(self.watchdog_timeout / 2) + # Wait sometimes so that the parent process can print the error. 
+ time.sleep(5) self.parent_process.send_signal(signal.SIGQUIT) @torch.no_grad() @@ -1582,6 +1585,7 @@ def run_scheduler_process( pipe_writer, ): setproctitle.setproctitle("sglang::scheduler") + faulthandler.enable() # [For Router] if env var "SGLANG_DP_RANK" exist, set dp_rank to the value of the env var if dp_rank is None and "SGLANG_DP_RANK" in os.environ: diff --git a/python/sglang/srt/mem_cache/memory_pool.py b/python/sglang/srt/mem_cache/memory_pool.py index 6cb186577238..abee7764bebf 100644 --- a/python/sglang/srt/mem_cache/memory_pool.py +++ b/python/sglang/srt/mem_cache/memory_pool.py @@ -27,6 +27,7 @@ from functools import wraps from typing import List, Tuple, Union +import numpy as np import psutil import torch @@ -35,6 +36,8 @@ logger = logging.getLogger(__name__) +GB = 1024 * 1024 * 1024 + class ReqToTokenPool: """A memory pool that maps a request to its token locations.""" @@ -193,6 +196,11 @@ def __init__( self.layer_num = layer_num self._create_buffers() + k_size, v_size = self.get_kv_size_bytes() + logger.info( + f"KV Cache is allocated. K size: {k_size / GB:.2f} GB, V size: {v_size / GB:.2f} GB." + ) + def _create_buffers(self): # [size, head_num, head_dim] for each layer # The padded slot 0 is used for writing dummy outputs from padded tokens. @@ -217,6 +225,17 @@ def _clear_buffers(self): del self.k_buffer del self.v_buffer + def get_kv_size_bytes(self): + assert hasattr(self, "k_buffer") + assert hasattr(self, "v_buffer") + k_size_bytes = 0 + for k_cache in self.k_buffer: + k_size_bytes += np.prod(k_cache.shape) * k_cache.dtype.itemsize + v_size_bytes = 0 + for v_cache in self.v_buffer: + v_size_bytes += np.prod(v_cache.shape) * v_cache.dtype.itemsize + return k_size_bytes, v_size_bytes + # Todo: different memory layout def get_flat_data(self, indices): # prepare a large chunk of contiguous data for efficient transfer diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py index 8fd902818995..fa1625b09595 100644 --- a/python/sglang/srt/server.py +++ b/python/sglang/srt/server.py @@ -611,6 +611,9 @@ def _set_envs_and_config(server_args: ServerArgs): # The child processes will send SIGQUIT to this process when any error happens # This process then clean up the whole process tree def sigquit_handler(signum, frame): + logger.error( + "Received sigquit from a child proces. It usually means the child failed." 
+ ) kill_process_tree(os.getpid()) signal.signal(signal.SIGQUIT, sigquit_handler) diff --git a/test/srt/test_moe_eval_accuracy_large.py b/test/srt/test_moe_eval_accuracy_large.py index 6f3affbba4d7..dc420f00dfaf 100644 --- a/test/srt/test_moe_eval_accuracy_large.py +++ b/test/srt/test_moe_eval_accuracy_large.py @@ -71,7 +71,7 @@ def test_mgsm_en(self): ) metrics = run_eval(args) - self.assertGreater(metrics["score"], 0.62) + self.assertGreater(metrics["score"], 0.61) if __name__ == "__main__": From 42f390996317a162f00571f51b6a54dc5fb3165f Mon Sep 17 00:00:00 2001 From: kk <43161300+kkHuang-amd@users.noreply.github.com> Date: Mon, 13 Jan 2025 18:12:44 +0800 Subject: [PATCH 032/248] Unify sglang coding style (#2856) Co-authored-by: Lin, Soga --- .../srt/layers/moe/fused_moe_triton/layer.py | 9 +++--- python/sglang/srt/layers/quantization/fp8.py | 29 ++++++++++--------- 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/layer.py b/python/sglang/srt/layers/moe/fused_moe_triton/layer.py index e1064bcdabd1..d95498377793 100644 --- a/python/sglang/srt/layers/moe/fused_moe_triton/layer.py +++ b/python/sglang/srt/layers/moe/fused_moe_triton/layer.py @@ -1,6 +1,5 @@ # Adapted from https://github.com/vllm-project/vllm/blob/a6221a144af772fd1a68fe7e627935dc53e81738/vllm/model_executor/layers/fused_moe/layer.py -import os from abc import abstractmethod from enum import Enum from typing import Callable, List, Optional, Tuple @@ -19,7 +18,7 @@ QuantizationConfig, QuantizeMethodBase, ) -from sglang.srt.utils import is_hip, permute_weight, set_weight_attrs +from sglang.srt.utils import get_bool_env_var, is_hip, permute_weight, set_weight_attrs if torch.cuda.is_available(): from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_experts @@ -28,6 +27,8 @@ import logging +is_hip_ = is_hip() + logger = logging.getLogger(__name__) @@ -99,7 +100,7 @@ def create_weights( set_weight_attrs(w2_weight, extra_weight_attrs) def process_weights_after_loading(self, layer: torch.nn.Module) -> None: - if is_hip() and bool(int(os.getenv("CK_MOE", "0"))): + if is_hip_ and get_bool_env_var("CK_MOE"): layer.w13_weight = torch.nn.Parameter( permute_weight(layer.w13_weight.data), requires_grad=False, @@ -163,7 +164,7 @@ def forward_cuda( correction_bias=correction_bias, ) - if is_hip() and bool(int(os.getenv("CK_MOE", "0"))): + if is_hip_ and get_bool_env_var("CK_MOE"): import ater from ater.fused_moe import fused_experts_ck diff --git a/python/sglang/srt/layers/quantization/fp8.py b/python/sglang/srt/layers/quantization/fp8.py index 22a43675bf85..d16a3b0c257b 100644 --- a/python/sglang/srt/layers/quantization/fp8.py +++ b/python/sglang/srt/layers/quantization/fp8.py @@ -1,7 +1,6 @@ # Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/model_executor/layers/quantization/fp8.py import logging -import os from typing import Any, Callable, Dict, List, Optional import torch @@ -47,6 +46,8 @@ ACTIVATION_SCHEMES = ["static", "dynamic"] +is_hip_ = is_hip() + logger = logging.getLogger(__name__) @@ -162,7 +163,7 @@ def __init__(self, quant_config: Fp8Config): # kernel for fast weight-only FP8 quantization self.use_marlin = get_bool_env_var("SGLANG_FORCE_FP8_MARLIN") # Disable marlin for ROCm - if is_hip(): + if is_hip_: self.use_marlin = False self.block_quant = self.quant_config.weight_block_size is not None @@ -274,7 +275,7 @@ def process_weights_after_loading(self, layer: Module) -> None: # Block quant doesn't need to process weights after 
loading if self.block_quant: # If ROCm, normalize the weights and scales to e4m3fnuz - if is_hip(): + if is_hip_: # activation_scheme: dynamic weight, weight_scale, _ = normalize_e4m3fn_to_e4m3fnuz( weight=layer.weight, @@ -331,7 +332,7 @@ def process_weights_after_loading(self, layer: Module) -> None: weight_scale = layer.weight_scale # If ROCm, normalize the weights and scales to e4m3fnuz - if is_hip(): + if is_hip_: weight, weight_scale, input_scale = normalize_e4m3fn_to_e4m3fnuz( weight=weight, weight_scale=weight_scale, @@ -568,7 +569,7 @@ def process_weights_after_loading(self, layer: Module) -> None: # Block quant doesn't need to process weights after loading if self.block_quant: # If ROCm, normalize the weights and scales to e4m3fnuz - if is_hip(): + if is_hip_: # activation_scheme: dynamic w13_weight, w13_weight_scale, _ = normalize_e4m3fn_to_e4m3fnuz( weight=layer.w13_weight, @@ -595,7 +596,7 @@ def process_weights_after_loading(self, layer: Module) -> None: # If checkpoint is fp16 or bfloat16, quantize in place. if not self.quant_config.is_checkpoint_fp8_serialized: # If ROCm, use float8_e4m3fnuz instead (MI300x HW) - fp8_dtype = torch.float8_e4m3fnuz if is_hip() else torch.float8_e4m3fn + fp8_dtype = torch.float8_e4m3fnuz if is_hip_ else torch.float8_e4m3fn w13_weight = torch.empty_like(layer.w13_weight.data, dtype=fp8_dtype) w2_weight = torch.empty_like(layer.w2_weight.data, dtype=fp8_dtype) @@ -617,8 +618,8 @@ def process_weights_after_loading(self, layer: Module) -> None: layer.w13_weight = torch.nn.Parameter(w13_weight, requires_grad=False) layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False) - if is_hip(): - if bool(int(os.getenv("CK_MOE", "0"))): + if is_hip_: + if get_bool_env_var("CK_MOE"): layer.w13_weight = torch.nn.Parameter( permute_weight(layer.w13_weight.data), requires_grad=False, @@ -629,7 +630,7 @@ def process_weights_after_loading(self, layer: Module) -> None: requires_grad=False, ) torch.cuda.empty_cache() - elif bool(int(os.getenv("MOE_PADDING", "0"))): + elif get_bool_env_var("MOE_PADDING"): # If ROCm, apply weight padding (min. Mem channel contention) only if set layer.w13_weight = torch.nn.Parameter( F.pad(layer.w13_weight.data, (0, padding_size), "constant", 0), @@ -671,7 +672,7 @@ def process_weights_after_loading(self, layer: Module) -> None: ) # If ROCm, normalize the weights and scales to e4m3fnuz - if is_hip(): + if is_hip_: # Normalize the weights and scales w13_weight, w13_weight_scale, w13_input_scale = ( normalize_e4m3fn_to_e4m3fnuz( @@ -721,8 +722,8 @@ def process_weights_after_loading(self, layer: Module) -> None: max_w13_scales, requires_grad=False ) - if is_hip(): - if bool(int(os.getenv("CK_MOE", "0"))): + if is_hip_: + if get_bool_env_var("CK_MOE"): layer.w13_weight = torch.nn.Parameter( permute_weight(layer.w13_weight.data), requires_grad=False, @@ -733,7 +734,7 @@ def process_weights_after_loading(self, layer: Module) -> None: requires_grad=False, ) torch.cuda.empty_cache() - elif bool(int(os.getenv("MOE_PADDING", "0"))): + elif get_bool_env_var("MOE_PADDING"): # If ROCm, apply weight padding (min. 
Mem channel contention) only if set layer.w13_weight = torch.nn.Parameter( F.pad(layer.w13_weight.data, (0, padding_size), "constant", 0), @@ -777,7 +778,7 @@ def apply( correction_bias=correction_bias, ) - if is_hip() and bool(int(os.getenv("CK_MOE", "0"))): + if is_hip_ and get_bool_env_var("CK_MOE"): import ater from ater.fused_moe import fused_experts_ck From 20a9f5dfe0b75614f723401072bca8589c781770 Mon Sep 17 00:00:00 2001 From: Yineng Zhang Date: Mon, 13 Jan 2025 18:36:40 +0800 Subject: [PATCH 033/248] fix: not delete CNAME (#2860) --- .github/workflows/release-docs.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/release-docs.yml b/.github/workflows/release-docs.yml index ab2129e3721a..44bdfa0fa1ab 100644 --- a/.github/workflows/release-docs.yml +++ b/.github/workflows/release-docs.yml @@ -49,7 +49,7 @@ jobs: cd _build/html git clone https://$GITHUB_TOKEN@github.com/sgl-project/sgl-project.github.io.git ../sgl-project.github.io --depth 1 - rm -rf ../sgl-project.github.io/* + find ../sgl-project.github.io/ -mindepth 1 -not -name CNAME -delete cp -r * ../sgl-project.github.io cp ../../README.md ../sgl-project.github.io/README.md cd ../sgl-project.github.io From 41d7e5b7e68f3aa0ef741d8774333c3518522d5e Mon Sep 17 00:00:00 2001 From: Yineng Zhang Date: Mon, 13 Jan 2025 18:40:48 +0800 Subject: [PATCH 034/248] docs: update link (#2857) --- README.md | 16 ++++++++-------- benchmark/deepseek_v3/README.md | 2 +- docs/references/contribution_guide.md | 2 +- python/sglang/srt/sampling/sampling_params.py | 2 +- sgl-router/README.md | 2 +- sgl-router/v0.1.0.md | 2 +- 6 files changed, 13 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 024fa2761270..bacdb9fc15f6 100644 --- a/README.md +++ b/README.md @@ -13,9 +13,9 @@ -------------------------------------------------------------------------------- | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) -| [**Documentation**](https://sgl-project.github.io/) -| [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2um0ad92q-LkU19KQTxCGzlCgRiOiQEw) -| [**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing) +| [**Documentation**](https://docs.sglang.ai/) +| [**Join Slack**](https://slack.sglang.ai/) +| [**Join Bi-Weekly Development Meeting**](https://meeting.sglang.ai/) | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) | ## News @@ -45,11 +45,11 @@ The core features include: - **Active Community**: SGLang is open-source and backed by an active community with industry adoption. 
## Getting Started -- [Install SGLang](https://sgl-project.github.io/start/install.html) -- [Quick Start](https://sgl-project.github.io/start/send_request.html) -- [Backend Tutorial](https://sgl-project.github.io/backend/openai_api_completions.html) -- [Frontend Tutorial](https://sgl-project.github.io/frontend/frontend.html) -- [Contribution Guide](https://sgl-project.github.io/references/contribution_guide.html) +- [Install SGLang](https://docs.sglang.ai/start/install.html) +- [Quick Start](https://docs.sglang.ai/start/send_request.html) +- [Backend Tutorial](https://docs.sglang.ai/backend/openai_api_completions.html) +- [Frontend Tutorial](https://docs.sglang.ai/frontend/frontend.html) +- [Contribution Guide](https://docs.sglang.ai/references/contribution_guide.html) ## Benchmark and Performance Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/), [v0.4 blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/) diff --git a/benchmark/deepseek_v3/README.md b/benchmark/deepseek_v3/README.md index 5c353bca5c79..e7ad8d33609c 100644 --- a/benchmark/deepseek_v3/README.md +++ b/benchmark/deepseek_v3/README.md @@ -4,7 +4,7 @@ The SGLang and DeepSeek teams collaborated to get DeepSeek V3 FP8 running on NVI Special thanks to Meituan's Search & Recommend Platform Team and Baseten's Model Performance Team for implementing the model, and DataCrunch for providing GPU resources. -For optimizations made on the DeepSeek series models regarding SGLang, please refer to [DeepSeek Model Optimizations in SGLang](https://sgl-project.github.io/references/deepseek.html). +For optimizations made on the DeepSeek series models regarding SGLang, please refer to [DeepSeek Model Optimizations in SGLang](https://docs.sglang.ai/references/deepseek.html). ## Hardware Recommendation - 8 x NVIDIA H200 GPUs diff --git a/docs/references/contribution_guide.md b/docs/references/contribution_guide.md index b2211f463fb0..b3b7f826894a 100644 --- a/docs/references/contribution_guide.md +++ b/docs/references/contribution_guide.md @@ -14,7 +14,7 @@ git clone https://github.com//sglang.git ### Install Dependencies & Build -Refer to [Install SGLang from Source](https://sgl-project.github.io/start/install.html#method-2-from-source) documentation for more details on setting up the necessary dependencies. +Refer to [Install SGLang from Source](https://docs.sglang.ai/start/install.html#method-2-from-source) documentation for more details on setting up the necessary dependencies. ## Code Formatting with Pre-Commit diff --git a/python/sglang/srt/sampling/sampling_params.py b/python/sglang/srt/sampling/sampling_params.py index 2c3817e1b795..d1d932693c61 100644 --- a/python/sglang/srt/sampling/sampling_params.py +++ b/python/sglang/srt/sampling/sampling_params.py @@ -23,7 +23,7 @@ class SamplingParams: The sampling parameters. See docs/references/sampling_params.md or - https://sgl-project.github.io/references/sampling_params.html + https://docs.sglang.ai/references/sampling_params.html for the documentation. 
""" diff --git a/sgl-router/README.md b/sgl-router/README.md index 617bca5405fe..f39d63625de1 100644 --- a/sgl-router/README.md +++ b/sgl-router/README.md @@ -4,7 +4,7 @@ SGLang router is a standalone module implemented in Rust to achieve data paralle ## User docs -Please check https://sgl-project.github.io/router/router.html +Please check https://docs.sglang.ai/router/router.html ## Developer docs diff --git a/sgl-router/v0.1.0.md b/sgl-router/v0.1.0.md index 9a1ee152f113..747731a71c2d 100644 --- a/sgl-router/v0.1.0.md +++ b/sgl-router/v0.1.0.md @@ -54,7 +54,7 @@ Note: ## Closing remarks: -1. Please read the full usage at https://sgl-project.github.io/router/router.html +1. Please read the full usage at https://docs.sglang.ai/router/router.html 2. The feature is still under active improvement, so please don't hesitate to raise issues or submit PRs if you have any suggestions or feedback. From 4536d7244637f7e62e5892e272d06275bba8b5f1 Mon Sep 17 00:00:00 2001 From: Yineng Zhang Date: Mon, 13 Jan 2025 18:58:56 +0800 Subject: [PATCH 035/248] minor: use ubuntu-latest instead of self-hosted runner for amd build (#2861) --- .github/workflows/release-docker-amd.yml | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/.github/workflows/release-docker-amd.yml b/.github/workflows/release-docker-amd.yml index 866cc5fa5209..c0394e8e57a2 100644 --- a/.github/workflows/release-docker-amd.yml +++ b/.github/workflows/release-docker-amd.yml @@ -10,19 +10,27 @@ on: jobs: publish: if: github.repository == 'sgl-project/sglang' - runs-on: docker-builder-amd + runs-on: ubuntu-latest environment: 'prod' strategy: matrix: rocm_version: ['6.2.0'] build_type: ['all', 'srt'] steps: - - name: Delete huge unnecessary tools folder - run: rm -rf /opt/hostedtoolcache - - name: Checkout repository uses: actions/checkout@v3 + - name: Free disk space + uses: jlumbroso/free-disk-space@main + with: + tool-cache: false + docker-images: false + android: true + dotnet: true + haskell: true + large-packages: true + swap-storage: false + - name: Login to Docker Hub uses: docker/login-action@v2 with: From 67008f4b320d8950803fcb14b1e5dc6e80bf75e4 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Mon, 13 Jan 2025 03:55:33 -0800 Subject: [PATCH 036/248] Use only one GPU for MLA CI tests (#2858) --- .github/workflows/pr-test.yml | 8 +++----- test/srt/run_suite.py | 2 ++ test/srt/test_mla.py | 35 ++++++++++++++++++++++++++++++++++- test/srt/test_mla_fp8.py | 2 -- 4 files changed, 39 insertions(+), 8 deletions(-) diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index f1c7871debb2..274c97c63932 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -87,18 +87,16 @@ jobs: run: | bash scripts/ci_install_dependency.sh - - name: Evaluate data parallelism accuracy (DP=2) + - name: Test data parallelism (DP=2) timeout-minutes: 10 run: | cd test/srt python3 test_data_parallelism.py - - name: Evaluate MLA accuracy (TP=2) + - name: Test data parallelism attention (DP=2) timeout-minutes: 10 run: | cd test/srt - python3 test_mla.py - python3 test_mla_fp8.py python3 test_dp_attention.py - name: Test update weights from distributed @@ -107,7 +105,7 @@ jobs: cd test/srt python3 test_update_weights_from_distributed.py - - name: Evaluate MoE EP accuracy (TP=2) + - name: Test expert parallelism (EP=2) timeout-minutes: 10 run: | cd test/srt diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index 320fea7294e5..d617fcf69e62 100644 --- a/test/srt/run_suite.py +++ 
b/test/srt/run_suite.py @@ -22,6 +22,8 @@ "test_json_constrained.py", "test_large_max_new_tokens.py", "test_metrics.py", + "test_mla.py", + "test_mla_fp8.py", "test_no_chunked_prefill.py", "test_no_overlap_scheduler.py", "test_openai_server.py", diff --git a/test/srt/test_mla.py b/test/srt/test_mla.py index b8105a84af1a..34bc4b446452 100644 --- a/test/srt/test_mla.py +++ b/test/srt/test_mla.py @@ -2,6 +2,7 @@ from types import SimpleNamespace from sglang.srt.utils import kill_process_tree +from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k from sglang.test.run_eval import run_eval from sglang.test.test_utils import ( DEFAULT_MLA_MODEL_NAME_FOR_TEST, @@ -20,7 +21,7 @@ def setUpClass(cls): cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - other_args=["--tp", "2", "--trust-remote-code"], + other_args=["--trust-remote-code"], ) @classmethod @@ -52,5 +53,37 @@ def test_mgsm_en(self): self.assertGreater(metrics["score"], 0.8) +class TestDeepseekV3(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.model = "lmzheng/sglang-ci-dsv3-test" + cls.base_url = DEFAULT_URL_FOR_TEST + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=["--trust-remote-code"], + ) + + @classmethod + def tearDownClass(cls): + kill_process_tree(cls.process.pid) + + def test_mmlu(self): + args = SimpleNamespace( + num_shots=5, + data_path=None, + num_questions=200, + max_new_tokens=512, + parallel=128, + host="http://127.0.0.1", + port=int(self.base_url.split(":")[-1]), + ) + metrics = run_eval_few_shot_gsm8k(args) + print(metrics) + + self.assertGreater(metrics["accuracy"], 0.62) + + if __name__ == "__main__": unittest.main() diff --git a/test/srt/test_mla_fp8.py b/test/srt/test_mla_fp8.py index 769bdf34da87..4fe18b526b1e 100644 --- a/test/srt/test_mla_fp8.py +++ b/test/srt/test_mla_fp8.py @@ -21,8 +21,6 @@ def setUpClass(cls): cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ - "--tp", - "2", "--trust-remote-code", "--kv-cache-dtype", "fp8_e5m2", From 51ab3ccf470ac51c8779091dd5db4c91c11e6c8c Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Mon, 13 Jan 2025 03:57:39 -0800 Subject: [PATCH 037/248] Collect more metrics: num_requests_total (#2859) --- .../sglang/srt/managers/tokenizer_manager.py | 78 +++++++++---------- python/sglang/srt/metrics/collector.py | 15 ++-- test/srt/test_metrics.py | 1 + 3 files changed, 49 insertions(+), 45 deletions(-) diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py index 9f9c53eaa8ec..fb6202932f0f 100644 --- a/python/sglang/srt/managers/tokenizer_manager.py +++ b/python/sglang/srt/managers/tokenizer_manager.py @@ -601,7 +601,7 @@ async def sigterm_watchdog(self): while not self.gracefully_exit: await asyncio.sleep(5) - # drain requests + # Drain requests while True: remain_num_req = len(self.rid_to_state) logger.info( @@ -679,45 +679,7 @@ async def handle_loop(self): state.event.set() if self.enable_metrics: - completion_tokens = ( - recv_obj.completion_tokens[i] - if getattr(recv_obj, "completion_tokens", None) - else 0 - ) - - if state.first_token_time is None: - state.first_token_time = time.time() - self.metrics_collector.observe_time_to_first_token( - state.first_token_time - state.created_time - ) - else: - if completion_tokens >= 2: - # Compute time_per_output_token for the streaming case - self.metrics_collector.observe_time_per_output_token( - (time.time() - 
state.first_token_time) - / (completion_tokens - 1) - ) - - if state.finished: - self.metrics_collector.inc_prompt_tokens( - recv_obj.prompt_tokens[i] - ) - self.metrics_collector.inc_generation_tokens( - completion_tokens - ) - self.metrics_collector.observe_e2e_request_latency( - time.time() - state.created_time - ) - # Compute time_per_output_token for the non-streaming case - if ( - hasattr(state.obj, "stream") - and not state.obj.stream - and completion_tokens >= 1 - ): - self.metrics_collector.observe_time_per_output_token( - (time.time() - state.created_time) - / completion_tokens - ) + self.collect_metrics(state, recv_obj, i) elif isinstance(recv_obj, OpenSessionReqOutput): self.session_futures[recv_obj.session_id].set_result( recv_obj.session_id if recv_obj.success else None @@ -820,6 +782,42 @@ def detokenize_top_logprobs_tokens( ret.append(None) return ret + def collect_metrics(self, state: ReqState, recv_obj: BatchStrOut, i: int): + completion_tokens = ( + recv_obj.completion_tokens[i] + if getattr(recv_obj, "completion_tokens", None) + else 0 + ) + + if state.first_token_time is None: + state.first_token_time = time.time() + self.metrics_collector.observe_time_to_first_token( + state.first_token_time - state.created_time + ) + else: + if completion_tokens >= 2: + # Compute time_per_output_token for the streaming case + self.metrics_collector.observe_time_per_output_token( + (time.time() - state.first_token_time) / (completion_tokens - 1) + ) + + if state.finished: + self.metrics_collector.observe_one_finished_request( + recv_obj.prompt_tokens[i], completion_tokens + ) + self.metrics_collector.observe_e2e_request_latency( + time.time() - state.created_time + ) + # Compute time_per_output_token for the non-streaming case + if ( + hasattr(state.obj, "stream") + and not state.obj.stream + and completion_tokens >= 1 + ): + self.metrics_collector.observe_time_per_output_token( + (time.time() - state.created_time) / completion_tokens + ) + class SignalHandler: def __init__(self, tokenizer_manager): diff --git a/python/sglang/srt/metrics/collector.py b/python/sglang/srt/metrics/collector.py index 9505f012f067..070b405be429 100644 --- a/python/sglang/srt/metrics/collector.py +++ b/python/sglang/srt/metrics/collector.py @@ -109,6 +109,12 @@ def __init__(self, labels: Dict[str, str]) -> None: labelnames=labels.keys(), ) + self.num_requests_total = Counter( + name="sglang:num_requests_total", + documentation="Number of requests processed.", + labelnames=labels.keys(), + ) + self.histogram_time_to_first_token = Histogram( name="sglang:time_to_first_token_seconds", documentation="Histogram of time to first token in seconds.", @@ -185,11 +191,10 @@ def _log_counter(self, counter, data: Union[int, float]) -> None: # Convenience function for logging to counter. 
counter.labels(**self.labels).inc(data) - def inc_prompt_tokens(self, value: int): - self._log_counter(self.prompt_tokens_total, value) - - def inc_generation_tokens(self, value: int): - self._log_counter(self.generation_tokens_total, value) + def observe_one_finished_request(self, prompt_tokens: int, generation_tokens: int): + self.prompt_tokens_total.labels(**self.labels).inc(prompt_tokens) + self.generation_tokens_total.labels(**self.labels).inc(generation_tokens) + self.num_requests_total.labels(**self.labels).inc(1) def observe_time_to_first_token(self, value: Union[float, int]): self._log_histogram(self.histogram_time_to_first_token, value) diff --git a/test/srt/test_metrics.py b/test/srt/test_metrics.py index ccaea5be800e..69babf795f01 100644 --- a/test/srt/test_metrics.py +++ b/test/srt/test_metrics.py @@ -59,6 +59,7 @@ def test_metrics_enabled(self): "sglang:func_latency_seconds", "sglang:prompt_tokens_total", "sglang:generation_tokens_total", + "sglang:num_requests_total", "sglang:time_to_first_token_seconds", "sglang:time_per_output_token_seconds", "sglang:e2e_request_latency_seconds", From 17de02f98d8f28e5affec7c5ff8e28f110d0af42 Mon Sep 17 00:00:00 2001 From: bjmsong Date: Mon, 13 Jan 2025 20:14:16 +0800 Subject: [PATCH 038/248] Integration of TurboMind AWQ (#2828) Co-authored-by: root --- python/pyproject.toml | 2 +- python/sglang/srt/configs/model_config.py | 10 +- python/sglang/srt/layers/linear.py | 1 + .../srt/layers/quantization/__init__.py | 2 + .../srt/layers/quantization/awq_turbomind.py | 287 ++++++++++++++++++ .../layers/quantization/turbomind_utils.py | 63 ++++ python/sglang/srt/server_args.py | 1 + test/srt/test_turbomind_awq.py | 47 +++ 8 files changed, 411 insertions(+), 2 deletions(-) create mode 100644 python/sglang/srt/layers/quantization/awq_turbomind.py create mode 100644 python/sglang/srt/layers/quantization/turbomind_utils.py create mode 100644 test/srt/test_turbomind_awq.py diff --git a/python/pyproject.toml b/python/pyproject.toml index a236469a17c8..c29580b50b1b 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -28,7 +28,7 @@ runtime_common = [ srt = [ "sglang[runtime_common]", "cuda-python", "sgl-kernel>=0.0.2.post11", "torch", "vllm>=0.6.3.post1,<=0.6.4.post1", - "flashinfer==0.1.6" + "flashinfer==0.1.6", "turbomind" ] # HIP (Heterogeneous-computing Interface for Portability) for AMD diff --git a/python/sglang/srt/configs/model_config.py b/python/sglang/srt/configs/model_config.py index 072c88b04a78..28144f139958 100644 --- a/python/sglang/srt/configs/model_config.py +++ b/python/sglang/srt/configs/model_config.py @@ -14,6 +14,7 @@ import json import logging +import sys from enum import IntEnum, auto from typing import List, Optional, Set, Union @@ -230,7 +231,7 @@ def _verify_quantization(self) -> None: # Parse quantization method from the HF model config, if available. 
quant_cfg = self._parse_quant_hf_config() - if quant_cfg is not None: + if quant_cfg is not None and not quantization_in_turbomind(self.quantization): quant_method = quant_cfg.get("quant_method", "").lower() # Detect which checkpoint is it @@ -401,3 +402,10 @@ def is_multimodal_model(model_architectures: List[str]): def is_encoder_decoder_model(model_architectures: List[str]): return "MllamaForConditionalGeneration" in model_architectures + + +def quantization_in_turbomind(quantization: str) -> bool: + if quantization in ["awq_turbomind"]: + return True + else: + return False diff --git a/python/sglang/srt/layers/linear.py b/python/sglang/srt/layers/linear.py index ee9386c13fa3..815255d5c167 100644 --- a/python/sglang/srt/layers/linear.py +++ b/python/sglang/srt/layers/linear.py @@ -48,6 +48,7 @@ "GPTQLinearMethod", "FBGEMMFp8LinearMethod", "ModelOptFp8LinearMethod", + "AWQTurbomindLinearMethod", "IPEXAWQLinearMethod", ] diff --git a/python/sglang/srt/layers/quantization/__init__.py b/python/sglang/srt/layers/quantization/__init__.py index 35b0c4d94edb..faf14d6fdd6b 100644 --- a/python/sglang/srt/layers/quantization/__init__.py +++ b/python/sglang/srt/layers/quantization/__init__.py @@ -20,6 +20,7 @@ from vllm.model_executor.layers.quantization.qqq import QQQConfig from vllm.model_executor.layers.quantization.tpu_int8 import Int8TpuConfig +from sglang.srt.layers.quantization.awq_turbomind import AWQTurbomindConfig from sglang.srt.layers.quantization.base_config import QuantizationConfig from sglang.srt.layers.quantization.fp8 import Fp8Config from sglang.srt.layers.quantization.modelopt_quant import ModelOptFp8Config @@ -37,6 +38,7 @@ "gptq_marlin_24": GPTQMarlin24Config, "gptq_marlin": GPTQMarlinConfig, "awq_marlin": AWQMarlinConfig, + "awq_turbomind": AWQTurbomindConfig, "gptq": GPTQConfig, "compressed-tensors": CompressedTensorsConfig, "bitsandbytes": BitsAndBytesConfig, diff --git a/python/sglang/srt/layers/quantization/awq_turbomind.py b/python/sglang/srt/layers/quantization/awq_turbomind.py new file mode 100644 index 000000000000..007b20420973 --- /dev/null +++ b/python/sglang/srt/layers/quantization/awq_turbomind.py @@ -0,0 +1,287 @@ +import logging +import os +import sys +from typing import Any, Dict, List, Optional + +import torch +import turbomind +from torch.nn import Parameter + +turbomind_dir = os.path.split(turbomind.__file__)[0] +sys.path.append(os.path.join(turbomind_dir, "lib")) +import _turbomind_ext +from vllm.model_executor.layers.linear import LinearBase + +from sglang.srt.layers.linear import LinearMethodBase, UnquantizedLinearMethod +from sglang.srt.layers.parameter import GroupQuantScaleParameter, PackedvLLMParameter +from sglang.srt.layers.quantization.base_config import ( + QuantizationConfig, + QuantizeMethodBase, +) +from sglang.srt.layers.quantization.turbomind_utils import ( + get_u4_slices, + is_layer_skipped_awq, + pack_u4_row, + unpack_awq_gemm, + verify_turbomind_supported, +) +from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead +from sglang.srt.utils import is_cuda, set_weight_attrs + +logger = logging.getLogger(__name__) + + +class AWQTurbomindConfig(QuantizationConfig): + """Config class for AWQ Turbomind""" + + def __init__( + self, + weight_bits: int, + group_size: int, + zero_point: bool, + lm_head_quantized: bool, + modules_to_not_convert: Optional[List[str]] = None, + ) -> None: + self.pack_factor = 32 // weight_bits # packed into int32 + self.group_size = group_size + self.zero_point = zero_point + self.lm_head_quantized = 
lm_head_quantized + self.weight_bits = weight_bits + self.modules_to_not_convert = modules_to_not_convert or [] + + verify_turbomind_supported(self.weight_bits, self.group_size) + + def __repr__(self) -> str: + return ( + f"AWQTurbomindConfig(weight_bits={self.weight_bits}, " + f"group_size={self.group_size}, " + f"zero_point={self.zero_point}, " + f"lm_head_quantized={self.lm_head_quantized}, " + f"modules_to_not_convert={self.modules_to_not_convert})" + ) + + @classmethod + def get_name(cls) -> str: + return "awq_turbomind" + + @classmethod + def get_supported_act_dtypes(cls) -> List[torch.dtype]: + return [torch.half, torch.bfloat16] + + @classmethod + def get_min_capability(cls) -> int: + return 70 + + @classmethod + def get_config_filenames(cls) -> List[str]: + return ["quantize_config.json"] + + @classmethod + def from_config(cls, config: Dict[str, Any]) -> "AWQTurbomindConfig": + weight_bits = cls.get_from_keys(config, ["bits"]) + group_size = cls.get_from_keys(config, ["group_size"]) + zero_point = cls.get_from_keys(config, ["zero_point"]) + lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], default=False) + modules_to_not_convert = cls.get_from_keys_or( + config, ["modules_to_not_convert"], None + ) + return cls( + weight_bits, + group_size, + zero_point, + lm_head_quantized, + modules_to_not_convert, + ) + + @classmethod + def override_quantization_method(cls, hf_quant_cfg, user_quant) -> Optional[str]: + can_convert = cls.is_awq_turbomind_compatible(hf_quant_cfg) + is_valid_user_quant = user_quant is None or user_quant == "awq_turbomind" + + if can_convert and is_valid_user_quant: + msg = f"The model is convertible to {cls.get_name()} during runtime. Using {cls.get_name()} kernel." + logger.info(msg) + return cls.get_name() + + if can_convert and user_quant == "awq": + logger.info( + "Detected that the model can run with awq_turbomind" + ", however you specified quantization=awq explicitly," + " so forcing awq. Use quantization=awq_turbomind for" + " faster inference" + ) + return None + + def get_quant_method( + self, layer: torch.nn.Module, prefix: str + ) -> Optional["QuantizeMethodBase"]: + if isinstance(layer, LinearBase) or ( + isinstance(layer, ParallelLMHead) and self.lm_head_quantized + ): + if is_layer_skipped_awq(prefix, self.modules_to_not_convert): + return UnquantizedLinearMethod() + return AWQTurbomindLinearMethod(self) + + return None + + @classmethod + def is_awq_turbomind_compatible(cls, quant_config: Dict[str, Any]): + if not is_cuda(): + return False + + # Extract data from quant config. + quant_method = quant_config.get("quant_method", "").lower() + num_bits = quant_config.get("bits") + group_size = quant_config.get("group_size") + zero_point = quant_config.get("zero_point") + + if quant_method != "awq": + return False + + # If we cannot find the info needed in the config, cannot convert. + if num_bits is None or group_size is None or zero_point is None: + return False + + return verify_turbomind_supported(quant_bit=num_bits, group_size=group_size) + + def get_scaled_act_names(self) -> List[str]: + return [] + + +class AWQTurbomindLinearMethod(LinearMethodBase): + """Linear method for AWQ Turbomind. + + Args: + quant_config: The AWQ Turbomind quantization config. 
+ """ + + def __init__(self, quant_config: AWQTurbomindConfig) -> None: + self.quant_config = quant_config + + def create_weights( + self, + layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: List[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ) -> None: + + output_size_per_partition = sum(output_partition_sizes) + weight_loader = extra_weight_attrs.get("weight_loader") + + # Normalize group_size + if self.quant_config.group_size != -1: + group_size = self.quant_config.group_size + else: + group_size = input_size + + qweight = PackedvLLMParameter( + data=torch.empty( + input_size_per_partition, + output_size_per_partition // self.quant_config.pack_factor, + dtype=torch.int32, + ), + input_dim=0, + output_dim=1, + packed_dim=1, + packed_factor=self.quant_config.pack_factor, + weight_loader=weight_loader, + ) + + num_groups = input_size_per_partition // group_size + + qzeros = PackedvLLMParameter( + data=torch.empty( + num_groups, + output_size_per_partition // self.quant_config.pack_factor, + dtype=torch.int32, + ), + input_dim=0, + output_dim=1, + packed_dim=1, + packed_factor=self.quant_config.pack_factor, + weight_loader=weight_loader, + ) + + scales = GroupQuantScaleParameter( + data=torch.empty( + num_groups, + output_size_per_partition, + dtype=params_dtype, + ), + input_dim=0, + output_dim=1, + weight_loader=weight_loader, + ) + + layer.register_parameter("qweight", qweight) + layer.register_parameter("qzeros", qzeros) + layer.register_parameter("scales", scales) + + layer.input_size_per_partition = input_size_per_partition + layer.output_size_per_partition = output_size_per_partition + layer.num_groups = num_groups + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + + qweight_turbomind = unpack_awq_gemm(layer.qweight.data) + qzeros_turbomind = unpack_awq_gemm(layer.qzeros.data) + scales_turbomind = layer.scales.data + + qweight_turbomind = pack_u4_row(qweight_turbomind) + qzeros_turbomind = qzeros_turbomind.to(torch.half) + + device_id = layer.qweight.device.index + properties = torch.cuda.get_device_properties(device_id) + + def is_16xx_series(name): + import re + + pattern = r"GTX 16\d\d" + return bool(re.search(pattern, name)) + + simt = is_16xx_series(properties.name) + qweight_turbomind = qweight_turbomind.contiguous() + scales_turbomind = scales_turbomind.contiguous() + qzeros_turbomind = qzeros_turbomind.contiguous() + + self.linear = _turbomind_ext.Linear( + layer.input_size_per_partition, + layer.output_size_per_partition, + self.quant_config.weight_bits, + self.quant_config.group_size, + ) + + self.linear.post_init( + qweight_turbomind, scales_turbomind, qzeros_turbomind, simt + ) + + layer.qweight = Parameter(qweight_turbomind, requires_grad=False) + layer.scales = Parameter(scales_turbomind, requires_grad=False) + layer.qzeros = Parameter(qzeros_turbomind, requires_grad=False) + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + + x = x.view(-1, x.shape[-1]) + out_shape = x.shape[:-1] + (layer.output_size_per_partition,) + out = torch.empty( + (x.shape[0], layer.output_size_per_partition), + dtype=torch.float16, + device=x.device, + ) + stream = torch.cuda.current_stream() + + self.linear.forward(x, out, stream.cuda_stream) + out = torch.from_dlpack(out) + if bias is not None: + out.add_(bias) + + return out.view(out_shape) diff --git 
a/python/sglang/srt/layers/quantization/turbomind_utils.py b/python/sglang/srt/layers/quantization/turbomind_utils.py new file mode 100644 index 000000000000..b8d4b97d00d2 --- /dev/null +++ b/python/sglang/srt/layers/quantization/turbomind_utils.py @@ -0,0 +1,63 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +from typing import List + +import torch + +from sglang.srt.utils import get_device_capability + + +def get_u4_slices(x: torch.Tensor, dtype: torch.dtype) -> List[torch.Tensor]: + assert x.dtype == torch.int32 + xs = [] + for _ in range(8): + xs.append((x & 15).to(dtype)) + x = x >> 4 + return xs + + +def unpack_awq_gemm(x: torch.Tensor) -> torch.Tensor: + """ + The int4 weights are packed into int32: + bit: 31-28 27-24 23-20 19-16 15-12 11-8 7-4 3-0 + weight: int4_1 int4_2 int4_3 int4_4 int4_5 int4_6 int4_7 int4_8 + """ + xs = get_u4_slices(x, torch.uint8) + order = [0, 4, 1, 5, 2, 6, 3, 7] + ys = [xs[i] for i in order] + return torch.stack(ys, dim=-1).view(*x.shape[:-1], -1) + + +def pack_u4_row(x: torch.Tensor) -> torch.Tensor: + assert x.dtype == torch.uint8 + xs = x.view(*x.shape[:-1], -1, 8).split(1, dim=-1) + a = torch.zeros(xs[0].shape, dtype=torch.int32, device=x.device) + for t in reversed(xs): + a = (a << 4) | t + return a.squeeze(dim=-1) + + +def verify_turbomind_supported(quant_bit: int, group_size: int) -> bool: + + if quant_bit not in [4]: + raise NotImplementedError( + f"[Tubomind] Only 4-bit is supported for now, but got {quant_bit} bit" + ) + if group_size != 128: + raise NotImplementedError( + f"[Tubomind] Only group_size 128 is supported for now, " + f"but got group_size {group_size}" + ) + + major, minor = get_device_capability() + capability = major * 10 + minor + if capability < 70: + raise NotImplementedError( + f"[Tubomind] Only capability >= 70 is supported for now, but got {capability}" + ) + + return True + + +def is_layer_skipped_awq(prefix: str, modules_to_not_convert: List[str]): + return any(module_name in prefix for module_name in modules_to_not_convert) diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index be85a3670d40..061d320ef47b 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -375,6 +375,7 @@ def add_cli_args(parser: argparse.ArgumentParser): "marlin", "gptq_marlin", "awq_marlin", + "awq_turbomind", "bitsandbytes", "gguf", "modelopt", diff --git a/test/srt/test_turbomind_awq.py b/test/srt/test_turbomind_awq.py new file mode 100644 index 000000000000..fa2a879d4ff2 --- /dev/null +++ b/test/srt/test_turbomind_awq.py @@ -0,0 +1,47 @@ +import unittest +from types import SimpleNamespace + +from sglang.srt.utils import kill_process_tree +from sglang.test.run_eval import run_eval +from sglang.test.test_utils import ( + DEFAULT_MODEL_NAME_FOR_TEST, + DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + DEFAULT_URL_FOR_TEST, + popen_launch_server, +) + + +class TestMLA(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.model = DEFAULT_MODEL_NAME_FOR_TEST + cls.base_url = DEFAULT_URL_FOR_TEST + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=[ + "--quantization", + "awq_turbomind", + ], + ) + + @classmethod + def tearDownClass(cls): + kill_process_tree(cls.process.pid) + + def test_mgsm_en(self): + args = SimpleNamespace( + base_url=self.base_url, + model=self.model, + eval_name="mgsm_en", + num_examples=None, + num_threads=1024, + ) + + metrics = run_eval(args) + assert metrics["score"] >= 0.5 + + +if __name__ 
== "__main__": + unittest.main() From f3516c28944215c576187f94468d7a4c2546ff61 Mon Sep 17 00:00:00 2001 From: Ke Bao Date: Mon, 13 Jan 2025 20:32:17 +0800 Subject: [PATCH 039/248] Fix quant kernel accuracy issue (#2865) --- python/sglang/srt/layers/quantization/int8_kernel.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/sglang/srt/layers/quantization/int8_kernel.py b/python/sglang/srt/layers/quantization/int8_kernel.py index d1e74c6044de..91b56f9e0e9c 100644 --- a/python/sglang/srt/layers/quantization/int8_kernel.py +++ b/python/sglang/srt/layers/quantization/int8_kernel.py @@ -22,7 +22,8 @@ def _per_token_quant_int8( x = tl.load(x_ptr + row_id * stride_x + cols, mask=mask, other=0.0).to(tl.float32) absmax = tl.maximum(tl.max(tl.abs(x)), 1e-10) scale_x = absmax / 127 - x_q = tl.extra.cuda.libdevice.round(x / scale_x).to(tl.int8) + x_q = x * (127 / absmax) + x_q = tl.extra.cuda.libdevice.round(x_q).to(tl.int8) tl.store(xq_ptr + row_id * stride_xq + cols, x_q, mask=mask) tl.store(scale_ptr + row_id, scale_x) From 6249e4a19ed66afa100d55fa41997b725ff4b296 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Mon, 13 Jan 2025 04:44:39 -0800 Subject: [PATCH 040/248] Revert "Integration of TurboMind AWQ" (#2866) --- python/pyproject.toml | 2 +- python/sglang/srt/configs/model_config.py | 10 +- python/sglang/srt/layers/linear.py | 1 - .../srt/layers/quantization/__init__.py | 2 - .../srt/layers/quantization/awq_turbomind.py | 287 ------------------ .../layers/quantization/turbomind_utils.py | 63 ---- python/sglang/srt/server_args.py | 1 - test/srt/test_turbomind_awq.py | 47 --- 8 files changed, 2 insertions(+), 411 deletions(-) delete mode 100644 python/sglang/srt/layers/quantization/awq_turbomind.py delete mode 100644 python/sglang/srt/layers/quantization/turbomind_utils.py delete mode 100644 test/srt/test_turbomind_awq.py diff --git a/python/pyproject.toml b/python/pyproject.toml index c29580b50b1b..a236469a17c8 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -28,7 +28,7 @@ runtime_common = [ srt = [ "sglang[runtime_common]", "cuda-python", "sgl-kernel>=0.0.2.post11", "torch", "vllm>=0.6.3.post1,<=0.6.4.post1", - "flashinfer==0.1.6", "turbomind" + "flashinfer==0.1.6" ] # HIP (Heterogeneous-computing Interface for Portability) for AMD diff --git a/python/sglang/srt/configs/model_config.py b/python/sglang/srt/configs/model_config.py index 28144f139958..072c88b04a78 100644 --- a/python/sglang/srt/configs/model_config.py +++ b/python/sglang/srt/configs/model_config.py @@ -14,7 +14,6 @@ import json import logging -import sys from enum import IntEnum, auto from typing import List, Optional, Set, Union @@ -231,7 +230,7 @@ def _verify_quantization(self) -> None: # Parse quantization method from the HF model config, if available. 
quant_cfg = self._parse_quant_hf_config() - if quant_cfg is not None and not quantization_in_turbomind(self.quantization): + if quant_cfg is not None: quant_method = quant_cfg.get("quant_method", "").lower() # Detect which checkpoint is it @@ -402,10 +401,3 @@ def is_multimodal_model(model_architectures: List[str]): def is_encoder_decoder_model(model_architectures: List[str]): return "MllamaForConditionalGeneration" in model_architectures - - -def quantization_in_turbomind(quantization: str) -> bool: - if quantization in ["awq_turbomind"]: - return True - else: - return False diff --git a/python/sglang/srt/layers/linear.py b/python/sglang/srt/layers/linear.py index 815255d5c167..ee9386c13fa3 100644 --- a/python/sglang/srt/layers/linear.py +++ b/python/sglang/srt/layers/linear.py @@ -48,7 +48,6 @@ "GPTQLinearMethod", "FBGEMMFp8LinearMethod", "ModelOptFp8LinearMethod", - "AWQTurbomindLinearMethod", "IPEXAWQLinearMethod", ] diff --git a/python/sglang/srt/layers/quantization/__init__.py b/python/sglang/srt/layers/quantization/__init__.py index faf14d6fdd6b..35b0c4d94edb 100644 --- a/python/sglang/srt/layers/quantization/__init__.py +++ b/python/sglang/srt/layers/quantization/__init__.py @@ -20,7 +20,6 @@ from vllm.model_executor.layers.quantization.qqq import QQQConfig from vllm.model_executor.layers.quantization.tpu_int8 import Int8TpuConfig -from sglang.srt.layers.quantization.awq_turbomind import AWQTurbomindConfig from sglang.srt.layers.quantization.base_config import QuantizationConfig from sglang.srt.layers.quantization.fp8 import Fp8Config from sglang.srt.layers.quantization.modelopt_quant import ModelOptFp8Config @@ -38,7 +37,6 @@ "gptq_marlin_24": GPTQMarlin24Config, "gptq_marlin": GPTQMarlinConfig, "awq_marlin": AWQMarlinConfig, - "awq_turbomind": AWQTurbomindConfig, "gptq": GPTQConfig, "compressed-tensors": CompressedTensorsConfig, "bitsandbytes": BitsAndBytesConfig, diff --git a/python/sglang/srt/layers/quantization/awq_turbomind.py b/python/sglang/srt/layers/quantization/awq_turbomind.py deleted file mode 100644 index 007b20420973..000000000000 --- a/python/sglang/srt/layers/quantization/awq_turbomind.py +++ /dev/null @@ -1,287 +0,0 @@ -import logging -import os -import sys -from typing import Any, Dict, List, Optional - -import torch -import turbomind -from torch.nn import Parameter - -turbomind_dir = os.path.split(turbomind.__file__)[0] -sys.path.append(os.path.join(turbomind_dir, "lib")) -import _turbomind_ext -from vllm.model_executor.layers.linear import LinearBase - -from sglang.srt.layers.linear import LinearMethodBase, UnquantizedLinearMethod -from sglang.srt.layers.parameter import GroupQuantScaleParameter, PackedvLLMParameter -from sglang.srt.layers.quantization.base_config import ( - QuantizationConfig, - QuantizeMethodBase, -) -from sglang.srt.layers.quantization.turbomind_utils import ( - get_u4_slices, - is_layer_skipped_awq, - pack_u4_row, - unpack_awq_gemm, - verify_turbomind_supported, -) -from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead -from sglang.srt.utils import is_cuda, set_weight_attrs - -logger = logging.getLogger(__name__) - - -class AWQTurbomindConfig(QuantizationConfig): - """Config class for AWQ Turbomind""" - - def __init__( - self, - weight_bits: int, - group_size: int, - zero_point: bool, - lm_head_quantized: bool, - modules_to_not_convert: Optional[List[str]] = None, - ) -> None: - self.pack_factor = 32 // weight_bits # packed into int32 - self.group_size = group_size - self.zero_point = zero_point - self.lm_head_quantized = 
lm_head_quantized - self.weight_bits = weight_bits - self.modules_to_not_convert = modules_to_not_convert or [] - - verify_turbomind_supported(self.weight_bits, self.group_size) - - def __repr__(self) -> str: - return ( - f"AWQTurbomindConfig(weight_bits={self.weight_bits}, " - f"group_size={self.group_size}, " - f"zero_point={self.zero_point}, " - f"lm_head_quantized={self.lm_head_quantized}, " - f"modules_to_not_convert={self.modules_to_not_convert})" - ) - - @classmethod - def get_name(cls) -> str: - return "awq_turbomind" - - @classmethod - def get_supported_act_dtypes(cls) -> List[torch.dtype]: - return [torch.half, torch.bfloat16] - - @classmethod - def get_min_capability(cls) -> int: - return 70 - - @classmethod - def get_config_filenames(cls) -> List[str]: - return ["quantize_config.json"] - - @classmethod - def from_config(cls, config: Dict[str, Any]) -> "AWQTurbomindConfig": - weight_bits = cls.get_from_keys(config, ["bits"]) - group_size = cls.get_from_keys(config, ["group_size"]) - zero_point = cls.get_from_keys(config, ["zero_point"]) - lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], default=False) - modules_to_not_convert = cls.get_from_keys_or( - config, ["modules_to_not_convert"], None - ) - return cls( - weight_bits, - group_size, - zero_point, - lm_head_quantized, - modules_to_not_convert, - ) - - @classmethod - def override_quantization_method(cls, hf_quant_cfg, user_quant) -> Optional[str]: - can_convert = cls.is_awq_turbomind_compatible(hf_quant_cfg) - is_valid_user_quant = user_quant is None or user_quant == "awq_turbomind" - - if can_convert and is_valid_user_quant: - msg = f"The model is convertible to {cls.get_name()} during runtime. Using {cls.get_name()} kernel." - logger.info(msg) - return cls.get_name() - - if can_convert and user_quant == "awq": - logger.info( - "Detected that the model can run with awq_turbomind" - ", however you specified quantization=awq explicitly," - " so forcing awq. Use quantization=awq_turbomind for" - " faster inference" - ) - return None - - def get_quant_method( - self, layer: torch.nn.Module, prefix: str - ) -> Optional["QuantizeMethodBase"]: - if isinstance(layer, LinearBase) or ( - isinstance(layer, ParallelLMHead) and self.lm_head_quantized - ): - if is_layer_skipped_awq(prefix, self.modules_to_not_convert): - return UnquantizedLinearMethod() - return AWQTurbomindLinearMethod(self) - - return None - - @classmethod - def is_awq_turbomind_compatible(cls, quant_config: Dict[str, Any]): - if not is_cuda(): - return False - - # Extract data from quant config. - quant_method = quant_config.get("quant_method", "").lower() - num_bits = quant_config.get("bits") - group_size = quant_config.get("group_size") - zero_point = quant_config.get("zero_point") - - if quant_method != "awq": - return False - - # If we cannot find the info needed in the config, cannot convert. - if num_bits is None or group_size is None or zero_point is None: - return False - - return verify_turbomind_supported(quant_bit=num_bits, group_size=group_size) - - def get_scaled_act_names(self) -> List[str]: - return [] - - -class AWQTurbomindLinearMethod(LinearMethodBase): - """Linear method for AWQ Turbomind. - - Args: - quant_config: The AWQ Turbomind quantization config. 
- """ - - def __init__(self, quant_config: AWQTurbomindConfig) -> None: - self.quant_config = quant_config - - def create_weights( - self, - layer: torch.nn.Module, - input_size_per_partition: int, - output_partition_sizes: List[int], - input_size: int, - output_size: int, - params_dtype: torch.dtype, - **extra_weight_attrs, - ) -> None: - - output_size_per_partition = sum(output_partition_sizes) - weight_loader = extra_weight_attrs.get("weight_loader") - - # Normalize group_size - if self.quant_config.group_size != -1: - group_size = self.quant_config.group_size - else: - group_size = input_size - - qweight = PackedvLLMParameter( - data=torch.empty( - input_size_per_partition, - output_size_per_partition // self.quant_config.pack_factor, - dtype=torch.int32, - ), - input_dim=0, - output_dim=1, - packed_dim=1, - packed_factor=self.quant_config.pack_factor, - weight_loader=weight_loader, - ) - - num_groups = input_size_per_partition // group_size - - qzeros = PackedvLLMParameter( - data=torch.empty( - num_groups, - output_size_per_partition // self.quant_config.pack_factor, - dtype=torch.int32, - ), - input_dim=0, - output_dim=1, - packed_dim=1, - packed_factor=self.quant_config.pack_factor, - weight_loader=weight_loader, - ) - - scales = GroupQuantScaleParameter( - data=torch.empty( - num_groups, - output_size_per_partition, - dtype=params_dtype, - ), - input_dim=0, - output_dim=1, - weight_loader=weight_loader, - ) - - layer.register_parameter("qweight", qweight) - layer.register_parameter("qzeros", qzeros) - layer.register_parameter("scales", scales) - - layer.input_size_per_partition = input_size_per_partition - layer.output_size_per_partition = output_size_per_partition - layer.num_groups = num_groups - - def process_weights_after_loading(self, layer: torch.nn.Module) -> None: - - qweight_turbomind = unpack_awq_gemm(layer.qweight.data) - qzeros_turbomind = unpack_awq_gemm(layer.qzeros.data) - scales_turbomind = layer.scales.data - - qweight_turbomind = pack_u4_row(qweight_turbomind) - qzeros_turbomind = qzeros_turbomind.to(torch.half) - - device_id = layer.qweight.device.index - properties = torch.cuda.get_device_properties(device_id) - - def is_16xx_series(name): - import re - - pattern = r"GTX 16\d\d" - return bool(re.search(pattern, name)) - - simt = is_16xx_series(properties.name) - qweight_turbomind = qweight_turbomind.contiguous() - scales_turbomind = scales_turbomind.contiguous() - qzeros_turbomind = qzeros_turbomind.contiguous() - - self.linear = _turbomind_ext.Linear( - layer.input_size_per_partition, - layer.output_size_per_partition, - self.quant_config.weight_bits, - self.quant_config.group_size, - ) - - self.linear.post_init( - qweight_turbomind, scales_turbomind, qzeros_turbomind, simt - ) - - layer.qweight = Parameter(qweight_turbomind, requires_grad=False) - layer.scales = Parameter(scales_turbomind, requires_grad=False) - layer.qzeros = Parameter(qzeros_turbomind, requires_grad=False) - - def apply( - self, - layer: torch.nn.Module, - x: torch.Tensor, - bias: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - - x = x.view(-1, x.shape[-1]) - out_shape = x.shape[:-1] + (layer.output_size_per_partition,) - out = torch.empty( - (x.shape[0], layer.output_size_per_partition), - dtype=torch.float16, - device=x.device, - ) - stream = torch.cuda.current_stream() - - self.linear.forward(x, out, stream.cuda_stream) - out = torch.from_dlpack(out) - if bias is not None: - out.add_(bias) - - return out.view(out_shape) diff --git 
a/python/sglang/srt/layers/quantization/turbomind_utils.py b/python/sglang/srt/layers/quantization/turbomind_utils.py deleted file mode 100644 index b8d4b97d00d2..000000000000 --- a/python/sglang/srt/layers/quantization/turbomind_utils.py +++ /dev/null @@ -1,63 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. - -from typing import List - -import torch - -from sglang.srt.utils import get_device_capability - - -def get_u4_slices(x: torch.Tensor, dtype: torch.dtype) -> List[torch.Tensor]: - assert x.dtype == torch.int32 - xs = [] - for _ in range(8): - xs.append((x & 15).to(dtype)) - x = x >> 4 - return xs - - -def unpack_awq_gemm(x: torch.Tensor) -> torch.Tensor: - """ - The int4 weights are packed into int32: - bit: 31-28 27-24 23-20 19-16 15-12 11-8 7-4 3-0 - weight: int4_1 int4_2 int4_3 int4_4 int4_5 int4_6 int4_7 int4_8 - """ - xs = get_u4_slices(x, torch.uint8) - order = [0, 4, 1, 5, 2, 6, 3, 7] - ys = [xs[i] for i in order] - return torch.stack(ys, dim=-1).view(*x.shape[:-1], -1) - - -def pack_u4_row(x: torch.Tensor) -> torch.Tensor: - assert x.dtype == torch.uint8 - xs = x.view(*x.shape[:-1], -1, 8).split(1, dim=-1) - a = torch.zeros(xs[0].shape, dtype=torch.int32, device=x.device) - for t in reversed(xs): - a = (a << 4) | t - return a.squeeze(dim=-1) - - -def verify_turbomind_supported(quant_bit: int, group_size: int) -> bool: - - if quant_bit not in [4]: - raise NotImplementedError( - f"[Tubomind] Only 4-bit is supported for now, but got {quant_bit} bit" - ) - if group_size != 128: - raise NotImplementedError( - f"[Tubomind] Only group_size 128 is supported for now, " - f"but got group_size {group_size}" - ) - - major, minor = get_device_capability() - capability = major * 10 + minor - if capability < 70: - raise NotImplementedError( - f"[Tubomind] Only capability >= 70 is supported for now, but got {capability}" - ) - - return True - - -def is_layer_skipped_awq(prefix: str, modules_to_not_convert: List[str]): - return any(module_name in prefix for module_name in modules_to_not_convert) diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index 061d320ef47b..be85a3670d40 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -375,7 +375,6 @@ def add_cli_args(parser: argparse.ArgumentParser): "marlin", "gptq_marlin", "awq_marlin", - "awq_turbomind", "bitsandbytes", "gguf", "modelopt", diff --git a/test/srt/test_turbomind_awq.py b/test/srt/test_turbomind_awq.py deleted file mode 100644 index fa2a879d4ff2..000000000000 --- a/test/srt/test_turbomind_awq.py +++ /dev/null @@ -1,47 +0,0 @@ -import unittest -from types import SimpleNamespace - -from sglang.srt.utils import kill_process_tree -from sglang.test.run_eval import run_eval -from sglang.test.test_utils import ( - DEFAULT_MODEL_NAME_FOR_TEST, - DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - DEFAULT_URL_FOR_TEST, - popen_launch_server, -) - - -class TestMLA(unittest.TestCase): - @classmethod - def setUpClass(cls): - cls.model = DEFAULT_MODEL_NAME_FOR_TEST - cls.base_url = DEFAULT_URL_FOR_TEST - cls.process = popen_launch_server( - cls.model, - cls.base_url, - timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - other_args=[ - "--quantization", - "awq_turbomind", - ], - ) - - @classmethod - def tearDownClass(cls): - kill_process_tree(cls.process.pid) - - def test_mgsm_en(self): - args = SimpleNamespace( - base_url=self.base_url, - model=self.model, - eval_name="mgsm_en", - num_examples=None, - num_threads=1024, - ) - - metrics = run_eval(args) - assert metrics["score"] >= 0.5 - - -if 
__name__ == "__main__": - unittest.main() From 3b141e15097d5e436f0c5ded65a364aba3d7c043 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Mon, 13 Jan 2025 04:51:56 -0800 Subject: [PATCH 041/248] Dump requests (#2862) --- .../sglang/srt/managers/tokenizer_manager.py | 28 +++++++++++++++++++ python/sglang/srt/server_args.py | 10 +++++-- 2 files changed, 36 insertions(+), 2 deletions(-) diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py index fb6202932f0f..d12ed8c575b8 100644 --- a/python/sglang/srt/managers/tokenizer_manager.py +++ b/python/sglang/srt/managers/tokenizer_manager.py @@ -18,10 +18,12 @@ import dataclasses import logging import os +import pickle import signal import sys import time import uuid +from datetime import datetime from typing import Any, Awaitable, Dict, Generic, List, Optional, Tuple, TypeVar, Union import fastapi @@ -105,6 +107,7 @@ def __init__( # Parse args self.server_args = server_args self.enable_metrics = server_args.enable_metrics + self.dump_requsts_folder = server_args.dump_requests_folder # Init inter-process communication context = zmq.asyncio.Context(2) @@ -163,6 +166,7 @@ def __init__( # Store states self.to_create_loop = True self.rid_to_state: Dict[str, ReqState] = {} + self.dump_request_list: List[Tuple] = [] # The event to notify the weight sync is finished. self.model_update_lock = RWLock() @@ -680,6 +684,9 @@ async def handle_loop(self): if self.enable_metrics: self.collect_metrics(state, recv_obj, i) + if self.dump_requsts_folder and state.finished: + self.dump_requests(state, out_dict) + elif isinstance(recv_obj, OpenSessionReqOutput): self.session_futures[recv_obj.session_id].set_result( recv_obj.session_id if recv_obj.success else None @@ -818,6 +825,27 @@ def collect_metrics(self, state: ReqState, recv_obj: BatchStrOut, i: int): (time.time() - state.created_time) / completion_tokens ) + def dump_requests(self, state: ReqState, out_dict: dict): + self.dump_request_list.append( + (state.obj, out_dict, state.created_time, time.time()) + ) + + if len(self.dump_request_list) > int( + os.environ.get("SGLANG_DUMP_REQUESTS_THRESHOLD", "1000") + ): + to_dump = self.dump_request_list + self.dump_request_list = [] + + def background_task(): + os.makedirs(self.dump_requsts_folder, exist_ok=True) + current_time = datetime.now() + filename = current_time.strftime("%Y-%m-%d_%H-%M-%S") + ".pkl" + with open(os.path.join(self.dump_requsts_folder, filename), "wb") as f: + pickle.dump(to_dump, f) + + # Schedule the task to run in the background without awaiting it + asyncio.create_task(asyncio.to_thread(background_task)) + class SignalHandler: def __init__(self, tokenizer_manager): diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index be85a3670d40..e5c423a35188 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -23,7 +23,6 @@ import torch from sglang.srt.hf_transformers_utils import check_gguf_file -from sglang.srt.speculative.spec_info import SpeculativeAlgorithm from sglang.srt.utils import ( get_amdgpu_memory_capacity, get_hpu_memory_capacity, @@ -89,6 +88,7 @@ class ServerArgs: show_time_cost: bool = False enable_metrics: bool = False decode_log_interval: int = 40 + dump_requests_folder: str = "" # API related api_key: Optional[str] = None @@ -554,7 +554,13 @@ def add_cli_args(parser: argparse.ArgumentParser): "--decode-log-interval", type=int, default=ServerArgs.decode_log_interval, - help="The log interval of decode batch", + 
help="The log interval of decode batch.", + ) + parser.add_argument( + "--dump-requests-folder", + type=str, + default=ServerArgs.decode_log_interval, + help="Dump raw requests to a folder for replay.", ) # API related From 336ff5b9f564a1af2d8b4f1a22caf4c17c0cbbdc Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Mon, 13 Jan 2025 05:13:02 -0800 Subject: [PATCH 042/248] Fix typos in io_struct.py (#2867) --- python/sglang/srt/managers/io_struct.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/python/sglang/srt/managers/io_struct.py b/python/sglang/srt/managers/io_struct.py index 6ddc0993f9d7..26b8921c493f 100644 --- a/python/sglang/srt/managers/io_struct.py +++ b/python/sglang/srt/managers/io_struct.py @@ -355,9 +355,6 @@ class BatchStrOut: output_strs: List[str] # Token counts - # real input and output tokens can be get from - # origin_input_ids and output_ids by enabling --return_token_ids - # TODO (Shuai): Rename this to clarify the meaning. prompt_tokens: List[int] completion_tokens: List[int] cached_tokens: List[int] From d855653bd42ad8b037a6843e53171a6bb21ea420 Mon Sep 17 00:00:00 2001 From: Yineng Zhang Date: Mon, 13 Jan 2025 21:18:39 +0800 Subject: [PATCH 043/248] minor: fix release docs (#2868) --- .github/workflows/release-docs.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/release-docs.yml b/.github/workflows/release-docs.yml index 44bdfa0fa1ab..c200f5313e65 100644 --- a/.github/workflows/release-docs.yml +++ b/.github/workflows/release-docs.yml @@ -49,7 +49,7 @@ jobs: cd _build/html git clone https://$GITHUB_TOKEN@github.com/sgl-project/sgl-project.github.io.git ../sgl-project.github.io --depth 1 - find ../sgl-project.github.io/ -mindepth 1 -not -name CNAME -delete + find ../sgl-project.github.io/ -mindepth 1 -not -path "../sgl-project.github.io/.git*" -not -name CNAME -delete cp -r * ../sgl-project.github.io cp ../../README.md ../sgl-project.github.io/README.md cd ../sgl-project.github.io From 6ec75e626d5949dfca49069cd778cd4eb29d02b1 Mon Sep 17 00:00:00 2001 From: Lzhang-hub <57925599+Lzhang-hub@users.noreply.github.com> Date: Mon, 13 Jan 2025 21:29:33 +0800 Subject: [PATCH 044/248] add qwen2 eagle model (#2863) --- python/sglang/srt/models/qwen2.py | 11 ++ python/sglang/srt/models/qwen2_eagle.py | 131 ++++++++++++++++++++++++ 2 files changed, 142 insertions(+) create mode 100644 python/sglang/srt/models/qwen2_eagle.py diff --git a/python/sglang/srt/models/qwen2.py b/python/sglang/srt/models/qwen2.py index 2a20d6c50de1..e42559bbc00c 100644 --- a/python/sglang/srt/models/qwen2.py +++ b/python/sglang/srt/models/qwen2.py @@ -362,5 +362,16 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + def get_embed_and_head(self): + return self.model.embed_tokens.weight, self.lm_head.weight + + def set_embed_and_head(self, embed, head): + del self.model.embed_tokens.weight + del self.lm_head.weight + self.model.embed_tokens.weight = embed + self.lm_head.weight = head + torch.cuda.empty_cache() + torch.cuda.synchronize() + EntryClass = Qwen2ForCausalLM diff --git a/python/sglang/srt/models/qwen2_eagle.py b/python/sglang/srt/models/qwen2_eagle.py new file mode 100644 index 000000000000..01069ef482cd --- /dev/null +++ b/python/sglang/srt/models/qwen2_eagle.py @@ -0,0 +1,131 @@ +""" +Copyright 2023-2024 SGLang Team +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in 
compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +# Adapted from +# https://github.com/SafeAILab/EAGLE/blob/main/eagle/model/cnets.py +"""Inference-only LLaMA-EAGLE model compatible with HuggingFace weights.""" + +from typing import Iterable, Optional, Tuple + +import torch +from torch import nn + +from sglang.srt.layers.logits_processor import LogitsProcessor +from sglang.srt.layers.quantization.base_config import QuantizationConfig +from sglang.srt.layers.vocab_parallel_embedding import ( + ParallelLMHead, + VocabParallelEmbedding, +) +from sglang.srt.model_executor.forward_batch_info import ForwardBatch +from sglang.srt.models.qwen2 import Qwen2DecoderLayer, Qwen2ForCausalLM + +Qwen2Config = None + + +class Qwen2DecoderLayer(Qwen2DecoderLayer): + def __init__( + self, + config: Qwen2Config, + layer_id: int = 0, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__(config, layer_id, quant_config) + + # Skip the input_layernorm + # https://github.com/SafeAILab/EAGLE/blob/35c78f6cdc19a73e05cf5c330b4c358dad970c6a/eagle/model/cnets.py#L427 + if layer_id == 0: + del self.input_layernorm + setattr(self, "input_layernorm", lambda x: x) + + +class Qwen2Model(nn.Module): + def __init__( + self, + config: Qwen2Config, + quant_config: Optional[QuantizationConfig] = None, + ) -> None: + super().__init__() + self.config = config + self.vocab_size = config.vocab_size + self.embed_tokens = VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + ) + self.layers = nn.ModuleList( + [ + Qwen2DecoderLayer( + config, i, quant_config=quant_config, prefix=f"model.layers.{i}" + ) + for i in range(config.num_hidden_layers) + ] + ) + self.fc = torch.nn.Linear(config.hidden_size * 2, config.hidden_size) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + forward_batch: ForwardBatch, + input_embeds: torch.Tensor = None, + ) -> torch.Tensor: + if input_embeds is None: + hidden_states = self.embed_tokens(input_ids) + else: + hidden_states = input_embeds + + hidden_states = self.fc( + torch.cat((hidden_states, forward_batch.spec_info.hidden_states), dim=-1) + ) + + residual = None + for i in range(len(self.layers)): + layer = self.layers[i] + hidden_states, residual = layer( + positions, + hidden_states, + forward_batch, + residual, + ) + return hidden_states + residual + + +class Qwen2ForCausalLMEagle(Qwen2ForCausalLM): + def __init__( + self, + config: Qwen2Config, + quant_config: Optional[QuantizationConfig] = None, + cache_config=None, + ) -> None: + nn.Module.__init__(self) + self.config = config + self.quant_config = quant_config + self.model = Qwen2Model(config, quant_config=quant_config) + if self.config.tie_word_embeddings: + self.lm_head = self.model.embed_tokens + else: + self.lm_head = ParallelLMHead( + config.vocab_size, config.hidden_size, quant_config=quant_config + ) + self.logits_processor = LogitsProcessor(config) + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + for name, loaded_weight in weights: + if "lm_head" not in name: + name = "model." 
+ name + super().load_weights([(name, loaded_weight)]) + + +EntryClass = [Qwen2ForCausalLMEagle] From c1e097ca669838f2bc09655612cc9d38fc55a275 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Mon, 13 Jan 2025 06:21:25 -0800 Subject: [PATCH 045/248] Revert "Dump requests to a folder" (#2869) --- .../sglang/srt/managers/tokenizer_manager.py | 28 ------------------- python/sglang/srt/server_args.py | 10 ++----- 2 files changed, 2 insertions(+), 36 deletions(-) diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py index d12ed8c575b8..fb6202932f0f 100644 --- a/python/sglang/srt/managers/tokenizer_manager.py +++ b/python/sglang/srt/managers/tokenizer_manager.py @@ -18,12 +18,10 @@ import dataclasses import logging import os -import pickle import signal import sys import time import uuid -from datetime import datetime from typing import Any, Awaitable, Dict, Generic, List, Optional, Tuple, TypeVar, Union import fastapi @@ -107,7 +105,6 @@ def __init__( # Parse args self.server_args = server_args self.enable_metrics = server_args.enable_metrics - self.dump_requsts_folder = server_args.dump_requests_folder # Init inter-process communication context = zmq.asyncio.Context(2) @@ -166,7 +163,6 @@ def __init__( # Store states self.to_create_loop = True self.rid_to_state: Dict[str, ReqState] = {} - self.dump_request_list: List[Tuple] = [] # The event to notify the weight sync is finished. self.model_update_lock = RWLock() @@ -684,9 +680,6 @@ async def handle_loop(self): if self.enable_metrics: self.collect_metrics(state, recv_obj, i) - if self.dump_requsts_folder and state.finished: - self.dump_requests(state, out_dict) - elif isinstance(recv_obj, OpenSessionReqOutput): self.session_futures[recv_obj.session_id].set_result( recv_obj.session_id if recv_obj.success else None @@ -825,27 +818,6 @@ def collect_metrics(self, state: ReqState, recv_obj: BatchStrOut, i: int): (time.time() - state.created_time) / completion_tokens ) - def dump_requests(self, state: ReqState, out_dict: dict): - self.dump_request_list.append( - (state.obj, out_dict, state.created_time, time.time()) - ) - - if len(self.dump_request_list) > int( - os.environ.get("SGLANG_DUMP_REQUESTS_THRESHOLD", "1000") - ): - to_dump = self.dump_request_list - self.dump_request_list = [] - - def background_task(): - os.makedirs(self.dump_requsts_folder, exist_ok=True) - current_time = datetime.now() - filename = current_time.strftime("%Y-%m-%d_%H-%M-%S") + ".pkl" - with open(os.path.join(self.dump_requsts_folder, filename), "wb") as f: - pickle.dump(to_dump, f) - - # Schedule the task to run in the background without awaiting it - asyncio.create_task(asyncio.to_thread(background_task)) - class SignalHandler: def __init__(self, tokenizer_manager): diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index e5c423a35188..be85a3670d40 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -23,6 +23,7 @@ import torch from sglang.srt.hf_transformers_utils import check_gguf_file +from sglang.srt.speculative.spec_info import SpeculativeAlgorithm from sglang.srt.utils import ( get_amdgpu_memory_capacity, get_hpu_memory_capacity, @@ -88,7 +89,6 @@ class ServerArgs: show_time_cost: bool = False enable_metrics: bool = False decode_log_interval: int = 40 - dump_requests_folder: str = "" # API related api_key: Optional[str] = None @@ -554,13 +554,7 @@ def add_cli_args(parser: argparse.ArgumentParser): "--decode-log-interval", type=int, 
default=ServerArgs.decode_log_interval, - help="The log interval of decode batch.", - ) - parser.add_argument( - "--dump-requests-folder", - type=str, - default=ServerArgs.decode_log_interval, - help="Dump raw requests to a folder for replay.", + help="The log interval of decode batch", ) # API related From d08c77c434981534267d13ef78c22a817ac08775 Mon Sep 17 00:00:00 2001 From: Xiaoyu Zhang <35585791+BBuf@users.noreply.github.com> Date: Mon, 13 Jan 2025 23:09:00 +0800 Subject: [PATCH 046/248] Sampling penalties memory interface (#2870) --- ... benchmark_deepseekv3_moe_align_blocks.py} | 3 +- python/pyproject.toml | 2 +- .../penalizers/repetition_penalty.py | 20 ++- .../srt/sampling/sampling_batch_info.py | 19 ++- python/sglang/srt/utils.py | 4 + .../benchmark_sampling_scaling_penalties.py | 159 ++++++++++++++++++ sgl-kernel/tests/test_moe_align.py | 95 +++++++---- 7 files changed, 256 insertions(+), 46 deletions(-) rename benchmark/kernels/fused_moe_triton/{benchmark_moe_align_blocks.py => benchmark_deepseekv3_moe_align_blocks.py} (98%) create mode 100644 sgl-kernel/benchmark/benchmark_sampling_scaling_penalties.py diff --git a/benchmark/kernels/fused_moe_triton/benchmark_moe_align_blocks.py b/benchmark/kernels/fused_moe_triton/benchmark_deepseekv3_moe_align_blocks.py similarity index 98% rename from benchmark/kernels/fused_moe_triton/benchmark_moe_align_blocks.py rename to benchmark/kernels/fused_moe_triton/benchmark_deepseekv3_moe_align_blocks.py index 92547ea95ae2..0a6049a1200c 100644 --- a/benchmark/kernels/fused_moe_triton/benchmark_moe_align_blocks.py +++ b/benchmark/kernels/fused_moe_triton/benchmark_deepseekv3_moe_align_blocks.py @@ -222,8 +222,9 @@ def calculate_diff(batch_size, seq_len): def benchmark(batch_size, seq_len, provider): num_experts = 256 block_size = 128 + topk = 8 topk_ids = torch.randint( - 0, num_experts, (batch_size, seq_len), dtype=torch.int32, device="cuda" + 0, num_experts, (batch_size * seq_len, topk), dtype=torch.int32, device="cuda" ) max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1) diff --git a/python/pyproject.toml b/python/pyproject.toml index a236469a17c8..4b627ae94785 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -27,7 +27,7 @@ runtime_common = [ ] srt = [ "sglang[runtime_common]", "cuda-python", - "sgl-kernel>=0.0.2.post11", "torch", "vllm>=0.6.3.post1,<=0.6.4.post1", + "sgl-kernel>=0.0.2.post12", "torch", "vllm>=0.6.3.post1,<=0.6.4.post1", "flashinfer==0.1.6" ] diff --git a/python/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py b/python/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py index 4c293b89520d..fcd5ff71c233 100644 --- a/python/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +++ b/python/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py @@ -3,6 +3,11 @@ import torch from sglang.srt.sampling.penaltylib.orchestrator import _BatchedPenalizer, _TokenIDs +from sglang.srt.utils import is_cuda_available + +is_cuda = is_cuda_available() +if is_cuda: + from sgl_kernel import sampling_scaling_penalties class BatchedRepetitionPenalizer(_BatchedPenalizer): @@ -56,11 +61,16 @@ def _cumulate_output_tokens(self, output_ids: _TokenIDs): self.cumulated_repetition_penalties[mask] = self.repetition_penalties[mask] def _apply(self, logits: torch.Tensor) -> torch.Tensor: - return torch.where( - logits > 0, - logits / self.cumulated_repetition_penalties, - logits * self.cumulated_repetition_penalties, - ) + if is_cuda: + return sampling_scaling_penalties( + 
logits, self.cumulated_repetition_penalties + ) + else: + return torch.where( + logits > 0, + logits / self.cumulated_repetition_penalties, + logits * self.cumulated_repetition_penalties, + ) def _filter(self, indices_to_keep: List[int], indices_tensor_to_keep: torch.Tensor): self.repetition_penalties = self.repetition_penalties[indices_tensor_to_keep] diff --git a/python/sglang/srt/sampling/sampling_batch_info.py b/python/sglang/srt/sampling/sampling_batch_info.py index 9497e53d3092..6eda63c706a3 100644 --- a/python/sglang/srt/sampling/sampling_batch_info.py +++ b/python/sglang/srt/sampling/sampling_batch_info.py @@ -7,6 +7,12 @@ import torch +from sglang.srt.utils import is_cuda_available + +is_cuda = is_cuda_available() +if is_cuda: + from sgl_kernel import sampling_scaling_penalties + import sglang.srt.sampling.penaltylib as penaltylib logger = logging.getLogger(__name__) @@ -245,11 +251,14 @@ def apply_logits_bias(self, logits: torch.Tensor): # repetition if self.scaling_penalties is not None: - logits[:] = torch.where( - logits > 0, - logits / self.scaling_penalties, - logits * self.scaling_penalties, - ) + if is_cuda: + logits[:] = sampling_scaling_penalties(logits, self.scaling_penalties) + else: + logits[:] = torch.where( + logits > 0, + logits / self.scaling_penalties, + logits * self.scaling_penalties, + ) # Apply regex vocab_mask if self.vocab_mask is not None: diff --git a/python/sglang/srt/utils.py b/python/sglang/srt/utils.py index 51ca91a96b0d..e70e6b42526d 100644 --- a/python/sglang/srt/utils.py +++ b/python/sglang/srt/utils.py @@ -97,6 +97,10 @@ def is_flashinfer_available(): return torch.cuda.is_available() and torch.version.cuda +def is_cuda_available(): + return torch.cuda.is_available() and torch.version.cuda + + def is_ipv6(address): try: ipaddress.IPv6Address(address) diff --git a/sgl-kernel/benchmark/benchmark_sampling_scaling_penalties.py b/sgl-kernel/benchmark/benchmark_sampling_scaling_penalties.py new file mode 100644 index 000000000000..000dab0d8e9a --- /dev/null +++ b/sgl-kernel/benchmark/benchmark_sampling_scaling_penalties.py @@ -0,0 +1,159 @@ +import itertools + +import torch +import triton +from sgl_kernel import sampling_scaling_penalties + + +def sampling_scaling_penalties_naive(logits, scaling_penalties): + return torch.where( + logits > 0, logits / scaling_penalties, logits * scaling_penalties + ) + + +def sampling_scaling_penalties_kernel(logits, scaling_penalties): + return sampling_scaling_penalties(logits, scaling_penalties) + + +def test_memory(func, _iter): + total_mem = [] + + for _ in range(_iter): + torch.cuda.memory.reset_peak_memory_stats() + func() + mem = torch.cuda.max_memory_allocated() / (2**20) + total_mem.append(mem) + + return sum(total_mem) / len(total_mem) + + +def calculate_diff(batch_size, vocab_size): + dtype = torch.bfloat16 + device = torch.device("cuda") + + logits = torch.randn(batch_size, vocab_size, device=device, dtype=dtype) + scaling_penalties = ( + torch.rand(batch_size, vocab_size, device=device, dtype=dtype) + 0.5 + ) + + output_naive = sampling_scaling_penalties_naive( + logits.clone(), scaling_penalties.clone() + ) + output_kernel = sampling_scaling_penalties_kernel( + logits.clone(), scaling_penalties.clone() + ) + + print(f"Naive output={output_naive}") + print(f"Kernel output={output_kernel}") + + if torch.allclose(output_naive, output_kernel, atol=1e-2, rtol=1e-2): + print("✅ Both implementations match") + else: + print("❌ Implementations differ") + + +batch_size_range = [2**i for i in range(0, 12)] 
+vocab_size_range = [2**i for i in range(10, 17)] +configs = list(itertools.product(batch_size_range, vocab_size_range)) + + +@triton.testing.perf_report( + triton.testing.Benchmark( + x_names=["batch_size", "vocab_size"], + x_vals=[list(_) for _ in configs], + line_arg="provider", + line_vals=["naive", "kernel"], + line_names=["PyTorch Naive", "SGL Kernel"], + styles=[("blue", "-"), ("red", "-")], + ylabel="us", + plot_name="sampling-scaling-penalties-performance", + args={}, + ) +) +def benchmark(batch_size, vocab_size, provider): + dtype = torch.bfloat16 + device = torch.device("cuda") + + logits = torch.randn(batch_size, vocab_size, device=device, dtype=dtype) + scaling_penalties = ( + torch.rand(batch_size, vocab_size, device=device, dtype=dtype) + 0.5 + ) + + quantiles = [0.5, 0.2, 0.8] + + if provider == "naive": + ms, min_ms, max_ms = triton.testing.do_bench( + lambda: sampling_scaling_penalties_naive( + logits.clone(), + scaling_penalties.clone(), + ), + quantiles=quantiles, + ) + else: + ms, min_ms, max_ms = triton.testing.do_bench( + lambda: sampling_scaling_penalties_kernel( + logits.clone(), + scaling_penalties.clone(), + ), + quantiles=quantiles, + ) + + return 1000 * ms, 1000 * max_ms, 1000 * min_ms + + +@triton.testing.perf_report( + triton.testing.Benchmark( + x_names=["batch_size", "vocab_size"], + x_vals=[list(_) for _ in configs], + line_arg="provider", + line_vals=["naive", "kernel"], + line_names=["PyTorch Naive", "SGL Kernel"], + styles=[("blue", "-"), ("red", "-")], + ylabel="GPU memory usage (MB)", + plot_name="sampling-scaling-penalties-memory", + args={}, + ) +) +def benchmark_memory(batch_size, vocab_size, provider): + dtype = torch.bfloat16 + device = torch.device("cuda") + + print( + f"Running memory benchmark with batch_size={batch_size}, vocab_size={vocab_size}, provider={provider}" + ) + + def run_kernel(): + logits = torch.randn(batch_size, vocab_size, device=device, dtype=dtype) + scaling_penalties = ( + torch.rand(batch_size, vocab_size, device=device, dtype=dtype) + 0.5 + ) + + if provider == "naive": + return sampling_scaling_penalties_naive(logits, scaling_penalties) + else: + return sampling_scaling_penalties_kernel(logits, scaling_penalties) + + mem = test_memory(run_kernel, _iter=10) + return mem + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument( + "--save_path", + type=str, + default="./configs/benchmark_ops/sampling_scaling_penalties/", + help="Path to save sampling_scaling_penalties benchmark results", + ) + args = parser.parse_args() + + # Run correctness test + calculate_diff(batch_size=4, vocab_size=4096) + + # Run performance benchmark + benchmark.run(print_data=True, save_path=args.save_path) + + # Run memory benchmark + benchmark_memory.run(print_data=True, save_path=args.save_path) diff --git a/sgl-kernel/tests/test_moe_align.py b/sgl-kernel/tests/test_moe_align.py index 92596a47e5db..2fca90b2f561 100644 --- a/sgl-kernel/tests/test_moe_align.py +++ b/sgl-kernel/tests/test_moe_align.py @@ -3,38 +3,65 @@ def test_moe_align_block_size(): + # For DeepSeek V3, we have 256 experts num_experts = 256 - block_size = 128 - topk_ids = torch.randint(0, num_experts, (3, 4), dtype=torch.int32, device="cuda") - - max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1) - sorted_ids = torch.empty( - (max_num_tokens_padded,), dtype=torch.int32, device=topk_ids.device - ) - sorted_ids.fill_(topk_ids.numel()) - max_num_m_blocks = max_num_tokens_padded // block_size - expert_ids = 
torch.empty( - (max_num_m_blocks,), dtype=torch.int32, device=topk_ids.device - ) - num_tokens_post_pad = torch.empty((1), dtype=torch.int32, device=topk_ids.device) - - token_cnts_buffer = torch.empty( - (num_experts + 1) * num_experts, dtype=torch.int32, device=topk_ids.device - ) - cumsum_buffer = torch.empty( - num_experts + 1, dtype=torch.int32, device=topk_ids.device - ) - - moe_align_block_size( - topk_ids, - num_experts, - block_size, - sorted_ids, - expert_ids, - num_tokens_post_pad, - token_cnts_buffer, - cumsum_buffer, - ) - - -test_moe_align_block_size() + + # Test different combinations of block_size, num_tokens and topk + for block_size in [32, 64, 128, 256]: + print(f"\nTesting block_size={block_size}") + for num_tokens in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096]: + for topk in [1, 2, 4, 8, 16, 32, 64]: + print( + f"Testing block_size={block_size}, num_tokens={num_tokens}, topk={topk}" + ) + + # Create random topk_ids with shape [num_tokens, topk] + topk_ids = torch.randint( + 0, num_experts, (num_tokens, topk), dtype=torch.int32, device="cuda" + ) + + max_num_tokens_padded = topk_ids.numel() + num_experts * ( + block_size - 1 + ) + sorted_ids = torch.empty( + (max_num_tokens_padded,), dtype=torch.int32, device=topk_ids.device + ) + sorted_ids.fill_(topk_ids.numel()) + max_num_m_blocks = max_num_tokens_padded // block_size + expert_ids = torch.empty( + (max_num_m_blocks,), dtype=torch.int32, device=topk_ids.device + ) + num_tokens_post_pad = torch.empty( + (1), dtype=torch.int32, device=topk_ids.device + ) + + token_cnts_buffer = torch.empty( + (num_experts + 1) * num_experts, + dtype=torch.int32, + device=topk_ids.device, + ) + cumsum_buffer = torch.empty( + num_experts + 1, dtype=torch.int32, device=topk_ids.device + ) + + try: + moe_align_block_size( + topk_ids, + num_experts, + block_size, + sorted_ids, + expert_ids, + num_tokens_post_pad, + token_cnts_buffer, + cumsum_buffer, + ) + except Exception as e: + print( + f"Error occurred with block_size={block_size}, num_tokens={num_tokens}, topk={topk}" + ) + print(f"Error message: {str(e)}") + raise e + + +if __name__ == "__main__": + test_moe_align_block_size() From 923f518337ed4ec878a215ecc6193f8634e3b785 Mon Sep 17 00:00:00 2001 From: fzyzcjy <5236035+fzyzcjy@users.noreply.github.com> Date: Tue, 14 Jan 2025 03:38:51 +0800 Subject: [PATCH 047/248] CUDA-graph-compatible releasing and resuming KV cache and model weight memory (#2630) --- python/pyproject.toml | 1 + python/sglang/srt/managers/io_struct.py | 24 +++- python/sglang/srt/managers/scheduler.py | 43 ++++++ .../sglang/srt/managers/tokenizer_manager.py | 32 +++++ python/sglang/srt/mem_cache/memory_pool.py | 130 +++++++++++------- .../sglang/srt/model_executor/model_runner.py | 22 ++- python/sglang/srt/server.py | 48 ++++++- python/sglang/srt/server_args.py | 7 +- python/sglang/torch_memory_saver_adapter.py | 59 ++++++++ scripts/ci_install_dependency.sh | 3 +- test/srt/run_suite.py | 1 + test/srt/test_release_memory_occupation.py | 98 +++++++++++++ 12 files changed, 407 insertions(+), 61 deletions(-) create mode 100644 python/sglang/torch_memory_saver_adapter.py create mode 100644 test/srt/test_release_memory_occupation.py diff --git a/python/pyproject.toml b/python/pyproject.toml index 4b627ae94785..61a36e34132e 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -44,6 +44,7 @@ srt_hpu = ["sglang[runtime_common]"] openai = ["openai>=1.0", "tiktoken"] anthropic = ["anthropic>=0.20.0"] litellm = ["litellm>=1.0.0"] +torch_memory_saver = 
["torch_memory_saver"] test = [ "jsonlines", "matplotlib", diff --git a/python/sglang/srt/managers/io_struct.py b/python/sglang/srt/managers/io_struct.py index 26b8921c493f..ec45696bf5fd 100644 --- a/python/sglang/srt/managers/io_struct.py +++ b/python/sglang/srt/managers/io_struct.py @@ -19,9 +19,7 @@ import uuid from dataclasses import dataclass from enum import Enum -from typing import Dict, List, Optional, Tuple, Union - -import torch +from typing import Dict, List, Optional, Union from sglang.srt.managers.schedule_batch import BaseFinishReason from sglang.srt.sampling.sampling_params import SamplingParams @@ -459,6 +457,26 @@ class GetWeightsByNameReqOutput: parameter: list +@dataclass +class ReleaseMemoryOccupationReqInput: + pass + + +@dataclass +class ReleaseMemoryOccupationReqOutput: + pass + + +@dataclass +class ResumeMemoryOccupationReqInput: + pass + + +@dataclass +class ResumeMemoryOccupationReqOutput: + pass + + @dataclass class AbortReq: # The request id diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index 1c07ea6adb75..b9e74aa9d93d 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -47,6 +47,10 @@ OpenSessionReqInput, OpenSessionReqOutput, ProfileReq, + ReleaseMemoryOccupationReqInput, + ReleaseMemoryOccupationReqOutput, + ResumeMemoryOccupationReqInput, + ResumeMemoryOccupationReqOutput, TokenizedEmbeddingReqInput, TokenizedGenerateReqInput, UpdateWeightFromDiskReqInput, @@ -88,6 +92,7 @@ set_random_seed, suppress_other_loggers, ) +from sglang.torch_memory_saver_adapter import TorchMemorySaverAdapter from sglang.utils import get_exception_traceback logger = logging.getLogger(__name__) @@ -357,6 +362,10 @@ def __init__( t.start() self.parent_process = psutil.Process().parent() + self.memory_saver_adapter = TorchMemorySaverAdapter.create( + enable=server_args.enable_memory_saver + ) + # Init profiler if os.getenv("SGLANG_TORCH_PROFILER_DIR", "") == "": self.profiler = None @@ -519,6 +528,12 @@ def process_input_requests(self, recv_reqs: List): elif isinstance(recv_req, GetWeightsByNameReqInput): parameter = self.get_weights_by_name(recv_req) self.send_to_tokenizer.send_pyobj(GetWeightsByNameReqOutput(parameter)) + elif isinstance(recv_req, ReleaseMemoryOccupationReqInput): + self.release_memory_occupation() + self.send_to_tokenizer.send_pyobj(ReleaseMemoryOccupationReqOutput()) + elif isinstance(recv_req, ResumeMemoryOccupationReqInput): + self.resume_memory_occupation() + self.send_to_tokenizer.send_pyobj(ResumeMemoryOccupationReqOutput()) elif isinstance(recv_req, ProfileReq): if recv_req == ProfileReq.START_PROFILE: self.start_profile() @@ -1538,6 +1553,20 @@ def get_weights_by_name(self, recv_req: GetWeightsByNameReqInput): parameter = self.tp_worker.get_weights_by_name(recv_req) return parameter + def release_memory_occupation(self): + self.stashed_model_static_state = _export_static_state( + self.tp_worker.worker.model_runner.model + ) + self.memory_saver_adapter.pause() + self.flush_cache() + + def resume_memory_occupation(self): + self.memory_saver_adapter.resume() + _import_static_state( + self.tp_worker.worker.model_runner.model, self.stashed_model_static_state + ) + del self.stashed_model_static_state + def start_profile(self) -> None: if self.profiler is None: raise RuntimeError("Profiler is not enabled.") @@ -1576,6 +1605,20 @@ def close_session(self, recv_req: CloseSessionReqInput): del self.sessions[session_id] +def _export_static_state(model): + return dict( + 
buffers=[ + (name, buffer.detach().clone()) for name, buffer in model.named_buffers() + ] + ) + + +def _import_static_state(model, static_params): + self_named_buffers = dict(model.named_buffers()) + for name, tensor in static_params["buffers"]: + self_named_buffers[name][...] = tensor + + def run_scheduler_process( server_args: ServerArgs, port_args: PortArgs, diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py index fb6202932f0f..33968e34fe47 100644 --- a/python/sglang/srt/managers/tokenizer_manager.py +++ b/python/sglang/srt/managers/tokenizer_manager.py @@ -53,6 +53,10 @@ OpenSessionReqInput, OpenSessionReqOutput, ProfileReq, + ReleaseMemoryOccupationReqInput, + ReleaseMemoryOccupationReqOutput, + ResumeMemoryOccupationReqInput, + ResumeMemoryOccupationReqOutput, SessionParams, TokenizedEmbeddingReqInput, TokenizedGenerateReqInput, @@ -188,6 +192,12 @@ def __init__( self.get_weights_by_name_communicator = _Communicator( self.send_to_scheduler, server_args.dp_size ) + self.release_memory_occupation_communicator = _Communicator( + self.send_to_scheduler, server_args.dp_size + ) + self.resume_memory_occupation_communicator = _Communicator( + self.send_to_scheduler, server_args.dp_size + ) # Metrics if self.enable_metrics: @@ -548,6 +558,22 @@ async def get_weights_by_name( else: return all_parameters + async def release_memory_occupation( + self, + obj: ReleaseMemoryOccupationReqInput, + request: Optional[fastapi.Request] = None, + ): + self.auto_create_handle_loop() + await self.release_memory_occupation_communicator(obj) + + async def resume_memory_occupation( + self, + obj: ResumeMemoryOccupationReqInput, + request: Optional[fastapi.Request] = None, + ): + self.auto_create_handle_loop() + await self.resume_memory_occupation_communicator(obj) + async def open_session( self, obj: OpenSessionReqInput, request: Optional[fastapi.Request] = None ): @@ -627,6 +653,8 @@ async def handle_loop(self): UpdateWeightsFromDistributedReqOutput, GetWeightsByNameReqOutput, InitWeightsUpdateGroupReqOutput, + ReleaseMemoryOccupationReqOutput, + ResumeMemoryOccupationReqOutput, ] = await self.recv_from_detokenizer.recv_pyobj() if isinstance(recv_obj, (BatchStrOut, BatchEmbeddingOut, BatchTokenIDOut)): @@ -709,6 +737,10 @@ async def handle_loop(self): self.update_weights_from_tensor_communicator.handle_recv(recv_obj) elif isinstance(recv_obj, GetWeightsByNameReqOutput): self.get_weights_by_name_communicator.handle_recv(recv_obj) + elif isinstance(recv_obj, ReleaseMemoryOccupationReqOutput): + self.release_memory_occupation_communicator.handle_recv(recv_obj) + elif isinstance(recv_obj, ResumeMemoryOccupationReqOutput): + self.resume_memory_occupation_communicator.handle_recv(recv_obj) else: raise ValueError(f"Invalid object: {recv_obj=}") diff --git a/python/sglang/srt/mem_cache/memory_pool.py b/python/sglang/srt/mem_cache/memory_pool.py index abee7764bebf..0761169e40e5 100644 --- a/python/sglang/srt/mem_cache/memory_pool.py +++ b/python/sglang/srt/mem_cache/memory_pool.py @@ -13,6 +13,8 @@ limitations under the License. """ +from sglang.torch_memory_saver_adapter import TorchMemorySaverAdapter + """ Memory pool. 
@@ -42,13 +44,25 @@ class ReqToTokenPool: """A memory pool that maps a request to its token locations.""" - def __init__(self, size: int, max_context_len: int, device: str, use_records: bool): + def __init__( + self, + size: int, + max_context_len: int, + device: str, + use_records: bool, + enable_memory_saver: bool, + ): + memory_saver_adapter = TorchMemorySaverAdapter.create( + enable=enable_memory_saver + ) + self.size = size self.max_context_len = max_context_len self.device = device - self.req_to_token = torch.zeros( - (size, max_context_len), dtype=torch.int32, device=device - ) + with memory_saver_adapter.region(): + self.req_to_token = torch.zeros( + (size, max_context_len), dtype=torch.int32, device=device + ) self.free_slots = list(range(size)) self.write_records = [] self.use_records = use_records @@ -189,8 +203,14 @@ def __init__( head_dim: int, layer_num: int, device: str, + enable_memory_saver: bool, ): super().__init__(size, dtype, device) + + self.memory_saver_adapter = TorchMemorySaverAdapter.create( + enable=enable_memory_saver + ) + self.head_num = head_num self.head_dim = head_dim self.layer_num = layer_num @@ -202,24 +222,25 @@ def __init__( ) def _create_buffers(self): - # [size, head_num, head_dim] for each layer - # The padded slot 0 is used for writing dummy outputs from padded tokens. - self.k_buffer = [ - torch.empty( - (self.size + 1, self.head_num, self.head_dim), - dtype=self.store_dtype, - device=self.device, - ) - for _ in range(self.layer_num) - ] - self.v_buffer = [ - torch.empty( - (self.size + 1, self.head_num, self.head_dim), - dtype=self.store_dtype, - device=self.device, - ) - for _ in range(self.layer_num) - ] + with self.memory_saver_adapter.region(): + # [size, head_num, head_dim] for each layer + # The padded slot 0 is used for writing dummy outputs from padded tokens. + self.k_buffer = [ + torch.empty( + (self.size + 1, self.head_num, self.head_dim), + dtype=self.store_dtype, + device=self.device, + ) + for _ in range(self.layer_num) + ] + self.v_buffer = [ + torch.empty( + (self.size + 1, self.head_num, self.head_dim), + dtype=self.store_dtype, + device=self.device, + ) + for _ in range(self.layer_num) + ] def _clear_buffers(self): del self.k_buffer @@ -307,19 +328,26 @@ def __init__( qk_rope_head_dim: int, layer_num: int, device: str, + enable_memory_saver: bool, ): super().__init__(size, dtype, device) self.kv_lora_rank = kv_lora_rank - # The padded slot 0 is used for writing dummy outputs from padded tokens. - self.kv_buffer = [ - torch.empty( - (size + 1, 1, kv_lora_rank + qk_rope_head_dim), - dtype=self.store_dtype, - device=device, - ) - for _ in range(layer_num) - ] + + memory_saver_adapter = TorchMemorySaverAdapter.create( + enable=enable_memory_saver + ) + + with memory_saver_adapter.region(): + # The padded slot 0 is used for writing dummy outputs from padded tokens. 
+ self.kv_buffer = [ + torch.empty( + (size + 1, 1, kv_lora_rank + qk_rope_head_dim), + dtype=self.store_dtype, + device=device, + ) + for _ in range(layer_num) + ] def get_key_buffer(self, layer_id: int): if self.store_dtype != self.dtype: @@ -360,26 +388,32 @@ def __init__( layer_num: int, device: str, heavy_channel_num: int, + enable_memory_saver: bool, ): super().__init__(size, dtype, device) - # [size, head_num, head_dim] for each layer - self.k_buffer = [ - torch.empty((size + 1, head_num, head_dim), dtype=dtype, device=device) - for _ in range(layer_num) - ] - self.v_buffer = [ - torch.empty((size + 1, head_num, head_dim), dtype=dtype, device=device) - for _ in range(layer_num) - ] - - # [size, head_num, heavy_channel_num] for each layer - self.label_buffer = [ - torch.empty( - (size + 1, head_num, heavy_channel_num), dtype=dtype, device=device - ) - for _ in range(layer_num) - ] + memory_saver_adapter = TorchMemorySaverAdapter.create( + enable=enable_memory_saver + ) + + with memory_saver_adapter.region(): + # [size, head_num, head_dim] for each layer + self.k_buffer = [ + torch.empty((size + 1, head_num, head_dim), dtype=dtype, device=device) + for _ in range(layer_num) + ] + self.v_buffer = [ + torch.empty((size + 1, head_num, head_dim), dtype=dtype, device=device) + for _ in range(layer_num) + ] + + # [size, head_num, heavy_channel_num] for each layer + self.label_buffer = [ + torch.empty( + (size + 1, head_num, heavy_channel_num), dtype=dtype, device=device + ) + for _ in range(layer_num) + ] def get_key_buffer(self, layer_id: int): return self.k_buffer[layer_id] diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index d46a2c0dc725..190427649312 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -60,6 +60,7 @@ monkey_patch_vllm_p2p_access_check, set_cpu_offload_max_bytes, ) +from sglang.torch_memory_saver_adapter import TorchMemorySaverAdapter logger = logging.getLogger(__name__) @@ -166,6 +167,10 @@ def __init__( # Get memory before model loading min_per_gpu_memory = self.init_torch_distributed() + self.memory_saver_adapter = TorchMemorySaverAdapter.create( + enable=self.server_args.enable_memory_saver + ) + # Load the model self.sampler = Sampler() self.load_model() @@ -272,11 +277,12 @@ def load_model(self): monkey_patch_vllm_gguf_config() # Load the model - self.model = get_model( - model_config=self.model_config, - load_config=self.load_config, - device_config=DeviceConfig(self.device), - ) + with self.memory_saver_adapter.region(): + self.model = get_model( + model_config=self.model_config, + load_config=self.load_config, + device_config=DeviceConfig(self.device), + ) if self.server_args.kv_cache_dtype == "fp8_e4m3": if self.server_args.quantization_param_path is not None: @@ -417,7 +423,7 @@ def init_weights_update_group( logger.info( f"init custom process group: master_address={master_address}, master_port={master_port}, " - f"rank_offset={rank_offset}, world_size={world_size}, group_name={group_name}, backend={backend}" + f"rank_offset={rank_offset}, rank={rank}, world_size={world_size}, group_name={group_name}, backend={backend}" ) try: @@ -590,6 +596,7 @@ def init_memory_pool( max_context_len=self.model_config.context_len + 4, device=self.device, use_records=False, + enable_memory_saver=self.server_args.enable_memory_saver, ) if ( self.model_config.attention_arch == AttentionArch.MLA @@ -602,6 +609,7 @@ def init_memory_pool( 
qk_rope_head_dim=self.model_config.qk_rope_head_dim, layer_num=self.model_config.num_hidden_layers, device=self.device, + enable_memory_saver=self.server_args.enable_memory_saver, ) elif self.server_args.enable_double_sparsity: self.token_to_kv_pool = DoubleSparseTokenToKVPool( @@ -612,6 +620,7 @@ def init_memory_pool( layer_num=self.model_config.num_hidden_layers, device=self.device, heavy_channel_num=self.server_args.ds_heavy_channel_num, + enable_memory_saver=self.server_args.enable_memory_saver, ) else: self.token_to_kv_pool = MHATokenToKVPool( @@ -621,6 +630,7 @@ def init_memory_pool( head_dim=self.model_config.head_dim, layer_num=self.model_config.num_hidden_layers, device=self.device, + enable_memory_saver=self.server_args.enable_memory_saver, ) logger.info( f"Memory pool end. " diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py index fa1625b09595..4e837e5389ba 100644 --- a/python/sglang/srt/server.py +++ b/python/sglang/srt/server.py @@ -31,6 +31,8 @@ import torch +from sglang.torch_memory_saver_adapter import TorchMemorySaverAdapter + # Fix a bug of Python threading setattr(threading, "_register_atexit", lambda *args, **kwargs: None) @@ -57,6 +59,8 @@ GetWeightsByNameReqInput, InitWeightsUpdateGroupReqInput, OpenSessionReqInput, + ReleaseMemoryOccupationReqInput, + ResumeMemoryOccupationReqInput, UpdateWeightFromDiskReqInput, UpdateWeightsFromDistributedReqInput, UpdateWeightsFromTensorReqInput, @@ -255,6 +259,28 @@ async def get_weights_by_name(obj: GetWeightsByNameReqInput, request: Request): return _create_error_response(e) +@app.api_route("/release_memory_occupation", methods=["GET", "POST"]) +async def release_memory_occupation( + obj: ReleaseMemoryOccupationReqInput, request: Request +): + """Release GPU occupation temporarily""" + try: + await tokenizer_manager.release_memory_occupation(obj, request) + except Exception as e: + return _create_error_response(e) + + +@app.api_route("/resume_memory_occupation", methods=["GET", "POST"]) +async def resume_memory_occupation( + obj: ResumeMemoryOccupationReqInput, request: Request +): + """Resume GPU occupation""" + try: + await tokenizer_manager.resume_memory_occupation(obj, request) + except Exception as e: + return _create_error_response(e) + + @app.api_route("/open_session", methods=["GET", "POST"]) async def open_session(obj: OpenSessionReqInput, request: Request): """Open a session, and return its unique session id.""" @@ -438,6 +464,10 @@ def launch_engine( server_args.model_path, server_args.tokenizer_path ) + memory_saver_adapter = TorchMemorySaverAdapter.create( + enable=server_args.enable_memory_saver + ) + if server_args.dp_size == 1: # Launch tensor parallel scheduler processes scheduler_procs = [] @@ -454,7 +484,8 @@ def launch_engine( target=run_scheduler_process, args=(server_args, port_args, gpu_id, tp_rank, None, writer), ) - proc.start() + with memory_saver_adapter.configure_subprocess(): + proc.start() scheduler_procs.append(proc) scheduler_pipe_readers.append(reader) @@ -471,7 +502,8 @@ def launch_engine( target=run_data_parallel_controller_process, args=(server_args, port_args, writer), ) - proc.start() + with memory_saver_adapter.configure_subprocess(): + proc.start() # Launch detokenizer process detoken_proc = mp.Process( @@ -897,6 +929,18 @@ def get_weights_by_name(self, name, truncate_size=100): loop = asyncio.get_event_loop() return loop.run_until_complete(tokenizer_manager.get_weights_by_name(obj, None)) + def release_memory_occupation(self): + """Release GPU occupation temporarily""" 
+ obj = ReleaseMemoryOccupationReqInput() + loop = asyncio.get_event_loop() + loop.run_until_complete(tokenizer_manager.release_memory_occupation(obj, None)) + + def resume_memory_occupation(self): + """Resume GPU occupation""" + obj = ResumeMemoryOccupationReqInput() + loop = asyncio.get_event_loop() + loop.run_until_complete(tokenizer_manager.resume_memory_occupation(obj, None)) + class Runtime: """ diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index be85a3670d40..4f44d5c877dc 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -23,7 +23,6 @@ import torch from sglang.srt.hf_transformers_utils import check_gguf_file -from sglang.srt.speculative.spec_info import SpeculativeAlgorithm from sglang.srt.utils import ( get_amdgpu_memory_capacity, get_hpu_memory_capacity, @@ -157,6 +156,7 @@ class ServerArgs: triton_attention_num_kv_splits: int = 8 num_continuous_decode_steps: int = 1 delete_ckpt_after_loading: bool = False + enable_memory_saver: bool = False def __post_init__(self): # Set missing default values @@ -854,6 +854,11 @@ def add_cli_args(parser: argparse.ArgumentParser): action="store_true", help="Delete the model checkpoint after loading the model.", ) + parser.add_argument( + "--enable-memory-saver", + action="store_true", + help="Allow saving memory using release_memory_occupation and resume_memory_occupation", + ) @classmethod def from_cli_args(cls, args: argparse.Namespace): diff --git a/python/sglang/torch_memory_saver_adapter.py b/python/sglang/torch_memory_saver_adapter.py new file mode 100644 index 000000000000..31f8ebf2f077 --- /dev/null +++ b/python/sglang/torch_memory_saver_adapter.py @@ -0,0 +1,59 @@ +from abc import ABC +from contextlib import contextmanager + +try: + import torch_memory_saver + + _primary_memory_saver = torch_memory_saver.TorchMemorySaver() +except ImportError: + pass + + +class TorchMemorySaverAdapter(ABC): + @staticmethod + def create(enable: bool): + return ( + _TorchMemorySaverAdapterReal() if enable else _TorchMemorySaverAdapterNoop() + ) + + def configure_subprocess(self): + raise NotImplementedError + + def region(self): + raise NotImplementedError + + def pause(self): + raise NotImplementedError + + def resume(self): + raise NotImplementedError + + +class _TorchMemorySaverAdapterReal(TorchMemorySaverAdapter): + def configure_subprocess(self): + return torch_memory_saver.configure_subprocess() + + def region(self): + return _primary_memory_saver.region() + + def pause(self): + return _primary_memory_saver.pause() + + def resume(self): + return _primary_memory_saver.resume() + + +class _TorchMemorySaverAdapterNoop(TorchMemorySaverAdapter): + @contextmanager + def configure_subprocess(self): + yield + + @contextmanager + def region(self): + yield + + def pause(self): + pass + + def resume(self): + pass diff --git a/scripts/ci_install_dependency.sh b/scripts/ci_install_dependency.sh index 26c34879e9ba..66b113f61976 100755 --- a/scripts/ci_install_dependency.sh +++ b/scripts/ci_install_dependency.sh @@ -12,8 +12,9 @@ bash "${SCRIPT_DIR}/killall_sglang.sh" pip install --upgrade pip pip install -e "python[all]" --find-links https://flashinfer.ai/whl/cu124/torch2.4/flashinfer/ -# Force reinstall flashinfer +# Force reinstall flashinfer and torch_memory_saver pip install flashinfer==0.1.6 --find-links ${FLASHINFER_REPO} --force-reinstall --no-deps +pip install torch_memory_saver --force-reinstall pip install transformers==4.45.2 sentence_transformers accelerate peft diff --git 
a/test/srt/run_suite.py b/test/srt/run_suite.py index d617fcf69e62..658b3d2f8158 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -29,6 +29,7 @@ "test_openai_server.py", "test_pytorch_sampling_backend.py", "test_radix_attention.py", + "test_release_memory_occupation.py", "test_retract_decode.py", "test_server_args.py", "test_session_control.py", diff --git a/test/srt/test_release_memory_occupation.py b/test/srt/test_release_memory_occupation.py new file mode 100644 index 000000000000..c84b64e77dfe --- /dev/null +++ b/test/srt/test_release_memory_occupation.py @@ -0,0 +1,98 @@ +import time +import unittest + +import torch +from transformers import AutoModelForCausalLM + +import sglang as sgl +from sglang.test.test_utils import DEFAULT_SMALL_MODEL_NAME_FOR_TEST + +# (temporarily) set to true to observe memory usage in nvidia-smi more clearly +_DEBUG_EXTRA = True + + +class TestReleaseMemoryOccupation(unittest.TestCase): + def test_release_and_resume_occupation(self): + prompt = "Today is a sunny day and I like" + sampling_params = {"temperature": 0, "max_new_tokens": 8} + model_name = DEFAULT_SMALL_MODEL_NAME_FOR_TEST + expect_output = " to spend it outdoors. I decided to" + + engine = sgl.Engine( + model_path=model_name, + random_seed=42, + enable_memory_saver=True, + # disable_cuda_graph=True, # for debugging only + ) + hf_model_new = AutoModelForCausalLM.from_pretrained( + model_name, torch_dtype="bfloat16" + ) + + print("generate (#1)") + outputs = engine.generate(prompt, sampling_params)["text"] + self.assertEqual(outputs, expect_output) + + if _DEBUG_EXTRA: + time.sleep(3) + + self.assertEqual( + _try_allocate_big_tensor(), + False, + "Should not be able to allocate big tensors before releasing", + ) + + print("release_memory_occupation start") + t = time.time() + engine.release_memory_occupation() + if _DEBUG_EXTRA: + print("release_memory_occupation", time.time() - t) + + if _DEBUG_EXTRA: + time.sleep(5) + + self.assertEqual( + _try_allocate_big_tensor(), + True, + "Should be able to allocate big tensors aftre releasing", + ) + + if _DEBUG_EXTRA: + time.sleep(5) + + print("resume_memory_occupation start") + t = time.time() + engine.resume_memory_occupation() + if _DEBUG_EXTRA: + print("resume_memory_occupation", time.time() - t) + + self.assertEqual( + _try_allocate_big_tensor(), + False, + "Should not be able to allocate big tensors after resuming", + ) + + print("update_weights_from_tensor") + # As if: PPO has updated hf model's weights, and now we sync it to SGLang + engine.update_weights_from_tensor(list(hf_model_new.named_parameters())) + + print("generate (#2)") + outputs = engine.generate(prompt, sampling_params)["text"] + self.assertEqual(outputs, expect_output) + + if _DEBUG_EXTRA: + time.sleep(4) + + engine.shutdown() + + +def _try_allocate_big_tensor(size: int = 20_000_000_000): + try: + torch.empty((size,), dtype=torch.uint8, device="cuda") + torch.cuda.empty_cache() + return True + except torch.cuda.OutOfMemoryError: + return False + + +if __name__ == "__main__": + unittest.main() From 46d44318894a13dc6d018892b32dd4a7e09f20f7 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Mon, 13 Jan 2025 14:24:00 -0800 Subject: [PATCH 048/248] Add a new api configure_logging to allow dumping the requests (#2875) --- 3rdparty/amd/profiling/PROFILING.md | 2 +- 3rdparty/amd/profiling/server.sh | 2 +- 3rdparty/amd/tuning/TUNING.md | 2 +- benchmark/blog_v0_2/405b_sglang.sh | 2 +- .../sglang/srt/managers/configure_logging.py | 43 ++++++ 
python/sglang/srt/managers/io_struct.py | 7 + python/sglang/srt/managers/scheduler.py | 2 +- .../sglang/srt/managers/tokenizer_manager.py | 41 +++++- python/sglang/srt/mem_cache/memory_pool.py | 2 +- .../sglang/srt/model_executor/model_runner.py | 2 +- python/sglang/srt/server.py | 126 +++++++++--------- python/sglang/srt/server_args.py | 4 +- .../{ => srt}/torch_memory_saver_adapter.py | 0 13 files changed, 164 insertions(+), 71 deletions(-) create mode 100644 python/sglang/srt/managers/configure_logging.py rename python/sglang/{ => srt}/torch_memory_saver_adapter.py (100%) diff --git a/3rdparty/amd/profiling/PROFILING.md b/3rdparty/amd/profiling/PROFILING.md index 79bc75b503bc..7e15ec844f2b 100644 --- a/3rdparty/amd/profiling/PROFILING.md +++ b/3rdparty/amd/profiling/PROFILING.md @@ -336,7 +336,7 @@ loadTracer.sh python3 -m sglang.launch_server \ --model-path /sgl-workspace/sglang/dummy_grok1 \ --tokenizer-path Xenova/grok-1-tokenizer \ --load-format dummy \ - --quant fp8 \ + --quantization fp8 \ --tp 8 \ --port 30000 \ --disable-radix-cache 2>&1 | tee "$LOGFILE" diff --git a/3rdparty/amd/profiling/server.sh b/3rdparty/amd/profiling/server.sh index aa574f64c940..f877e6c7acd4 100755 --- a/3rdparty/amd/profiling/server.sh +++ b/3rdparty/amd/profiling/server.sh @@ -14,7 +14,7 @@ loadTracer.sh python3 -m sglang.launch_server \ --model-path /sgl-workspace/sglang/dummy_grok1 \ --tokenizer-path Xenova/grok-1-tokenizer \ --load-format dummy \ - --quant fp8 \ + --quantization fp8 \ --tp 8 \ --port 30000 \ --disable-radix-cache 2>&1 | tee "$LOGFILE" diff --git a/3rdparty/amd/tuning/TUNING.md b/3rdparty/amd/tuning/TUNING.md index a38a16d4f7a5..0638041c9743 100644 --- a/3rdparty/amd/tuning/TUNING.md +++ b/3rdparty/amd/tuning/TUNING.md @@ -104,7 +104,7 @@ To maximize moe kernel efficiency, need to use below scripts to find out the bes ```bash #Tuning -#for example, we have one case like this "python3 -m sglang.bench_latency --model dummy_grok1/ --load-format dummy --tokenizer-path Xenova/grok-1-tokenizer --tp 8 --batch-size 32 --input 1024 --output 8 --attention-backend triton --sampling-backend pytorch --quant fp" to run, it defined batch-size 32 input lenth 1024 and output length 8, from "--batch" in moe view point, the prefill batch is 32*1024 = 32768, the decode batch is 32*1(only one output token generated in each run). +#for example, we have one case like this "python3 -m sglang.bench_latency --model dummy_grok1/ --load-format dummy --tokenizer-path Xenova/grok-1-tokenizer --tp 8 --batch-size 32 --input 1024 --output 8 --attention-backend triton --sampling-backend pytorch --quantization fp8" to run, it defined batch-size 32 input lenth 1024 and output length 8, from "--batch" in moe view point, the prefill batch is 32*1024 = 32768, the decode batch is 32*1(only one output token generated in each run). 
#so we can tune decode moe use below command python benchmark_moe_rocm.py --model grok1 --tp-size 8 --dtype float8 --batch "32" # and use this command to tune prefill moe diff --git a/benchmark/blog_v0_2/405b_sglang.sh b/benchmark/blog_v0_2/405b_sglang.sh index 4e3372ae8c70..491853782805 100644 --- a/benchmark/blog_v0_2/405b_sglang.sh +++ b/benchmark/blog_v0_2/405b_sglang.sh @@ -6,7 +6,7 @@ # wget https://huggingface.co/neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8/resolve/main/tokenizer_config.json # Launch sglang -# python -m sglang.launch_server --model-path ~/llama-3.1-405b-fp8-dummy/ --load-format dummy --tp 8 --quant fp8 --disable-radix --mem-frac 0.87 +# python -m sglang.launch_server --model-path ~/llama-3.1-405b-fp8-dummy/ --load-format dummy --tp 8 --quantization fp8 --disable-radix --mem-frac 0.87 # offline python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 3000 --random-input 1024 --random-output 1024 > sglang_log11 diff --git a/python/sglang/srt/managers/configure_logging.py b/python/sglang/srt/managers/configure_logging.py new file mode 100644 index 000000000000..3351cdc400ce --- /dev/null +++ b/python/sglang/srt/managers/configure_logging.py @@ -0,0 +1,43 @@ +""" +Copyright 2023-2025 SGLang Team +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +""" +Configure the logging settings of a server. 
+ +Usage: +python3 -m sglang.srt.managers.configure_logging --url http://localhost:30000 +""" + +import argparse + +import requests + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--url", type=str, default="http://localhost:30000") + parser.add_argument( + "--dump-requests-folder", type=str, default="/tmp/sglang_request_dump" + ) + parser.add_argument("--dump-requests-threshold", type=int, default=1000) + args = parser.parse_args() + + response = requests.post( + args.url + "/configure_logging", + json={ + "dump_requests_folder": args.dump_requests_folder, + "dump_requests_threshold": args.dump_requests_threshold, + }, + ) + assert response.status_code == 200 diff --git a/python/sglang/srt/managers/io_struct.py b/python/sglang/srt/managers/io_struct.py index ec45696bf5fd..075693c7bc90 100644 --- a/python/sglang/srt/managers/io_struct.py +++ b/python/sglang/srt/managers/io_struct.py @@ -488,6 +488,13 @@ class ProfileReq(Enum): STOP_PROFILE = 2 +@dataclass +class ConfigureLoggingReq: + log_requests: Optional[bool] = None + dump_requests_folder: Optional[str] = None + dump_requests_threshold: Optional[int] = None + + @dataclass class OpenSessionReqInput: capacity_of_str_len: int diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index b9e74aa9d93d..187216353171 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -82,6 +82,7 @@ from sglang.srt.model_executor.forward_batch_info import ForwardMode from sglang.srt.server_args import PortArgs, ServerArgs from sglang.srt.speculative.spec_info import SpeculativeAlgorithm +from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter from sglang.srt.utils import ( broadcast_pyobj, configure_logger, @@ -92,7 +93,6 @@ set_random_seed, suppress_other_loggers, ) -from sglang.torch_memory_saver_adapter import TorchMemorySaverAdapter from sglang.utils import get_exception_traceback logger = logging.getLogger(__name__) diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py index 33968e34fe47..acd3b674a455 100644 --- a/python/sglang/srt/managers/tokenizer_manager.py +++ b/python/sglang/srt/managers/tokenizer_manager.py @@ -18,10 +18,12 @@ import dataclasses import logging import os +import pickle import signal import sys import time import uuid +from datetime import datetime from typing import Any, Awaitable, Dict, Generic, List, Optional, Tuple, TypeVar, Union import fastapi @@ -43,6 +45,7 @@ BatchStrOut, BatchTokenIDOut, CloseSessionReqInput, + ConfigureLoggingReq, EmbeddingReqInput, FlushCacheReq, GenerateReqInput, @@ -109,6 +112,7 @@ def __init__( # Parse args self.server_args = server_args self.enable_metrics = server_args.enable_metrics + self.log_requests = server_args.log_requests # Init inter-process communication context = zmq.asyncio.Context(2) @@ -167,6 +171,9 @@ def __init__( # Store states self.to_create_loop = True self.rid_to_state: Dict[str, ReqState] = {} + self.dump_requests_folder = "" # By default do not dump + self.dump_requests_threshold = 1000 + self.dump_request_list: List[Tuple] = [] # The event to notify the weight sync is finished. 
self.model_update_lock = RWLock() @@ -225,7 +232,7 @@ async def generate_request( obj.normalize_batch_and_arguments() - if self.server_args.log_requests: + if self.log_requests: logger.info(f"Receive: obj={dataclass_to_string_truncated(obj)}") async with self.model_update_lock.reader_lock: @@ -346,7 +353,7 @@ async def _wait_one_response( state.out_list = [] if state.finished: - if self.server_args.log_requests: + if self.log_requests: msg = f"Finish: obj={dataclass_to_string_truncated(obj)}, out={dataclass_to_string_truncated(out)}" logger.info(msg) del self.rid_to_state[obj.rid] @@ -597,6 +604,15 @@ async def close_session( assert not self.to_create_loop, "close session should not be the first request" await self.send_to_scheduler.send_pyobj(obj) + def configure_logging(self, obj: ConfigureLoggingReq): + if obj.log_requests is not None: + self.log_requests = obj.log_requests + if obj.dump_requests_folder is not None: + self.dump_requests_folder = obj.dump_requests_folder + if obj.dump_requests_threshold is not None: + self.dump_requests_threshold = obj.dump_requests_threshold + logging.info(f"Config logging: {obj=}") + def create_abort_task(self, obj: GenerateReqInput): # Abort the request if the client is disconnected. async def abort_request(): @@ -708,6 +724,8 @@ async def handle_loop(self): if self.enable_metrics: self.collect_metrics(state, recv_obj, i) + if self.dump_requests_folder and state.finished: + self.dump_requests(state, out_dict) elif isinstance(recv_obj, OpenSessionReqOutput): self.session_futures[recv_obj.session_id].set_result( recv_obj.session_id if recv_obj.success else None @@ -850,6 +868,25 @@ def collect_metrics(self, state: ReqState, recv_obj: BatchStrOut, i: int): (time.time() - state.created_time) / completion_tokens ) + def dump_requests(self, state: ReqState, out_dict: dict): + self.dump_request_list.append( + (state.obj, out_dict, state.created_time, time.time()) + ) + + if len(self.dump_request_list) >= self.dump_requests_threshold: + to_dump = self.dump_request_list + self.dump_request_list = [] + + def background_task(): + os.makedirs(self.dump_requests_folder, exist_ok=True) + current_time = datetime.now() + filename = current_time.strftime("%Y-%m-%d_%H-%M-%S") + ".pkl" + with open(os.path.join(self.dump_requests_folder, filename), "wb") as f: + pickle.dump(to_dump, f) + + # Schedule the task to run in the background without awaiting it + asyncio.create_task(asyncio.to_thread(background_task)) + class SignalHandler: def __init__(self, tokenizer_manager): diff --git a/python/sglang/srt/mem_cache/memory_pool.py b/python/sglang/srt/mem_cache/memory_pool.py index 0761169e40e5..ab27e81b7430 100644 --- a/python/sglang/srt/mem_cache/memory_pool.py +++ b/python/sglang/srt/mem_cache/memory_pool.py @@ -13,7 +13,7 @@ limitations under the License. """ -from sglang.torch_memory_saver_adapter import TorchMemorySaverAdapter +from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter """ Memory pool. 
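For reference, the `dump_requests` helper added to `tokenizer_manager.py` above writes each batch of finished requests as a pickle file named `YYYY-MM-DD_HH-MM-SS.pkl` inside `dump_requests_folder`, holding a list of `(request, output, created_time, finished_time)` tuples. A minimal sketch of reading such a dump back for offline inspection (the folder path is the default from `configure_logging.py`; `sglang` must be importable so the pickled request dataclasses can be loaded — adjust to your own setup):

```python
import glob
import os
import pickle

dump_folder = "/tmp/sglang_request_dump"  # default used by configure_logging.py

for path in sorted(glob.glob(os.path.join(dump_folder, "*.pkl"))):
    with open(path, "rb") as f:
        # Each file holds a list of (obj, out_dict, created_time, finished_time) tuples.
        records = pickle.load(f)
    for obj, out, created, finished in records:
        print(f"{os.path.basename(path)}: rid={getattr(obj, 'rid', None)} "
              f"latency={finished - created:.3f}s")
```
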
diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index 190427649312..238f8603ac95 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -50,6 +50,7 @@ from sglang.srt.model_loader import get_model from sglang.srt.server_args import ServerArgs from sglang.srt.speculative.spec_info import SpeculativeAlgorithm +from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter from sglang.srt.utils import ( enable_show_time_cost, get_available_gpu_memory, @@ -60,7 +61,6 @@ monkey_patch_vllm_p2p_access_check, set_cpu_offload_max_bytes, ) -from sglang.torch_memory_saver_adapter import TorchMemorySaverAdapter logger = logging.getLogger(__name__) diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py index 4e837e5389ba..93fe1304caff 100644 --- a/python/sglang/srt/server.py +++ b/python/sglang/srt/server.py @@ -31,7 +31,7 @@ import torch -from sglang.torch_memory_saver_adapter import TorchMemorySaverAdapter +from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter # Fix a bug of Python threading setattr(threading, "_register_atexit", lambda *args, **kwargs: None) @@ -54,6 +54,7 @@ from sglang.srt.managers.detokenizer_manager import run_detokenizer_process from sglang.srt.managers.io_struct import ( CloseSessionReqInput, + ConfigureLoggingReq, EmbeddingReqInput, GenerateReqInput, GetWeightsByNameReqInput, @@ -161,12 +162,68 @@ async def get_model_info(): @app.get("/get_server_info") async def get_server_info(): return { - **dataclasses.asdict(tokenizer_manager.server_args), # server args + **dataclasses.asdict(tokenizer_manager.server_args), **scheduler_info, "version": __version__, } +# fastapi implicitly converts json in the request to obj (dataclass) +@app.api_route("/generate", methods=["POST", "PUT"]) +@time_func_latency +async def generate_request(obj: GenerateReqInput, request: Request): + """Handle a generate request.""" + if obj.stream: + + async def stream_results() -> AsyncIterator[bytes]: + try: + async for out in tokenizer_manager.generate_request(obj, request): + yield b"data: " + orjson.dumps( + out, option=orjson.OPT_NON_STR_KEYS + ) + b"\n\n" + except ValueError as e: + out = {"error": {"message": str(e)}} + yield b"data: " + orjson.dumps( + out, option=orjson.OPT_NON_STR_KEYS + ) + b"\n\n" + yield b"data: [DONE]\n\n" + + return StreamingResponse( + stream_results(), + media_type="text/event-stream", + background=tokenizer_manager.create_abort_task(obj), + ) + else: + try: + ret = await tokenizer_manager.generate_request(obj, request).__anext__() + return ret + except ValueError as e: + logger.error(f"Error: {e}") + return _create_error_response(e) + + +@app.api_route("/encode", methods=["POST", "PUT"]) +@time_func_latency +async def encode_request(obj: EmbeddingReqInput, request: Request): + """Handle an embedding request.""" + try: + ret = await tokenizer_manager.generate_request(obj, request).__anext__() + return ret + except ValueError as e: + return _create_error_response(e) + + +@app.api_route("/classify", methods=["POST", "PUT"]) +@time_func_latency +async def classify_request(obj: EmbeddingReqInput, request: Request): + """Handle a reward model request. 
Now the arguments and return values are the same as embedding models.""" + try: + ret = await tokenizer_manager.generate_request(obj, request).__anext__() + return ret + except ValueError as e: + return _create_error_response(e) + + @app.post("/flush_cache") async def flush_cache(): """Flush the radix cache.""" @@ -178,8 +235,7 @@ async def flush_cache(): ) -@app.get("/start_profile") -@app.post("/start_profile") +@app.api_route("/start_profile", methods=["GET", "POST"]) async def start_profile_async(): """Start profiling.""" tokenizer_manager.start_profile() @@ -189,8 +245,7 @@ async def start_profile_async(): ) -@app.get("/stop_profile") -@app.post("/stop_profile") +@app.api_route("/stop_profile", methods=["GET", "POST"]) async def stop_profile_async(): """Stop profiling.""" tokenizer_manager.stop_profile() @@ -305,60 +360,11 @@ async def close_session(obj: CloseSessionReqInput, request: Request): return _create_error_response(e) -# fastapi implicitly converts json in the request to obj (dataclass) -@app.api_route("/generate", methods=["POST", "PUT"]) -@time_func_latency -async def generate_request(obj: GenerateReqInput, request: Request): - """Handle a generate request.""" - if obj.stream: - - async def stream_results() -> AsyncIterator[bytes]: - try: - async for out in tokenizer_manager.generate_request(obj, request): - yield b"data: " + orjson.dumps( - out, option=orjson.OPT_NON_STR_KEYS - ) + b"\n\n" - except ValueError as e: - out = {"error": {"message": str(e)}} - yield b"data: " + orjson.dumps( - out, option=orjson.OPT_NON_STR_KEYS - ) + b"\n\n" - yield b"data: [DONE]\n\n" - - return StreamingResponse( - stream_results(), - media_type="text/event-stream", - background=tokenizer_manager.create_abort_task(obj), - ) - else: - try: - ret = await tokenizer_manager.generate_request(obj, request).__anext__() - return ret - except ValueError as e: - logger.error(f"Error: {e}") - return _create_error_response(e) - - -@app.api_route("/encode", methods=["POST", "PUT"]) -@time_func_latency -async def encode_request(obj: EmbeddingReqInput, request: Request): - """Handle an embedding request.""" - try: - ret = await tokenizer_manager.generate_request(obj, request).__anext__() - return ret - except ValueError as e: - return _create_error_response(e) - - -@app.api_route("/classify", methods=["POST", "PUT"]) -@time_func_latency -async def classify_request(obj: EmbeddingReqInput, request: Request): - """Handle a reward model request. 
Now the arguments and return values are the same as embedding models.""" - try: - ret = await tokenizer_manager.generate_request(obj, request).__anext__() - return ret - except ValueError as e: - return _create_error_response(e) +@app.api_route("/configure_logging", methods=["GET", "POST"]) +async def configure_logging(obj: ConfigureLoggingReq, request: Request): + """Close the session""" + tokenizer_manager.configure_logging(obj) + return Response(status_code=200) ##### OpenAI-compatible API endpoints ##### diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index 4f44d5c877dc..57a82c18a331 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -91,7 +91,7 @@ class ServerArgs: # API related api_key: Optional[str] = None - file_storage_pth: str = "SGLang_storage" + file_storage_pth: str = "sglang_storage" enable_cache_report: bool = False # Data parallelism @@ -554,7 +554,7 @@ def add_cli_args(parser: argparse.ArgumentParser): "--decode-log-interval", type=int, default=ServerArgs.decode_log_interval, - help="The log interval of decode batch", + help="The log interval of decode batch.", ) # API related diff --git a/python/sglang/torch_memory_saver_adapter.py b/python/sglang/srt/torch_memory_saver_adapter.py similarity index 100% rename from python/sglang/torch_memory_saver_adapter.py rename to python/sglang/srt/torch_memory_saver_adapter.py From 80002562a8158b5c531f2ab81155da313a2a5cd6 Mon Sep 17 00:00:00 2001 From: Yineng Zhang Date: Tue, 14 Jan 2025 12:48:17 +0800 Subject: [PATCH 049/248] docs: update README (#2878) --- .github/workflows/release-docs.yml | 2 +- benchmark/deepseek_v3/README.md | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/release-docs.yml b/.github/workflows/release-docs.yml index c200f5313e65..84138f7430e0 100644 --- a/.github/workflows/release-docs.yml +++ b/.github/workflows/release-docs.yml @@ -49,7 +49,7 @@ jobs: cd _build/html git clone https://$GITHUB_TOKEN@github.com/sgl-project/sgl-project.github.io.git ../sgl-project.github.io --depth 1 - find ../sgl-project.github.io/ -mindepth 1 -not -path "../sgl-project.github.io/.git*" -not -name CNAME -delete + find ../sgl-project.github.io/ -mindepth 1 -not -path "../sgl-project.github.io/.git*" -not -name CNAME -not -name ".jekyll" -not -name ".nojekyll" -delete cp -r * ../sgl-project.github.io cp ../../README.md ../sgl-project.github.io/README.md cd ../sgl-project.github.io diff --git a/benchmark/deepseek_v3/README.md b/benchmark/deepseek_v3/README.md index e7ad8d33609c..ea972831a368 100644 --- a/benchmark/deepseek_v3/README.md +++ b/benchmark/deepseek_v3/README.md @@ -60,7 +60,9 @@ print(response) ``` ### Example: Serving with two H20*8 nodes -For example, there are two H20 nodes, each with 8 GPUs. The first node's IP is `10.0.0.1`, and the second node's IP is `10.0.0.2`. +For example, there are two H20 nodes, each with 8 GPUs. The first node's IP is `10.0.0.1`, and the second node's IP is `10.0.0.2`. Please **use the first node's IP** for both commands. + +If the command fails, try setting the `GLOO_SOCKET_IFNAME` parameter. For more information, see [Common Environment Variables](https://pytorch.org/docs/stable/distributed.html#common-environment-variables). 
```bash # node 1 From c19d84829c7de194d3965cb0edd414de24c145d8 Mon Sep 17 00:00:00 2001 From: Ke Bao Date: Tue, 14 Jan 2025 13:34:22 +0800 Subject: [PATCH 050/248] Adjust flashinfer workspace size for Qwen2 models (#2879) --- python/sglang/srt/layers/attention/flashinfer_backend.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/sglang/srt/layers/attention/flashinfer_backend.py b/python/sglang/srt/layers/attention/flashinfer_backend.py index f038394628fd..6a4636128103 100644 --- a/python/sglang/srt/layers/attention/flashinfer_backend.py +++ b/python/sglang/srt/layers/attention/flashinfer_backend.py @@ -84,6 +84,10 @@ def __init__(self, model_runner: ModelRunner): self.num_wrappers = 1 self.dispatch_reason = None + # Qwen2 models require higher flashinfer workspace size + if "Qwen2ForCausalLM" in model_runner.model_config.hf_config.architectures: + global_config.flashinfer_workspace_size = 512 * 1024 * 1024 + # Allocate buffers self.workspace_buffer = torch.empty( global_config.flashinfer_workspace_size, From b8cd09f27aaee18f90424f8baf74e936269428a0 Mon Sep 17 00:00:00 2001 From: kk <43161300+kkHuang-amd@users.noreply.github.com> Date: Tue, 14 Jan 2025 16:59:43 +0800 Subject: [PATCH 051/248] update ROCm docker for layernorm kernel optimization (#2885) Co-authored-by: wunhuang --- docker/Dockerfile.rocm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index 2ad62d2d493d..e71cd1694029 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -2,7 +2,7 @@ # docker build --build-arg SGL_BRANCH=v0.4.1.post5 -t v0.4.1.post5-rocm620 -f Dockerfile.rocm . # default base image -ARG BASE_IMAGE="rocmshared/vllm-rocm:20250113-tuned-elementwise" +ARG BASE_IMAGE="rocmshared/vllm-rocm:20250114-tuned-elementwise-layernorm" FROM $BASE_IMAGE AS base USER root From cc0485bef29831f2fcf707ecc1a371be0c7bc816 Mon Sep 17 00:00:00 2001 From: Ke Bao Date: Tue, 14 Jan 2025 17:07:49 +0800 Subject: [PATCH 052/248] Support w8a8 int8 quantization config (#2881) --- python/sglang/srt/configs/model_config.py | 21 +++- .../srt/layers/quantization/__init__.py | 2 + .../srt/layers/quantization/w8a8_int8.py | 117 ++++++++++++++++++ python/sglang/srt/server_args.py | 1 + 4 files changed, 135 insertions(+), 6 deletions(-) create mode 100644 python/sglang/srt/layers/quantization/w8a8_int8.py diff --git a/python/sglang/srt/configs/model_config.py b/python/sglang/srt/configs/model_config.py index 072c88b04a78..d087a2f2348c 100644 --- a/python/sglang/srt/configs/model_config.py +++ b/python/sglang/srt/configs/model_config.py @@ -223,7 +223,11 @@ def _verify_quantization(self) -> None: "compressed_tensors", "compressed-tensors", "experts_int8", + "w8a8_int8", ] + compatible_quantization_methods = { + "w8a8_int8": ["compressed-tensors", "compressed_tensors"] + } if self.quantization is not None: self.quantization = self.quantization.lower() @@ -247,12 +251,17 @@ def _verify_quantization(self) -> None: if self.quantization is None: self.quantization = quant_method elif self.quantization != quant_method: - raise ValueError( - "Quantization method specified in the model config " - f"({quant_method}) does not match the quantization " - f"method specified in the `quantization` argument " - f"({self.quantization})." 
- ) + if ( + self.quantization not in compatible_quantization_methods + or quant_method + not in compatible_quantization_methods[self.quantization] + ): + raise ValueError( + "Quantization method specified in the model config " + f"({quant_method}) does not match the quantization " + f"method specified in the `quantization` argument " + f"({self.quantization})." + ) if self.quantization is not None: if self.quantization not in supported_quantization: diff --git a/python/sglang/srt/layers/quantization/__init__.py b/python/sglang/srt/layers/quantization/__init__.py index 35b0c4d94edb..1a39e800633c 100644 --- a/python/sglang/srt/layers/quantization/__init__.py +++ b/python/sglang/srt/layers/quantization/__init__.py @@ -23,6 +23,7 @@ from sglang.srt.layers.quantization.base_config import QuantizationConfig from sglang.srt.layers.quantization.fp8 import Fp8Config from sglang.srt.layers.quantization.modelopt_quant import ModelOptFp8Config +from sglang.srt.layers.quantization.w8a8_int8 import W8A8Int8Config QUANTIZATION_METHODS: Dict[str, Type[QuantizationConfig]] = { "aqlm": AQLMConfig, @@ -42,6 +43,7 @@ "bitsandbytes": BitsAndBytesConfig, "qqq": QQQConfig, "experts_int8": ExpertsInt8Config, + "w8a8_int8": W8A8Int8Config, } diff --git a/python/sglang/srt/layers/quantization/w8a8_int8.py b/python/sglang/srt/layers/quantization/w8a8_int8.py new file mode 100644 index 000000000000..0c39393b70a9 --- /dev/null +++ b/python/sglang/srt/layers/quantization/w8a8_int8.py @@ -0,0 +1,117 @@ +from typing import Any, Dict, List, Optional + +import torch + +from sglang.srt.utils import is_cuda_available + +is_cuda = is_cuda_available() +if is_cuda: + from sgl_kernel import int8_scaled_mm + +from torch.nn.parameter import Parameter + +from sglang.srt.layers.linear import LinearMethodBase +from sglang.srt.layers.parameter import ChannelQuantScaleParameter, ModelWeightParameter +from sglang.srt.layers.quantization.base_config import ( + QuantizationConfig, + QuantizeMethodBase, +) +from sglang.srt.layers.quantization.int8_kernel import per_token_quant_int8 + + +class W8A8Int8Config(QuantizationConfig): + """Config class for W8A8 Int8 Quantization. 
+ + - Weight: static, per-channel, symmetric + - Activation: dynamic, per-token, symmetric + """ + + def __init__(self): + pass + + @classmethod + def get_supported_act_dtypes(cls) -> List[torch.dtype]: + return [torch.float16, torch.bfloat16] + + @classmethod + def get_min_capability(cls) -> int: + return 75 + + @classmethod + def get_name(self) -> str: + return "w8a8_int8" + + @classmethod + def get_config_filenames(cls) -> List[str]: + return [] + + @classmethod + def from_config(cls, config: Dict[str, Any]) -> "W8A8Int8Config": + return cls() + + def get_quant_method( + self, + layer: torch.nn.Module, + prefix: str, + ) -> Optional["QuantizeMethodBase"]: + from vllm.model_executor.layers.linear import LinearBase + + if isinstance(layer, LinearBase): + return W8A8Int8LinearMethod(self) + return None + + def get_scaled_act_names(self) -> List[str]: + return [] + + +class W8A8Int8LinearMethod(LinearMethodBase): + + def __init__(self, quantization_config: W8A8Int8Config): + self.quantization_config = quantization_config + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + layer.weight = Parameter(layer.weight.t(), requires_grad=False) + layer.weight_scale = Parameter(layer.weight_scale.data, requires_grad=False) + + def create_weights( + self, + layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: List[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs + ): + + weight_loader = extra_weight_attrs.get("weight_loader") + self.logical_widths = output_partition_sizes + + weight = ModelWeightParameter( + data=torch.empty( + sum(output_partition_sizes), input_size_per_partition, dtype=torch.int8 + ), + input_dim=1, + output_dim=0, + weight_loader=weight_loader, + ) + layer.register_parameter("weight", weight) + + weight_scale = ChannelQuantScaleParameter( + data=torch.empty((sum(output_partition_sizes), 1), dtype=torch.float32), + output_dim=0, + weight_loader=weight_loader, + ) + layer.register_parameter("weight_scale", weight_scale) + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None, + ): + x_q, x_scale = per_token_quant_int8(x) + + return int8_scaled_mm( + x_q, layer.weight, x_scale, layer.weight_scale, out_dtype=x.dtype, bias=bias + ) diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index 57a82c18a331..e445217b62fd 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -378,6 +378,7 @@ def add_cli_args(parser: argparse.ArgumentParser): "bitsandbytes", "gguf", "modelopt", + "w8a8_int8", ], help="The quantization method.", ) From f5c6c667940b53d9465f53c657508fc0316a5bad Mon Sep 17 00:00:00 2001 From: Yineng Zhang Date: Tue, 14 Jan 2025 19:23:26 +0800 Subject: [PATCH 053/248] feat: support internlm 3 dense (#2888) --- python/sglang/srt/models/llama.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/python/sglang/srt/models/llama.py b/python/sglang/srt/models/llama.py index d606e52f8b8d..4f09fd185b83 100644 --- a/python/sglang/srt/models/llama.py +++ b/python/sglang/srt/models/llama.py @@ -570,4 +570,8 @@ class Phi3ForCausalLM(LlamaForCausalLM): pass -EntryClass = [LlamaForCausalLM, Phi3ForCausalLM] +class InternLM3ForCausalLM(LlamaForCausalLM): + pass + + +EntryClass = [LlamaForCausalLM, Phi3ForCausalLM, InternLM3ForCausalLM] From f005758f2bcf367739a5a71a90b91d18b56aa4cd Mon Sep 17 00:00:00 2001 From: Xiaoyu Zhang <35585791+BBuf@users.noreply.github.com> 
Date: Tue, 14 Jan 2025 19:48:59 +0800 Subject: [PATCH 054/248] introduce CUB in sgl-kernel (#2887) --- .gitmodules | 3 +++ sgl-kernel/3rdparty/cub | 1 + sgl-kernel/CMakeLists.txt | 2 ++ 3 files changed, 6 insertions(+) create mode 160000 sgl-kernel/3rdparty/cub diff --git a/.gitmodules b/.gitmodules index 3a14f6297a3a..c588176e7c07 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,6 @@ [submodule "sgl-kernel/3rdparty/cutlass"] path = sgl-kernel/3rdparty/cutlass url = https://github.com/NVIDIA/cutlass.git +[submodule "sgl-kernel/3rdparty/cub"] + path = sgl-kernel/3rdparty/cub + url = https://github.com/NVIDIA/cub.git diff --git a/sgl-kernel/3rdparty/cub b/sgl-kernel/3rdparty/cub new file mode 160000 index 000000000000..0fc3c3701632 --- /dev/null +++ b/sgl-kernel/3rdparty/cub @@ -0,0 +1 @@ +Subproject commit 0fc3c3701632a4be906765b73be20a9ad0da603d diff --git a/sgl-kernel/CMakeLists.txt b/sgl-kernel/CMakeLists.txt index 15818d289eae..623984f2f3e7 100644 --- a/sgl-kernel/CMakeLists.txt +++ b/sgl-kernel/CMakeLists.txt @@ -9,6 +9,7 @@ set(CMAKE_CUDA_STANDARD 17) set(CMAKE_CUDA_STANDARD_REQUIRED ON) set(CUTLASS_DIR "3rdparty/cutlass") +set(CUB_DIR "3rdparty/cub") # Set CUDA architectures set(CMAKE_CUDA_ARCHITECTURES "75;80;86;89;90") @@ -43,6 +44,7 @@ target_include_directories(_kernels ${TORCH_INCLUDE_DIRS} ${CUTLASS_DIR}/include ${CUTLASS_DIR}/tools/util/include + ${CUB_DIR}/cub ) target_link_libraries(_kernels From 955a2fbf4e2b0140c6954a6344bf129fc07a7d27 Mon Sep 17 00:00:00 2001 From: yych0745 <1398089567@qq.com> Date: Tue, 7 Jan 2025 17:24:45 +0800 Subject: [PATCH 055/248] Add performance and accuracy test code for FP8 GEMM operations --- sgl-kernel/benchmark/bench_fp8_gemm.py | 71 ++++++++++++++++++++++++++ sgl-kernel/tests/test_fp8_gemm.py | 59 +++++++++++++++++++++ 2 files changed, 130 insertions(+) create mode 100644 sgl-kernel/benchmark/bench_fp8_gemm.py create mode 100644 sgl-kernel/tests/test_fp8_gemm.py diff --git a/sgl-kernel/benchmark/bench_fp8_gemm.py b/sgl-kernel/benchmark/bench_fp8_gemm.py new file mode 100644 index 000000000000..d4bc2fdb91a3 --- /dev/null +++ b/sgl-kernel/benchmark/bench_fp8_gemm.py @@ -0,0 +1,71 @@ +import torch +import torch.nn.functional as F +import triton + +from vllm._custom_ops import cutlass_scaled_mm as vllm_scaled_mm +from vllm._custom_ops import scaled_fp8_quant as vllm_scaled_fp8_quant + + +def to_int8(tensor: torch.Tensor) -> torch.Tensor: + return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8) + + +@triton.testing.perf_report( + triton.testing.Benchmark( + x_names=["batch_size"], + x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048], + x_log=False, + line_arg="provider", + line_vals=["vllm-fp8", "torch-fp8"], + line_names=["vllm-fp8", "torch-fp8"], + styles=[("green", "-"), ("blue", "-")], + ylabel="GB/s", + plot_name="int8 scaled matmul", + args={}, + ) +) +def benchmark(batch_size, provider): + M, N, K = batch_size, 8192, 21760 + a = torch.ones((M, K), device="cuda") * 5.0 + b = torch.ones((N, K), device="cuda") * 5.0 + scale_a = torch.randn((M,), device="cuda", dtype=torch.float32) + scale_b = torch.randn((N,), device="cuda", dtype=torch.float32) + a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a) + b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b) + b_fp8 = b_fp8.t() + quantiles = [0.5, 0.2, 0.8] + + if provider == "vllm-fp8": + ms, min_ms, max_ms = triton.testing.do_bench( + lambda: vllm_scaled_mm( + a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, torch.bfloat16 + ), + quantiles=quantiles, + ) + if provider == "torch-fp8": 
+ scale_a_2d = scale_a_fp8.float().unsqueeze(1) # [M, 1] + scale_b_2d = scale_b_fp8.float().unsqueeze(0) # [1, N] + try: + out = torch.empty( + (a_fp8.shape[0], b_fp8.shape[0]), device="cuda", dtype=torch.bfloat16 + ) + ms, min_ms, max_ms = triton.testing.do_bench( + lambda: torch._scaled_mm( + a_fp8, + b_fp8, + out=out, + out_dtype=torch.bfloat16, + scale_a=scale_a_2d, + scale_b=scale_b_2d, + use_fast_accum=True, + ), + quantiles=quantiles, + ) + except RuntimeError as e: + print("Error details:", e) + raise + gbps = lambda ms: (2 * M * N * K + M * N) * a.element_size() * 1e-9 / (ms * 1e-3) + return gbps(ms), gbps(max_ms), gbps(min_ms) + + +benchmark.run(print_data=True, show_plots=True, save_path="bench_int8_res") \ No newline at end of file diff --git a/sgl-kernel/tests/test_fp8_gemm.py b/sgl-kernel/tests/test_fp8_gemm.py new file mode 100644 index 000000000000..a233b3b435ab --- /dev/null +++ b/sgl-kernel/tests/test_fp8_gemm.py @@ -0,0 +1,59 @@ +import unittest + +import torch +from vllm._custom_ops import cutlass_scaled_mm as vllm_scaled_mm +from vllm._custom_ops import scaled_fp8_quant as vllm_scaled_fp8_quant + + +def torch_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias): + o = torch.matmul(a.to(torch.float32), b.to(torch.float32)) + + o = o.to(torch.float32) + temp1 = o * scale_a.view(-1, 1) + temp2 = temp1 * scale_b.view(1, -1) + final = temp2.to(out_dtype) + + return final + + +class TestInt8Gemm(unittest.TestCase): + def _test_accuracy_once(self, M, N, K, with_bias, out_dtype, device): + a = torch.randn((M, K), device=device) * 5 + b = torch.randn((N, K), device=device) * 5 + + scale_a = torch.randn((M,), device="cuda", dtype=torch.float32) + scale_b = torch.randn((N,), device="cuda", dtype=torch.float32) + if with_bias: + bias = torch.ones((N,), device="cuda", dtype=out_dtype) * 10 + else: + bias = None + o1 = torch.empty((a.shape[0], b.shape[1]), device="cuda", dtype=torch.bfloat16) + b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b) + b_fp8 = b_fp8.t() + a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a) + o = torch_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, out_dtype, bias) + o1 = vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, out_dtype, bias) + max_val = max(o.abs().max().item(), o1.abs().max().item()) + rtol = 2e-2 + atol = max_val * rtol + torch.testing.assert_close(o, o1, rtol=rtol, atol=atol) + print(f"M={M}, N={N}, K={K}, with_bias={with_bias}, out_dtype={out_dtype}: OK") + + def test_accuracy(self): + Ms = [1, 128, 512, 1024, 4096] + Ns = [16, 128, 512, 1024, 4096] + Ks = [512, 1024, 4096, 8192, 16384] + bias_opts = [True, False] + out_dtypes = [torch.bfloat16] + for M in Ms: + for N in Ns: + for K in Ks: + for with_bias in bias_opts: + for out_dtype in out_dtypes: + self._test_accuracy_once( + M, N, K, with_bias, out_dtype, "cuda" + ) + + +if __name__ == "__main__": + unittest.main() From 30bdf20c81cdddf9eab4a9daba47742ab1e7fe17 Mon Sep 17 00:00:00 2001 From: HandH1998 <1335248067@qq.com> Date: Wed, 8 Jan 2025 19:25:23 +0800 Subject: [PATCH 056/248] support w8a8 fp8 --- sgl-kernel/CMakeLists.txt | 1 + sgl-kernel/setup.py | 1 + sgl-kernel/src/sgl-kernel/__init__.py | 2 + .../src/sgl-kernel/csrc/fp8_gemm_kernel.cu | 571 ++++++++++++++++++ .../src/sgl-kernel/csrc/sgl_kernel_ops.cu | 6 + sgl-kernel/src/sgl-kernel/csrc/utils.hpp | 5 + sgl-kernel/src/sgl-kernel/ops/__init__.py | 11 + sgl-kernel/tests/test_fp8_gemm.py | 28 +- 8 files changed, 615 insertions(+), 10 deletions(-) create mode 100644 sgl-kernel/src/sgl-kernel/csrc/fp8_gemm_kernel.cu 
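As context for the FP8 GEMM support added below: it appears to follow the same dequantize-and-rescale contract as the `torch_scaled_mm` reference in `test_fp8_gemm.py` above — A and B are quantized to FP8 (e4m3) with per-row/per-column scales, multiplied, and rescaled back. A self-contained sketch of that reference math in plain PyTorch (illustrative only; assumes a recent PyTorch with `torch.float8_e4m3fn`, a CUDA device, and made-up shapes):

```python
import torch

M, N, K = 4, 8, 16
a = torch.randn(M, K, device="cuda")
b = torch.randn(N, K, device="cuda")

# Per-row scales for A and B (B is stored as [N, K] and used transposed).
# 448.0 is the largest finite value representable in float8_e4m3fn.
scale_a = a.abs().amax(dim=1, keepdim=True) / 448.0
scale_b = b.abs().amax(dim=1, keepdim=True) / 448.0

a_fp8 = (a / scale_a).to(torch.float8_e4m3fn)
b_fp8 = (b / scale_b).to(torch.float8_e4m3fn)

# Dequantize-and-matmul reference: C = (A_q * s_a) @ (B_q * s_b)^T, cast to bf16.
ref = ((a_fp8.float() * scale_a) @ (b_fp8.float() * scale_b).t()).to(torch.bfloat16)
print(ref.shape)  # torch.Size([4, 8])
```
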
diff --git a/sgl-kernel/CMakeLists.txt b/sgl-kernel/CMakeLists.txt index 3c267a4de504..c2bfd356c3db 100644 --- a/sgl-kernel/CMakeLists.txt +++ b/sgl-kernel/CMakeLists.txt @@ -32,6 +32,7 @@ add_library(_kernels SHARED src/sgl-kernel/csrc/trt_reduce_kernel.cu src/sgl-kernel/csrc/moe_align_kernel.cu src/sgl-kernel/csrc/int8_gemm_kernel.cu + src/sgl-kernel/csrc/fp8_gemm_kernel.cu src/sgl-kernel/csrc/sgl_kernel_ops.cu ) diff --git a/sgl-kernel/setup.py b/sgl-kernel/setup.py index c93e87f6bad3..3a60f6ba0a6b 100644 --- a/sgl-kernel/setup.py +++ b/sgl-kernel/setup.py @@ -50,6 +50,7 @@ def update_wheel_platform_tag(): "src/sgl-kernel/csrc/trt_reduce_kernel.cu", "src/sgl-kernel/csrc/moe_align_kernel.cu", "src/sgl-kernel/csrc/int8_gemm_kernel.cu", + "src/sgl-kernel/csrc/fp8_gemm_kernel.cu", "src/sgl-kernel/csrc/sgl_kernel_ops.cu", ], include_dirs=include_dirs, diff --git a/sgl-kernel/src/sgl-kernel/__init__.py b/sgl-kernel/src/sgl-kernel/__init__.py index 892808f1ee15..2a4a2bd51771 100644 --- a/sgl-kernel/src/sgl-kernel/__init__.py +++ b/sgl-kernel/src/sgl-kernel/__init__.py @@ -3,6 +3,7 @@ custom_reduce, init_custom_reduce, int8_scaled_mm, + fp8_scaled_mm, moe_align_block_size, ) @@ -12,4 +13,5 @@ "custom_dispose", "custom_reduce", "int8_scaled_mm", + "fp8_scaled_mm", ] diff --git a/sgl-kernel/src/sgl-kernel/csrc/fp8_gemm_kernel.cu b/sgl-kernel/src/sgl-kernel/csrc/fp8_gemm_kernel.cu new file mode 100644 index 000000000000..795328930634 --- /dev/null +++ b/sgl-kernel/src/sgl-kernel/csrc/fp8_gemm_kernel.cu @@ -0,0 +1,571 @@ +/* + * Copyright (c) 2022-2024, Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + * + * Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#ifdef __GNUC__ // Check if the compiler is GCC or Clang +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#endif // __GNUC__ + +#include +#include + +#include "cute/tensor.hpp" +#include "cutlass/conv/convolution.h" +// Order matters here, packed_stride.hpp is missing cute and convolution includes +#include "cutlass/util/packed_stride.hpp" + +#ifdef __GNUC__ // Check if the compiler is GCC or Clang +#pragma GCC diagnostic pop +#endif // __GNUC__ + +// #include "fp8_rowwise_gemm_kernel_template_sm89.h" +// #include "fp8_rowwise_gemm_kernel_template_sm90.h" + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/kernel/default_gemm_universal_with_visitor.h" +#include "cutlass/epilogue/threadblock/fusion/visitors.hpp" + +#include "cutlass/epilogue/collective/default_epilogue.hpp" +#include "cutlass/epilogue/thread/linear_combination.h" +#include "cutlass/gemm/collective/collective_builder.hpp" +#include "cutlass/gemm/dispatch_policy.hpp" + +#include "cutlass/epilogue/thread/activation.h" +#include "cutlass/gemm/kernel/gemm_universal.hpp" + +#include "cutlass/epilogue/collective/collective_builder.hpp" +#include "cutlass/gemm/device/gemm_universal_adapter.h" + + +#include "utils.hpp" +using namespace cute; + +template +struct DeviceGemmFp8RowwiseSm90 +{ + static_assert(std::is_same_v, "ElementType must be FP8(e4m3)"); + + // A matrix configuration + using ElementA = ElementType; // Element type for A matrix operand + using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand + static constexpr int AlignmentA + = 128 / cutlass::sizeof_bits::value; // Memory access granularity/alignment of A + // matrix in units of elements (up to 16 bytes) + + // B matrix configuration + using ElementB = ElementType; // Element type for B matrix operand + using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand + static constexpr int AlignmentB + = 128 / cutlass::sizeof_bits::value; // Memory access granularity/alignment of B + // matrix in units of elements (up to 16 bytes) + + // C/D matrix configuration + using ElementC = void; // Element type for C matrix operands + using LayoutC = cutlass::layout::RowMajor; // Layout type for C matrix operands + static constexpr int AlignmentC + = 128 / cutlass::sizeof_bits::value; // Memory access granularity/alignment of C matrices in + // units of elements (up to 16 bytes) + + // Output matrix configuration + using ElementOutput = OutElementType; // Element type for output matrix operands + using LayoutOutput = cutlass::layout::RowMajor; // Layout type for output matrix operands + static constexpr int AlignmentOutput = 128 / cutlass::sizeof_bits::value; + + // Auxiliary matrix configuration and other fusion types + using ElementBias = float; + + // Multiply-accumulate blocking/pipelining details + using ElementAccumulator = AccumElementType; // Element type for internal accumulation + using ElementCompute = float; // Element type for compute + using ElementComputeEpilogue = float; + using ArchTag = cutlass::arch::Sm90; // Tag indicating the minimum SM that supports the intended feature + using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag + using TileShape = CTAShape; // Threadblock-level tile size + using TileScheduler = TileSchedulerType; + + static constexpr bool PONG = false; + static constexpr bool FAST_ACCUM = true; + static constexpr bool 
USE_BIAS = false; + + using StageCountType = cutlass::gemm::collective::StageCountAuto; // Stage count maximized + // based on the tile size + using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto; // Kernel to launch based on the default + // setting in the Collective Builder + // Implement rowwise scaling epilogue. + using XScale = cutlass::epilogue::fusion::Sm90ColBroadcast<0, TileShape, ElementComputeEpilogue, ElementComputeEpilogue, + cute::Stride, cute::Int<0>, cute::Int<0>>>; + + using WScale = cutlass::epilogue::fusion::Sm90RowBroadcast<0, TileShape, ElementComputeEpilogue, ElementComputeEpilogue, + cute::Stride, cute::Int<1>, cute::Int<0>>>; + + using Bias = cutlass::epilogue::fusion::Sm90RowBroadcast<0, TileShape, ElementBias, ElementBias, + cute::Stride, cute::Int<1>, cute::Int<0>>>; + + using Accum = cutlass::epilogue::fusion::Sm90AccFetch; + + using Compute0 = cutlass::epilogue::fusion::Sm90Compute; + + using EVTCompute0 = cutlass::epilogue::fusion::Sm90EVT; + + using Compute1 = cutlass::epilogue::fusion::Sm90Compute; + + using EVTCompute1 = cutlass::epilogue::fusion::Sm90EVT; + + using ComputeBias = cutlass::epilogue::fusion::Sm90Compute; + + using EVTComputeBias = cutlass::epilogue::fusion::Sm90EVT; + + using EpilogueEVT = EVTCompute1; + + using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder::CollectiveOp; + + using DefaultSchedule = cutlass::gemm::KernelTmaWarpSpecialized; + using PongSchedule = cutlass::gemm::KernelTmaWarpSpecializedPingpong; + using FastDefaultSchedule = cutlass::gemm::KernelTmaWarpSpecializedFP8FastAccum; + using FastPongSchedule = cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum; + + using SlowAccum = DefaultSchedule; + using FastAccum = FastDefaultSchedule; + using MainLoopSchedule = cute::conditional_t; + + using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder( + sizeof(typename CollectiveEpilogue::SharedStorage))>, + MainLoopSchedule>::CollectiveOp; + + using GemmKernel = cutlass::gemm::kernel::GemmUniversal, // Indicates ProblemShape + CollectiveMainloop, CollectiveEpilogue, TileScheduler>; + + using Gemm = cutlass::gemm::device::GemmUniversalAdapter; +}; + +template +// template +struct DeviceGemmFp8RowwiseSm89 +{ + static_assert(std::is_same_v, "ElementType must be FP8(e4m3)"); + + using ElementA = ElementType; + using LayoutA = cutlass::layout::RowMajor; + static constexpr int AlignmentA = 128 / cutlass::sizeof_bits::value; + + using ElementB = ElementType; + using LayoutB = cutlass::layout::ColumnMajor; + static constexpr int AlignmentB = 128 / cutlass::sizeof_bits::value; + + using ElementC = OutElementType; + using LayoutC = cutlass::layout::RowMajor; + static constexpr int AlignmentC = 128 / cutlass::sizeof_bits::value; + + using ElementOutput = OutElementType; + using LayoutOutput = cutlass::layout::RowMajor; + static constexpr int AlignmentOutput = 128 / cutlass::sizeof_bits::value; + + using ElementAccumulator = AccumElementType; + using ElementComputeEpilogue = float; + using ArchTag = cutlass::arch::Sm89; + using OperatorClass = cutlass::arch::OpClassTensorOp; + + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>; + // Number of epilogue stages in EVT + static constexpr int EVTEpilogueStages = 1; + + using OutputTileThreadMap = cutlass::epilogue::threadblock::OutputTileThreadLayout; + + // Definition of EVT + using accSrc = cutlass::epilogue::threadblock::VisitorAccFetch; + + using ComputeBScale = 
cutlass::epilogue::threadblock::VisitorCompute; + using bScaleSrc = cutlass::epilogue::threadblock::VisitorRowBroadcast>; + using EpilogueBScale = cutlass::epilogue::threadblock::Sm80EVT; + + using ComputeAScale = cutlass::epilogue::threadblock::VisitorCompute; + using aScaleSrc = cutlass::epilogue::threadblock::VisitorColBroadcast>; + using EpilogueAScale = cutlass::epilogue::threadblock::Sm80EVT; + + // // With bias + // using biasSrc = cutlass::epilogue::threadblock::VisitorRowBroadcast>; + // using ComputeAScaleWithBias = cutlass::epilogue::threadblock::VisitorCompute; + // using EpilogueAScaleWithBias = cutlass::epilogue::threadblock::Sm80EVT; + + + using dTar = cutlass::epilogue::threadblock::VisitorAuxStore>; + using EpilogueStore = cutlass::epilogue::threadblock::Sm80EVT; + // using EpilogueStore = cutlass::platform::conditional, + // cutlass::epilogue::threadblock::Sm80EVT>::type; + + + using EpilogueOp = EpilogueStore; + + using GemmKernel = typename cutlass::gemm::kernel::DefaultGemmWithVisitor::GemmKernel; + + using Gemm = cutlass::gemm::device::GemmUniversalAdapter; +}; + + +template +typename Gemm::Arguments prepare_sm89_fp8_args(torch::Tensor& out, const torch::Tensor& a, const torch::Tensor& b, const torch::Tensor& scales_a, + const torch::Tensor& scales_b, + const c10::optional& bias) +{ + using ElementT = typename Gemm::ElementA; + using ElementOutput = typename Gemm::ElementD; + using ElementComputeEpilogue = float; + + // int const lda = k; + // int const ldb = k; + // int const ldc = n; + int32_t m = a.size(0); + int32_t n = b.size(1); + int32_t k = a.size(1); + + int64_t lda = a.stride(0); + int64_t ldb = b.stride(1); + int64_t ldc = out.stride(0); + + ElementT const* ptr_a = reinterpret_cast(a.data_ptr()); + ElementT const* ptr_b = reinterpret_cast(b.data_ptr()); + ElementOutput* ptr_d = reinterpret_cast(out.data_ptr()); + ElementComputeEpilogue const* ptr_scales_a = reinterpret_cast(scales_a.data_ptr()); + ElementComputeEpilogue const* ptr_scales_b = reinterpret_cast(scales_b.data_ptr()); + + typename Gemm::Arguments args(cutlass::gemm::GemmUniversalMode::kGemm, // Mode + {m, n, k}, // Problem size + 1, // Split-k factor + {}, // Epilogue args + ptr_a, // a pointer + ptr_b, // b pointer + nullptr, // c pointer (unused) + nullptr, // d pointer (unused) + m * k, // batch stride a (unused) + n * k, // batch stride b (unused) + m * n, // batch stride c (unused) + m * n, // batch stride d (unused) + lda, // stride a + ldb, // stride b + ldc, // stride c (unused) + ldc); // stride d (unused) + + args.epilogue = { + { + { + {}, // Accumulator + {ptr_scales_b, ElementComputeEpilogue(0), + {_0{}, _1{}, _0{}}}, + {} // Multiplies + }, + {ptr_scales_a, ElementComputeEpilogue(0), {_1{}, _0{}, _0{}}}, + {} // Multiplies + }, + {ptr_d, {n, _1{}, _0{}}}}; + return args; +} + +template +void launch_sm89_fp8_scaled_mm(torch::Tensor& out, const torch::Tensor& a, const torch::Tensor& b, const torch::Tensor& scales_a, + const torch::Tensor& scales_b, + const c10::optional& bias) +{ + using ElementInput = cutlass::float_e4m3_t; + using ElementOutput = OutType; + using AccumElementType = float; + + using Gemm = typename DeviceGemmFp8RowwiseSm89::Gemm; + + auto args = prepare_sm89_fp8_args(out, a, b, scales_a, scales_b, bias); + Gemm gemm_op; + // CUTLASS_CHECK(gemm_op.can_implement(args)); + + size_t workspace_size = gemm_op.get_workspace_size(args); + auto const workspace_options = + torch::TensorOptions().dtype(torch::kUInt8).device(a.device()); + auto workspace = 
torch::empty(workspace_size, workspace_options); + + auto stream = at::cuda::getCurrentCUDAStream(a.get_device()); + + auto can_implement = gemm_op.can_implement(args); + TORCH_CHECK(can_implement == cutlass::Status::kSuccess) + + // auto status = gemm_op.run(args, workspace.data_ptr(), stream); + auto status = gemm_op(args, workspace.data_ptr(), stream); + TORCH_CHECK(status == cutlass::Status::kSuccess) + // return typedFp8RowwiseGemmKernelLauncher( + // Gemm{}, args, D, A, B, C_bias, workspace, workspaceBytes, stream, occupancy); +} + + +template +void s89_dispatch_shape(torch::Tensor& out, const torch::Tensor& a, const torch::Tensor& b, const torch::Tensor& scales_a, + const torch::Tensor& scales_b, + const c10::optional& bias) { + uint32_t const m = a.size(0); + uint32_t const mp2 = + std::max(static_cast(32), next_pow_2(m)); // next power of 2 + + uint32_t const n = out.size(1); + uint32_t const np2 = next_pow_2(n); + + if (mp2 <= 16) { + // M in [1, 16] + if (np2 <= 8192) { + return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<16, 64, 64>, 5>(out, a, b, scales_a, scales_b, bias); + } else if (np2 <= 24576) { + return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<16, 64, 64>, 5>(out, a, b, scales_a, scales_b, bias); + } else { + return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<16, 64, 64>, 5>(out, a, b, scales_a, scales_b, bias); + } + } else if (mp2 <= 32) { + // M in (16, 32] + if (np2 <= 8192) { + return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<16, 64, 64>, 5>(out, a, b, scales_a, scales_b, bias); + } else if (np2 <= 16384) { + return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<32, 64, 64>, 4>(out, a, b, scales_a, scales_b, bias); + } else { + return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<16, 64, 64>, 5>(out, a, b, scales_a, scales_b, bias); + } + } else if (mp2 <= 64) { + // M in (32, 64] + if (np2 <= 8192) { + return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<32, 64, 64>, 5>(out, a, b, scales_a, scales_b, bias); + } else if (np2 <= 16384) { + return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<64, 64, 64>, 3>(out, a, b, scales_a, scales_b, bias); + } else { + return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<32, 64, 64>, 5>(out, a, b, scales_a, scales_b, bias); + } + } else if (mp2 <= 128) { + // M in (64, 128] + if (np2 <= 8192) { + return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<64, 64, 64>, 3>(out, a, b, scales_a, scales_b, bias); + } else if (np2 <= 16384) { + return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<64, 64, 64>, 5>(out, a, b, scales_a, scales_b, bias); + } else { + return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<64, 64, 64>, 3>(out, a, b, scales_a, scales_b, bias); + } + } else if (mp2 <= 256) { + // M in (128, 256] + if (np2 <= 4096) { + return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<64, 64, 64>, 3>(out, a, b, scales_a, scales_b, bias); + } else { + return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<64, 64, 64>, 5>(out, a, b, scales_a, scales_b, bias); + } + } else { + // M in (256, inf) + if (np2 <= 4096) { + return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<64, 64, 64>, 5>(out, a, b, scales_a, scales_b, bias); + } else if (np2 <= 8192) { + return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<64, 64, 64>, 3>(out, a, b, scales_a, scales_b, bias); + } else { + return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<64, 64, 64>, 5>(out, a, b, scales_a, scales_b, bias); + } + } +} + +template +typename Gemm::Arguments 
prepare_sm90_fp8_args(torch::Tensor& out, const torch::Tensor& a, const torch::Tensor& b, const torch::Tensor& scales_a, + const torch::Tensor& scales_b, + const c10::optional& bias) +{ + using ElementT = typename Gemm::ElementA; + using ElementOutput = typename Gemm::ElementD; + using ElementComputeEpilogue = float; + using StrideA = typename Gemm::GemmKernel::StrideA; + using StrideB = typename Gemm::GemmKernel::StrideB; + using StrideC = typename Gemm::GemmKernel::StrideC; + using StrideD = typename Gemm::GemmKernel::StrideD; + + int32_t m = a.size(0); + int32_t n = b.size(1); + int32_t k = a.size(1); + ElementT const* ptr_a = reinterpret_cast(a.data_ptr()); + ElementT const* ptr_b = reinterpret_cast(b.data_ptr()); + ElementOutput* ptr_d = reinterpret_cast(out.data_ptr()); + ElementComputeEpilogue const* ptr_scales_a = reinterpret_cast(scales_a.data_ptr()); + ElementComputeEpilogue const* ptr_scales_b = reinterpret_cast(scales_b.data_ptr()); + + // TODO: confirm correctess + StrideA stride_a = cutlass::make_cute_packed_stride(StrideA{}, make_shape(m, k, 1)); + StrideB stride_b = cutlass::make_cute_packed_stride(StrideB{}, make_shape(n, k, 1)); + StrideC stride_c; + StrideD stride_d = cutlass::make_cute_packed_stride(StrideD{}, make_shape(m, n, 1)); + typename Gemm::Arguments args + = {cutlass::gemm::GemmUniversalMode::kGemm, {m, n, k, 1}, {ptr_a, stride_a, ptr_b, stride_b}, + {{}, // epilogue.thread + nullptr, stride_c, ptr_d, stride_d}}; + args.epilogue.thread = { + {ptr_scales_a}, + { + {ptr_scales_b}, {}, // Accumulator + {} // Multiplies + }, + {}, // Multiplies + }; + return args; +} + +template +void launch_sm90_fp8_scaled_mm(torch::Tensor& out, const torch::Tensor& a, const torch::Tensor& b, const torch::Tensor& scales_a, + const torch::Tensor& scales_b, + const c10::optional& bias) +{ + using ElementInput = cutlass::float_e4m3_t; + using ElementOutput = OutType; + using AccumElementType = float; + using MainloopScheduleType = cutlass::gemm::KernelTmaWarpSpecializedFP8FastAccum; + using EpilogueScheduleType = cutlass::epilogue::TmaWarpSpecialized; + using TileSchedulerType = void; + using Gemm = typename DeviceGemmFp8RowwiseSm90::Gemm; + auto args = prepare_sm90_fp8_args(out, a, b, scales_a, scales_b, bias); + + // Launch the CUTLASS GEMM kernel. 
+ Gemm gemm_op; + // CUTLASS_CHECK(gemm_op.can_implement(args)); + + size_t workspace_size = gemm_op.get_workspace_size(args); + auto const workspace_options = + torch::TensorOptions().dtype(torch::kUInt8).device(a.device()); + auto workspace = torch::empty(workspace_size, workspace_options); + + auto stream = at::cuda::getCurrentCUDAStream(a.get_device()); + + auto can_implement = gemm_op.can_implement(args); + TORCH_CHECK(can_implement == cutlass::Status::kSuccess) + + auto status = gemm_op.run(args, workspace.data_ptr(), stream); + TORCH_CHECK(status == cutlass::Status::kSuccess) + // cutlass::Status status = gemm_op.run(args, workspace.data_ptr(), stream); + // CUTLASS_CHECK(status); +// return typedFp8RowwiseGemmKernelLauncher( +// Gemm{}, args, D, A, B, C_bias, workspace, workspaceBytes, stream, occupancy); +// #else // COMPILE_HOPPER_TMA_GEMMS +// throw std::runtime_error( +// "[TensorRT-LLm Error][Fp8RowwiseGemmKernelLauncherSm90] Please recompile with support for hopper by passing " +// "90-real as an arch to build_wheel.py."); +// #endif // COMPILE_HOPPER_TMA_GEMMS +} + +template +void s90_dispatch_shape(torch::Tensor& out, const torch::Tensor& a, const torch::Tensor& b, const torch::Tensor& scales_a, + const torch::Tensor& scales_b, + const c10::optional& bias) { + uint32_t const m = a.size(0); + uint32_t const mp2 = + std::max(static_cast(64), next_pow_2(m)); // next power of 2 + + if (mp2 <= 64) { + // m in [1, 64] + return launch_sm90_fp8_scaled_mm, Shape<_1, _8, _1>>(out, a, b, scales_a, scales_b, bias); + } else if (mp2 <= 128) { + // m in (64, 128] + return launch_sm90_fp8_scaled_mm, Shape<_2, _1, _1>>(out, a, b, scales_a, scales_b, bias); + } else { + // m in (128, inf) + return launch_sm90_fp8_scaled_mm, Shape<_2, _1, _1>>(out, a, b, scales_a, scales_b, bias); + } +} + +torch::Tensor fp8_scaled_mm(const torch::Tensor& mat_a, const torch::Tensor& mat_b, const torch::Tensor& scales_a, + const torch::Tensor& scales_b, const torch::Dtype& out_dtype, + const c10::optional& bias) { + TORCH_CHECK(mat_a.is_cuda(), "mat_a must be a CUDA tensor"); + TORCH_CHECK(mat_b.is_cuda(), "mat_b must be a CUDA tensor"); + TORCH_CHECK(mat_a.dim() == 2, "mat_a must be a 2D tensor"); + TORCH_CHECK(mat_b.dim() == 2, "mat_b must be a 2D tensor"); + TORCH_CHECK(mat_a.stride(1) == 1, "mat_a must be a row major tensor"); + TORCH_CHECK(mat_b.stride(0) == 1, "mat_a must be a column major tensor"); + TORCH_CHECK(mat_a.size(1) == mat_b.size(0), "mat_a and mat_b shapes cannot be multiplied"); + + TORCH_CHECK(mat_a.size(1) % 16 == 0, "mat_a.size(1) must be multiple of 16 for memory alignment"); +// TORCH_CHECK(mat_b.size(0) % 16 == 0, "mat_b.size(0) must be multiple of 16 for memory alignment"); +//TODO: % 8 + TORCH_CHECK(mat_b.size(1) % 16 == 0, "mat_b.size(1) must be multiple of 16 for memory alignment"); // out.stride(0) + TORCH_CHECK(mat_a.scalar_type() == torch::kFloat8_e4m3fn, "mat_a must be Float8_e4m3fn"); + TORCH_CHECK(mat_b.scalar_type() == torch::kFloat8_e4m3fn, "mat_b must be Float8_e4m3fn"); + TORCH_CHECK(out_dtype == torch::kHalf || out_dtype == torch::kBFloat16, "out_dtype must be Half or BFloat16"); + + TORCH_CHECK(scales_a.numel() == mat_a.size(0), "size of scales_a is not matched"); + TORCH_CHECK(scales_b.numel() == mat_b.size(1), "size of scales_b is not matched"); + TORCH_CHECK(scales_a.is_contiguous(), "scales_a must be contiguous"); + TORCH_CHECK(scales_b.is_contiguous(), "scales_b msut be contiguous"); + TORCH_CHECK(scales_a.scalar_type() == torch::kFloat32, "scales_a must be 
Float32"); + TORCH_CHECK(scales_b.scalar_type() == torch::kFloat32, "scales_b must be Float32"); + + if (bias) { + TORCH_CHECK(bias->numel() == mat_b.size(1), "size of bias is not matched"); + TORCH_CHECK(bias->is_contiguous(), "bias must be contiguous"); + TORCH_CHECK(bias->dtype() == out_dtype, "bias dtype must match output dtype"); + } + + torch::Tensor out = torch::empty({mat_a.size(0), mat_b.size(1)}, mat_a.options().dtype(out_dtype)); + + auto sm_version = getSMVersion(); + + if (sm_version >= 90) { + if (out_dtype == torch::kBFloat16) { + s90_dispatch_shape(out, mat_a, mat_b, scales_a, scales_b, bias); + } else { + s90_dispatch_shape(out, mat_a, mat_b, scales_a, scales_b, bias); + } + } else if (sm_version == 89) { + if (out_dtype == torch::kBFloat16) { + s89_dispatch_shape(out, mat_a, mat_b, scales_a, scales_b, bias); + } else { + s89_dispatch_shape(out, mat_a, mat_b, scales_a, scales_b, bias); + } + } else { + TORCH_CHECK_NOT_IMPLEMENTED(false, "No implemented int8_scaled_mm for current compute capability: ", sm_version); + } + + return out; +} diff --git a/sgl-kernel/src/sgl-kernel/csrc/sgl_kernel_ops.cu b/sgl-kernel/src/sgl-kernel/csrc/sgl_kernel_ops.cu index 6ed543e6c542..b12d324cc62b 100644 --- a/sgl-kernel/src/sgl-kernel/csrc/sgl_kernel_ops.cu +++ b/sgl-kernel/src/sgl-kernel/csrc/sgl_kernel_ops.cu @@ -17,6 +17,10 @@ torch::Tensor int8_scaled_mm(const torch::Tensor& mat_a, const torch::Tensor& ma const torch::Tensor& scales_b, const torch::Dtype& out_dtype, const c10::optional& bias); +torch::Tensor fp8_scaled_mm(const torch::Tensor& mat_a, const torch::Tensor& mat_b, const torch::Tensor& scales_a, + const torch::Tensor& scales_b, const torch::Dtype& out_dtype, + const c10::optional& bias); + PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { // trt_reduce m.def("init_custom_ar", &init_custom_ar, "init custom allreduce meta (CUDA)"); @@ -26,4 +30,6 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("moe_align_block_size", &moe_align_block_size, "MOE Align Block Size (CUDA)"); // int8_scaled_mm m.def("int8_scaled_mm", &int8_scaled_mm, "INT8 scaled matmul (CUDA)"); + // fp8_scaled_mm + m.def("fp8_scaled_mm", &fp8_scaled_mm, "FP8 scaled matmul (CUDA)"); } diff --git a/sgl-kernel/src/sgl-kernel/csrc/utils.hpp b/sgl-kernel/src/sgl-kernel/csrc/utils.hpp index 2fed2d60c039..5820b1350ab5 100644 --- a/sgl-kernel/src/sgl-kernel/csrc/utils.hpp +++ b/sgl-kernel/src/sgl-kernel/csrc/utils.hpp @@ -44,3 +44,8 @@ inline int getSMVersion() { CHECK_CUDA_SUCCESS(cudaDeviceGetAttribute(&sm_minor, cudaDevAttrComputeCapabilityMinor, device)); return sm_major * 10 + sm_minor; } + +inline uint32_t next_pow_2(uint32_t const num) { + if (num <= 1) return num; + return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1)); +} diff --git a/sgl-kernel/src/sgl-kernel/ops/__init__.py b/sgl-kernel/src/sgl-kernel/ops/__init__.py index e388ae35653b..f339997b027f 100644 --- a/sgl-kernel/src/sgl-kernel/ops/__init__.py +++ b/sgl-kernel/src/sgl-kernel/ops/__init__.py @@ -2,6 +2,7 @@ from sgl_kernel.ops._kernels import dispose as _dispose from sgl_kernel.ops._kernels import init_custom_ar as _init_custom_ar from sgl_kernel.ops._kernels import int8_scaled_mm as _int8_scaled_mm +from sgl_kernel.ops._kernels import fp8_scaled_mm as _fp8_scaled_mm from sgl_kernel.ops._kernels import moe_align_block_size as _moe_align_block_size @@ -48,3 +49,13 @@ def int8_scaled_mm(mat_a, mat_b, scales_a, scales_b, out_dtype, bias=None): out_dtype, bias, ) + +def fp8_scaled_mm(mat_a, mat_b, scales_a, scales_b, out_dtype, bias=None): + return 
_fp8_scaled_mm( + mat_a, + mat_b, + scales_a, + scales_b, + out_dtype, + bias, + ) diff --git a/sgl-kernel/tests/test_fp8_gemm.py b/sgl-kernel/tests/test_fp8_gemm.py index a233b3b435ab..c303bef1d1ed 100644 --- a/sgl-kernel/tests/test_fp8_gemm.py +++ b/sgl-kernel/tests/test_fp8_gemm.py @@ -3,6 +3,7 @@ import torch from vllm._custom_ops import cutlass_scaled_mm as vllm_scaled_mm from vllm._custom_ops import scaled_fp8_quant as vllm_scaled_fp8_quant +from sgl_kernel import fp8_scaled_mm def torch_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias): @@ -16,15 +17,16 @@ def torch_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias): return final -class TestInt8Gemm(unittest.TestCase): +class TestFp8Gemm(unittest.TestCase): def _test_accuracy_once(self, M, N, K, with_bias, out_dtype, device): - a = torch.randn((M, K), device=device) * 5 - b = torch.randn((N, K), device=device) * 5 + a = torch.randn((M, K), device=device) + b = torch.randn((N, K), device=device) - scale_a = torch.randn((M,), device="cuda", dtype=torch.float32) - scale_b = torch.randn((N,), device="cuda", dtype=torch.float32) + scale_a = torch.randn((M,), device="cuda", dtype=torch.float32) * 0.01 + scale_b = torch.randn((N,), device="cuda", dtype=torch.float32) * 0.01 if with_bias: - bias = torch.ones((N,), device="cuda", dtype=out_dtype) * 10 + # bias = torch.ones((N,), device="cuda", dtype=out_dtype) * 10 + bias = torch.randn((N,), device="cuda", dtype=out_dtype) else: bias = None o1 = torch.empty((a.shape[0], b.shape[1]), device="cuda", dtype=torch.bfloat16) @@ -32,9 +34,10 @@ def _test_accuracy_once(self, M, N, K, with_bias, out_dtype, device): b_fp8 = b_fp8.t() a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a) o = torch_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, out_dtype, bias) - o1 = vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, out_dtype, bias) + # o1 = vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, out_dtype, bias) + o1 = fp8_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, out_dtype, bias) max_val = max(o.abs().max().item(), o1.abs().max().item()) - rtol = 2e-2 + rtol = 4e-3 atol = max_val * rtol torch.testing.assert_close(o, o1, rtol=rtol, atol=atol) print(f"M={M}, N={N}, K={K}, with_bias={with_bias}, out_dtype={out_dtype}: OK") @@ -43,8 +46,13 @@ def test_accuracy(self): Ms = [1, 128, 512, 1024, 4096] Ns = [16, 128, 512, 1024, 4096] Ks = [512, 1024, 4096, 8192, 16384] - bias_opts = [True, False] - out_dtypes = [torch.bfloat16] + # Ms = [128] + # Ns = [512] + # Ks = [4096] + # bias_opts = [True, False] + bias_opts = [False] + out_dtypes = [torch.bfloat16, torch.float16] + # out_dtypes = [torch.float16] for M in Ms: for N in Ns: for K in Ks: From 4cac9fb925f58e3e90fa5c7053ad10d42afa099b Mon Sep 17 00:00:00 2001 From: HandH1998 <1335248067@qq.com> Date: Thu, 9 Jan 2025 17:41:46 +0800 Subject: [PATCH 057/248] support bias --- .../src/sgl-kernel/csrc/fp8_gemm_kernel.cu | 505 +++++++++--------- sgl-kernel/tests/test_fp8_gemm.py | 21 +- 2 files changed, 262 insertions(+), 264 deletions(-) diff --git a/sgl-kernel/src/sgl-kernel/csrc/fp8_gemm_kernel.cu b/sgl-kernel/src/sgl-kernel/csrc/fp8_gemm_kernel.cu index 795328930634..ef88110e9258 100644 --- a/sgl-kernel/src/sgl-kernel/csrc/fp8_gemm_kernel.cu +++ b/sgl-kernel/src/sgl-kernel/csrc/fp8_gemm_kernel.cu @@ -1,32 +1,9 @@ -/* - * Copyright (c) 2022-2024, Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- * - * Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ +// Adapted from https://github.com/NVIDIA/TensorRT-LLM/blob/v0.16.0/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm_template.h +// https://github.com/NVIDIA/TensorRT-LLM/blob/v0.16.0/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm_kernel_template_sm89.h +// https://github.com/NVIDIA/TensorRT-LLM/blob/v0.16.0/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm_kernel_template_sm90.h #pragma once -#ifdef __GNUC__ // Check if the compiler is GCC or Clang -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#endif // __GNUC__ - #include #include @@ -35,13 +12,6 @@ // Order matters here, packed_stride.hpp is missing cute and convolution includes #include "cutlass/util/packed_stride.hpp" -#ifdef __GNUC__ // Check if the compiler is GCC or Clang -#pragma GCC diagnostic pop -#endif // __GNUC__ - -// #include "fp8_rowwise_gemm_kernel_template_sm89.h" -// #include "fp8_rowwise_gemm_kernel_template_sm90.h" - #include "cutlass/cutlass.h" #include "cutlass/gemm/device/gemm.h" #include "cutlass/gemm/device/gemm_universal_adapter.h" @@ -59,127 +29,11 @@ #include "cutlass/epilogue/collective/collective_builder.hpp" #include "cutlass/gemm/device/gemm_universal_adapter.h" - #include "utils.hpp" using namespace cute; -template -struct DeviceGemmFp8RowwiseSm90 -{ - static_assert(std::is_same_v, "ElementType must be FP8(e4m3)"); - - // A matrix configuration - using ElementA = ElementType; // Element type for A matrix operand - using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand - static constexpr int AlignmentA - = 128 / cutlass::sizeof_bits::value; // Memory access granularity/alignment of A - // matrix in units of elements (up to 16 bytes) - - // B matrix configuration - using ElementB = ElementType; // Element type for B matrix operand - using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand - static constexpr int AlignmentB - = 128 / cutlass::sizeof_bits::value; // Memory access granularity/alignment of B - // matrix in units of elements (up to 16 bytes) - - // C/D matrix configuration - using ElementC = void; // Element type for C matrix operands - using LayoutC = cutlass::layout::RowMajor; // Layout type for C matrix operands - static constexpr int AlignmentC - = 128 / cutlass::sizeof_bits::value; // Memory access granularity/alignment of C matrices in - // units of elements (up to 16 bytes) - - // Output matrix configuration - using ElementOutput = OutElementType; // Element type for output matrix operands - using LayoutOutput = cutlass::layout::RowMajor; // Layout type for output matrix operands - static constexpr int AlignmentOutput = 128 / cutlass::sizeof_bits::value; - - // Auxiliary matrix configuration and other fusion types - using ElementBias = float; - - // Multiply-accumulate blocking/pipelining 
details - using ElementAccumulator = AccumElementType; // Element type for internal accumulation - using ElementCompute = float; // Element type for compute - using ElementComputeEpilogue = float; - using ArchTag = cutlass::arch::Sm90; // Tag indicating the minimum SM that supports the intended feature - using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag - using TileShape = CTAShape; // Threadblock-level tile size - using TileScheduler = TileSchedulerType; - - static constexpr bool PONG = false; - static constexpr bool FAST_ACCUM = true; - static constexpr bool USE_BIAS = false; - - using StageCountType = cutlass::gemm::collective::StageCountAuto; // Stage count maximized - // based on the tile size - using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto; // Kernel to launch based on the default - // setting in the Collective Builder - // Implement rowwise scaling epilogue. - using XScale = cutlass::epilogue::fusion::Sm90ColBroadcast<0, TileShape, ElementComputeEpilogue, ElementComputeEpilogue, - cute::Stride, cute::Int<0>, cute::Int<0>>>; - - using WScale = cutlass::epilogue::fusion::Sm90RowBroadcast<0, TileShape, ElementComputeEpilogue, ElementComputeEpilogue, - cute::Stride, cute::Int<1>, cute::Int<0>>>; - - using Bias = cutlass::epilogue::fusion::Sm90RowBroadcast<0, TileShape, ElementBias, ElementBias, - cute::Stride, cute::Int<1>, cute::Int<0>>>; - - using Accum = cutlass::epilogue::fusion::Sm90AccFetch; - - using Compute0 = cutlass::epilogue::fusion::Sm90Compute; - - using EVTCompute0 = cutlass::epilogue::fusion::Sm90EVT; - - using Compute1 = cutlass::epilogue::fusion::Sm90Compute; - - using EVTCompute1 = cutlass::epilogue::fusion::Sm90EVT; - - using ComputeBias = cutlass::epilogue::fusion::Sm90Compute; - - using EVTComputeBias = cutlass::epilogue::fusion::Sm90EVT; - - using EpilogueEVT = EVTCompute1; - - using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder::CollectiveOp; - - using DefaultSchedule = cutlass::gemm::KernelTmaWarpSpecialized; - using PongSchedule = cutlass::gemm::KernelTmaWarpSpecializedPingpong; - using FastDefaultSchedule = cutlass::gemm::KernelTmaWarpSpecializedFP8FastAccum; - using FastPongSchedule = cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum; - - using SlowAccum = DefaultSchedule; - using FastAccum = FastDefaultSchedule; - using MainLoopSchedule = cute::conditional_t; - - using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder( - sizeof(typename CollectiveEpilogue::SharedStorage))>, - MainLoopSchedule>::CollectiveOp; - - using GemmKernel = cutlass::gemm::kernel::GemmUniversal, // Indicates ProblemShape - CollectiveMainloop, CollectiveEpilogue, TileScheduler>; - - using Gemm = cutlass::gemm::device::GemmUniversalAdapter; -}; - template -// template + typename WarpShape, int Stages, bool WithBias> struct DeviceGemmFp8RowwiseSm89 { static_assert(std::is_same_v, "ElementType must be FP8(e4m3)"); @@ -227,20 +81,17 @@ struct DeviceGemmFp8RowwiseSm89 Stride<_1, _0, _0>>; using EpilogueAScale = cutlass::epilogue::threadblock::Sm80EVT; - // // With bias - // using biasSrc = cutlass::epilogue::threadblock::VisitorRowBroadcast>; - // using ComputeAScaleWithBias = cutlass::epilogue::threadblock::VisitorCompute; - // using EpilogueAScaleWithBias = cutlass::epilogue::threadblock::Sm80EVT; - + // With bias + using biasSrc = cutlass::epilogue::threadblock::VisitorRowBroadcast>; + using ComputeAScaleWithBias = cutlass::epilogue::threadblock::VisitorCompute; + using 
EpilogueAScaleWithBias = cutlass::epilogue::threadblock::Sm80EVT; using dTar = cutlass::epilogue::threadblock::VisitorAuxStore>; - using EpilogueStore = cutlass::epilogue::threadblock::Sm80EVT; - // using EpilogueStore = cutlass::platform::conditional, - // cutlass::epilogue::threadblock::Sm80EVT>::type; + using EpilogueStore = typename cutlass::platform::conditional, + cutlass::epilogue::threadblock::Sm80EVT>::type; - using EpilogueOp = EpilogueStore; using GemmKernel = typename cutlass::gemm::kernel::DefaultGemmWithVisitor +template typename Gemm::Arguments prepare_sm89_fp8_args(torch::Tensor& out, const torch::Tensor& a, const torch::Tensor& b, const torch::Tensor& scales_a, const torch::Tensor& scales_b, const c10::optional& bias) @@ -262,9 +113,6 @@ typename Gemm::Arguments prepare_sm89_fp8_args(torch::Tensor& out, const torch:: using ElementOutput = typename Gemm::ElementD; using ElementComputeEpilogue = float; - // int const lda = k; - // int const ldb = k; - // int const ldc = n; int32_t m = a.size(0); int32_t n = b.size(1); int32_t k = a.size(1); @@ -275,16 +123,22 @@ typename Gemm::Arguments prepare_sm89_fp8_args(torch::Tensor& out, const torch:: ElementT const* ptr_a = reinterpret_cast(a.data_ptr()); ElementT const* ptr_b = reinterpret_cast(b.data_ptr()); + ElementOutput const* ptr_bias = nullptr; + if constexpr (WithBias) { + TORCH_CHECK(bias.has_value()) + ptr_bias = reinterpret_cast(bias.value().data_ptr()); + } ElementOutput* ptr_d = reinterpret_cast(out.data_ptr()); ElementComputeEpilogue const* ptr_scales_a = reinterpret_cast(scales_a.data_ptr()); ElementComputeEpilogue const* ptr_scales_b = reinterpret_cast(scales_b.data_ptr()); + typename Gemm::Arguments args(cutlass::gemm::GemmUniversalMode::kGemm, // Mode {m, n, k}, // Problem size 1, // Split-k factor {}, // Epilogue args - ptr_a, // a pointer - ptr_b, // b pointer + ptr_a, // a pointer + ptr_b, // b pointer nullptr, // c pointer (unused) nullptr, // d pointer (unused) m * k, // batch stride a (unused) @@ -295,8 +149,22 @@ typename Gemm::Arguments prepare_sm89_fp8_args(torch::Tensor& out, const torch:: ldb, // stride b ldc, // stride c (unused) ldc); // stride d (unused) - - args.epilogue = { + if constexpr (WithBias) { + args.epilogue = { + { + { + {}, // Accumulator + {ptr_scales_b, ElementComputeEpilogue(0), + {_0{}, _1{}, _0{}}}, + {} // Multiplies + }, + {ptr_scales_a, ElementComputeEpilogue(0), {_1{}, _0{}, _0{}}}, + {ptr_bias, ElementOutput(0), {_0{}, _1{}, _0{}}}, + {} // Multiplies + }, + {ptr_d, {n, _1{}, _0{}}}}; + } else { + args.epilogue = { { { {}, // Accumulator @@ -308,45 +176,53 @@ typename Gemm::Arguments prepare_sm89_fp8_args(torch::Tensor& out, const torch:: {} // Multiplies }, {ptr_d, {n, _1{}, _0{}}}}; + } + return args; } -template +template void launch_sm89_fp8_scaled_mm(torch::Tensor& out, const torch::Tensor& a, const torch::Tensor& b, const torch::Tensor& scales_a, const torch::Tensor& scales_b, const c10::optional& bias) { - using ElementInput = cutlass::float_e4m3_t; - using ElementOutput = OutType; - using AccumElementType = float; - - using Gemm = typename DeviceGemmFp8RowwiseSm89::Gemm; - - auto args = prepare_sm89_fp8_args(out, a, b, scales_a, scales_b, bias); + auto args = prepare_sm89_fp8_args(out, a, b, scales_a, scales_b, bias); Gemm gemm_op; - // CUTLASS_CHECK(gemm_op.can_implement(args)); size_t workspace_size = gemm_op.get_workspace_size(args); auto const workspace_options = torch::TensorOptions().dtype(torch::kUInt8).device(a.device()); auto workspace = 
torch::empty(workspace_size, workspace_options); - auto stream = at::cuda::getCurrentCUDAStream(a.get_device()); auto can_implement = gemm_op.can_implement(args); TORCH_CHECK(can_implement == cutlass::Status::kSuccess) - // auto status = gemm_op.run(args, workspace.data_ptr(), stream); auto status = gemm_op(args, workspace.data_ptr(), stream); TORCH_CHECK(status == cutlass::Status::kSuccess) - // return typedFp8RowwiseGemmKernelLauncher( - // Gemm{}, args, D, A, B, C_bias, workspace, workspaceBytes, stream, occupancy); +} + +template +void sm89_dispatch_bias(torch::Tensor& out, const torch::Tensor& a, const torch::Tensor& b, const torch::Tensor& scales_a, + const torch::Tensor& scales_b, + const c10::optional& bias) { + using ElementInput = cutlass::float_e4m3_t; + using ElementOutput = OutType; + using AccumElementType = float; + if (bias) { + using Gemm = typename DeviceGemmFp8RowwiseSm89::Gemm; + return launch_sm89_fp8_scaled_mm(out, a, b, scales_a, scales_b, bias); + } else { + using Gemm = typename DeviceGemmFp8RowwiseSm89::Gemm; + return launch_sm89_fp8_scaled_mm(out, a, b, scales_a, scales_b, bias); + } } template -void s89_dispatch_shape(torch::Tensor& out, const torch::Tensor& a, const torch::Tensor& b, const torch::Tensor& scales_a, +void sm89_dispatch_shape(torch::Tensor& out, const torch::Tensor& a, const torch::Tensor& b, const torch::Tensor& scales_a, const torch::Tensor& scales_b, const c10::optional& bias) { uint32_t const m = a.size(0); @@ -359,59 +235,170 @@ void s89_dispatch_shape(torch::Tensor& out, const torch::Tensor& a, const torch: if (mp2 <= 16) { // M in [1, 16] if (np2 <= 8192) { - return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<16, 64, 64>, 5>(out, a, b, scales_a, scales_b, bias); + return sm89_dispatch_bias, cutlass::gemm::GemmShape<16, 64, 64>, 5>(out, a, b, scales_a, scales_b, bias); } else if (np2 <= 24576) { - return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<16, 64, 64>, 5>(out, a, b, scales_a, scales_b, bias); + return sm89_dispatch_bias, cutlass::gemm::GemmShape<16, 64, 64>, 5>(out, a, b, scales_a, scales_b, bias); } else { - return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<16, 64, 64>, 5>(out, a, b, scales_a, scales_b, bias); + return sm89_dispatch_bias, cutlass::gemm::GemmShape<16, 64, 64>, 5>(out, a, b, scales_a, scales_b, bias); } } else if (mp2 <= 32) { // M in (16, 32] if (np2 <= 8192) { - return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<16, 64, 64>, 5>(out, a, b, scales_a, scales_b, bias); + return sm89_dispatch_bias, cutlass::gemm::GemmShape<16, 64, 64>, 5>(out, a, b, scales_a, scales_b, bias); } else if (np2 <= 16384) { - return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<32, 64, 64>, 4>(out, a, b, scales_a, scales_b, bias); + return sm89_dispatch_bias, cutlass::gemm::GemmShape<32, 64, 64>, 4>(out, a, b, scales_a, scales_b, bias); } else { - return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<16, 64, 64>, 5>(out, a, b, scales_a, scales_b, bias); + return sm89_dispatch_bias, cutlass::gemm::GemmShape<16, 64, 64>, 5>(out, a, b, scales_a, scales_b, bias); } } else if (mp2 <= 64) { // M in (32, 64] if (np2 <= 8192) { - return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<32, 64, 64>, 5>(out, a, b, scales_a, scales_b, bias); + return sm89_dispatch_bias, cutlass::gemm::GemmShape<32, 64, 64>, 5>(out, a, b, scales_a, scales_b, bias); } else if (np2 <= 16384) { - return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<64, 64, 64>, 3>(out, a, b, scales_a, scales_b, bias); + return 
sm89_dispatch_bias, cutlass::gemm::GemmShape<64, 64, 64>, 3>(out, a, b, scales_a, scales_b, bias); } else { - return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<32, 64, 64>, 5>(out, a, b, scales_a, scales_b, bias); + return sm89_dispatch_bias, cutlass::gemm::GemmShape<32, 64, 64>, 5>(out, a, b, scales_a, scales_b, bias); } } else if (mp2 <= 128) { // M in (64, 128] if (np2 <= 8192) { - return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<64, 64, 64>, 3>(out, a, b, scales_a, scales_b, bias); + return sm89_dispatch_bias, cutlass::gemm::GemmShape<64, 64, 64>, 3>(out, a, b, scales_a, scales_b, bias); } else if (np2 <= 16384) { - return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<64, 64, 64>, 5>(out, a, b, scales_a, scales_b, bias); + return sm89_dispatch_bias, cutlass::gemm::GemmShape<64, 64, 64>, 5>(out, a, b, scales_a, scales_b, bias); } else { - return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<64, 64, 64>, 3>(out, a, b, scales_a, scales_b, bias); + return sm89_dispatch_bias, cutlass::gemm::GemmShape<64, 64, 64>, 3>(out, a, b, scales_a, scales_b, bias); } } else if (mp2 <= 256) { // M in (128, 256] if (np2 <= 4096) { - return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<64, 64, 64>, 3>(out, a, b, scales_a, scales_b, bias); + return sm89_dispatch_bias, cutlass::gemm::GemmShape<64, 64, 64>, 3>(out, a, b, scales_a, scales_b, bias); } else { - return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<64, 64, 64>, 5>(out, a, b, scales_a, scales_b, bias); + return sm89_dispatch_bias, cutlass::gemm::GemmShape<64, 64, 64>, 5>(out, a, b, scales_a, scales_b, bias); } } else { // M in (256, inf) if (np2 <= 4096) { - return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<64, 64, 64>, 5>(out, a, b, scales_a, scales_b, bias); + return sm89_dispatch_bias, cutlass::gemm::GemmShape<64, 64, 64>, 5>(out, a, b, scales_a, scales_b, bias); } else if (np2 <= 8192) { - return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<64, 64, 64>, 3>(out, a, b, scales_a, scales_b, bias); + return sm89_dispatch_bias, cutlass::gemm::GemmShape<64, 64, 64>, 3>(out, a, b, scales_a, scales_b, bias); } else { - return launch_sm89_fp8_scaled_mm, cutlass::gemm::GemmShape<64, 64, 64>, 5>(out, a, b, scales_a, scales_b, bias); + return sm89_dispatch_bias, cutlass::gemm::GemmShape<64, 64, 64>, 5>(out, a, b, scales_a, scales_b, bias); } } } -template +template +struct DeviceGemmFp8RowwiseSm90 +{ + static_assert(std::is_same_v, "ElementType must be FP8(e4m3)"); + + // A matrix configuration + using ElementA = ElementType; // Element type for A matrix operand + using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand + static constexpr int AlignmentA + = 128 / cutlass::sizeof_bits::value; // Memory access granularity/alignment of A + // matrix in units of elements (up to 16 bytes) + + // B matrix configuration + using ElementB = ElementType; // Element type for B matrix operand + using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand + static constexpr int AlignmentB + = 128 / cutlass::sizeof_bits::value; // Memory access granularity/alignment of B + // matrix in units of elements (up to 16 bytes) + + // C/D matrix configuration + using ElementC = void; // Element type for C matrix operands + using LayoutC = cutlass::layout::RowMajor; // Layout type for C matrix operands + static constexpr int AlignmentC + = 128 / cutlass::sizeof_bits::value; // Memory access granularity/alignment of C matrices in + // units of elements (up to 16 bytes) + + // 
Output matrix configuration + using ElementOutput = OutElementType; // Element type for output matrix operands + using LayoutOutput = cutlass::layout::RowMajor; // Layout type for output matrix operands + static constexpr int AlignmentOutput = 128 / cutlass::sizeof_bits::value; + + // // Auxiliary matrix configuration and other fusion types + // using ElementBias = float; + + // Multiply-accumulate blocking/pipelining details + using ElementAccumulator = AccumElementType; // Element type for internal accumulation + using ElementCompute = float; // Element type for compute + using ElementComputeEpilogue = float; + using ArchTag = cutlass::arch::Sm90; // Tag indicating the minimum SM that supports the intended feature + using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag + using TileShape = CTAShape; // Threadblock-level tile size + using TileScheduler = TileSchedulerType; + + static constexpr bool PONG = false; + static constexpr bool FAST_ACCUM = true; + static constexpr bool USE_BIAS = false; + + using StageCountType = cutlass::gemm::collective::StageCountAuto; // Stage count maximized + // based on the tile size + using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto; // Kernel to launch based on the default + // setting in the Collective Builder + // Implement rowwise scaling epilogue. + using XScale = cutlass::epilogue::fusion::Sm90ColBroadcast<0, TileShape, ElementComputeEpilogue, ElementComputeEpilogue, + cute::Stride, cute::Int<0>, cute::Int<0>>>; + + using WScale = cutlass::epilogue::fusion::Sm90RowBroadcast<0, TileShape, ElementComputeEpilogue, ElementComputeEpilogue, + cute::Stride, cute::Int<1>, cute::Int<0>>>; + + using Bias = cutlass::epilogue::fusion::Sm90RowBroadcast<0, TileShape, ElementOutput, ElementOutput, + cute::Stride, cute::Int<1>, cute::Int<0>>>; + + using Accum = cutlass::epilogue::fusion::Sm90AccFetch; + + using Compute0 = cutlass::epilogue::fusion::Sm90Compute; + + using EVTCompute0 = cutlass::epilogue::fusion::Sm90EVT; + + using Compute1 = cutlass::epilogue::fusion::Sm90Compute; + + using EVTCompute1 = cutlass::epilogue::fusion::Sm90EVT; + + // With bias + using ComputeWithBias = cutlass::epilogue::fusion::Sm90Compute; + using EVTComputeWithBias = cutlass::epilogue::fusion::Sm90EVT; + + using EpilogueEVT = typename cutlass::platform::conditional::type; + + using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder::CollectiveOp; + + using DefaultSchedule = cutlass::gemm::KernelTmaWarpSpecialized; + using PongSchedule = cutlass::gemm::KernelTmaWarpSpecializedPingpong; + using FastDefaultSchedule = cutlass::gemm::KernelTmaWarpSpecializedFP8FastAccum; + using FastPongSchedule = cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum; + + using SlowAccum = DefaultSchedule; + using FastAccum = FastDefaultSchedule; + using MainLoopSchedule = cute::conditional_t; + + using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder( + sizeof(typename CollectiveEpilogue::SharedStorage))>, + MainLoopSchedule>::CollectiveOp; + + using GemmKernel = cutlass::gemm::kernel::GemmUniversal, // Indicates ProblemShape + CollectiveMainloop, CollectiveEpilogue, TileScheduler>; + + using Gemm = cutlass::gemm::device::GemmUniversalAdapter; +}; + +template typename Gemm::Arguments prepare_sm90_fp8_args(torch::Tensor& out, const torch::Tensor& a, const torch::Tensor& b, const torch::Tensor& scales_a, const torch::Tensor& scales_b, const c10::optional& bias) @@ -429,6 +416,11 @@ typename Gemm::Arguments 
prepare_sm90_fp8_args(torch::Tensor& out, const torch:: int32_t k = a.size(1); ElementT const* ptr_a = reinterpret_cast(a.data_ptr()); ElementT const* ptr_b = reinterpret_cast(b.data_ptr()); + ElementOutput const* ptr_bias = nullptr; + if constexpr (WithBias) { + TORCH_CHECK(bias.has_value()) + ptr_bias = reinterpret_cast(bias.value().data_ptr()); + } ElementOutput* ptr_d = reinterpret_cast(out.data_ptr()); ElementComputeEpilogue const* ptr_scales_a = reinterpret_cast(scales_a.data_ptr()); ElementComputeEpilogue const* ptr_scales_b = reinterpret_cast(scales_b.data_ptr()); @@ -442,41 +434,42 @@ typename Gemm::Arguments prepare_sm90_fp8_args(torch::Tensor& out, const torch:: = {cutlass::gemm::GemmUniversalMode::kGemm, {m, n, k, 1}, {ptr_a, stride_a, ptr_b, stride_b}, {{}, // epilogue.thread nullptr, stride_c, ptr_d, stride_d}}; - args.epilogue.thread = { - {ptr_scales_a}, - { - {ptr_scales_b}, {}, // Accumulator - {} // Multiplies - }, - {}, // Multiplies - }; + if constexpr (WithBias) { + args.epilogue.thread = { + {ptr_scales_a}, + { + {ptr_scales_b}, {}, // Accumulator + {} // Multiplies + }, + {ptr_bias}, + {}, // Multiplies + }; + } else { + args.epilogue.thread = { + {ptr_scales_a}, + { + {ptr_scales_b}, {}, // Accumulator + {} // Multiplies + }, + {}, // Multiplies + }; + } + return args; } -template +template void launch_sm90_fp8_scaled_mm(torch::Tensor& out, const torch::Tensor& a, const torch::Tensor& b, const torch::Tensor& scales_a, const torch::Tensor& scales_b, const c10::optional& bias) { - using ElementInput = cutlass::float_e4m3_t; - using ElementOutput = OutType; - using AccumElementType = float; - using MainloopScheduleType = cutlass::gemm::KernelTmaWarpSpecializedFP8FastAccum; - using EpilogueScheduleType = cutlass::epilogue::TmaWarpSpecialized; - using TileSchedulerType = void; - using Gemm = typename DeviceGemmFp8RowwiseSm90::Gemm; - auto args = prepare_sm90_fp8_args(out, a, b, scales_a, scales_b, bias); - - // Launch the CUTLASS GEMM kernel. 
+ auto args = prepare_sm90_fp8_args(out, a, b, scales_a, scales_b, bias); Gemm gemm_op; - // CUTLASS_CHECK(gemm_op.can_implement(args)); size_t workspace_size = gemm_op.get_workspace_size(args); auto const workspace_options = torch::TensorOptions().dtype(torch::kUInt8).device(a.device()); auto workspace = torch::empty(workspace_size, workspace_options); - auto stream = at::cuda::getCurrentCUDAStream(a.get_device()); auto can_implement = gemm_op.can_implement(args); @@ -484,19 +477,32 @@ void launch_sm90_fp8_scaled_mm(torch::Tensor& out, const torch::Tensor& a, const auto status = gemm_op.run(args, workspace.data_ptr(), stream); TORCH_CHECK(status == cutlass::Status::kSuccess) - // cutlass::Status status = gemm_op.run(args, workspace.data_ptr(), stream); - // CUTLASS_CHECK(status); -// return typedFp8RowwiseGemmKernelLauncher( -// Gemm{}, args, D, A, B, C_bias, workspace, workspaceBytes, stream, occupancy); -// #else // COMPILE_HOPPER_TMA_GEMMS -// throw std::runtime_error( -// "[TensorRT-LLm Error][Fp8RowwiseGemmKernelLauncherSm90] Please recompile with support for hopper by passing " -// "90-real as an arch to build_wheel.py."); -// #endif // COMPILE_HOPPER_TMA_GEMMS +} + + +template +void sm90_dispatch_bias(torch::Tensor& out, const torch::Tensor& a, const torch::Tensor& b, const torch::Tensor& scales_a, + const torch::Tensor& scales_b, + const c10::optional& bias) { + using ElementInput = cutlass::float_e4m3_t; + using ElementOutput = OutType; + using AccumElementType = float; + using MainloopScheduleType = cutlass::gemm::KernelTmaWarpSpecializedFP8FastAccum; + using EpilogueScheduleType = cutlass::epilogue::TmaWarpSpecialized; + using TileSchedulerType = void; + if (bias) { + using Gemm = typename DeviceGemmFp8RowwiseSm90::Gemm; + return launch_sm90_fp8_scaled_mm(out, a, b, scales_a, scales_b, bias); + } else { + using Gemm = typename DeviceGemmFp8RowwiseSm90::Gemm; + return launch_sm90_fp8_scaled_mm(out, a, b, scales_a, scales_b, bias); + } } template -void s90_dispatch_shape(torch::Tensor& out, const torch::Tensor& a, const torch::Tensor& b, const torch::Tensor& scales_a, +void sm90_dispatch_shape(torch::Tensor& out, const torch::Tensor& a, const torch::Tensor& b, const torch::Tensor& scales_a, const torch::Tensor& scales_b, const c10::optional& bias) { uint32_t const m = a.size(0); @@ -505,13 +511,13 @@ void s90_dispatch_shape(torch::Tensor& out, const torch::Tensor& a, const torch: if (mp2 <= 64) { // m in [1, 64] - return launch_sm90_fp8_scaled_mm, Shape<_1, _8, _1>>(out, a, b, scales_a, scales_b, bias); + return sm90_dispatch_bias, Shape<_1, _8, _1>>(out, a, b, scales_a, scales_b, bias); } else if (mp2 <= 128) { // m in (64, 128] - return launch_sm90_fp8_scaled_mm, Shape<_2, _1, _1>>(out, a, b, scales_a, scales_b, bias); + return sm90_dispatch_bias, Shape<_2, _1, _1>>(out, a, b, scales_a, scales_b, bias); } else { // m in (128, inf) - return launch_sm90_fp8_scaled_mm, Shape<_2, _1, _1>>(out, a, b, scales_a, scales_b, bias); + return sm90_dispatch_bias, Shape<_2, _1, _1>>(out, a, b, scales_a, scales_b, bias); } } @@ -526,10 +532,8 @@ torch::Tensor fp8_scaled_mm(const torch::Tensor& mat_a, const torch::Tensor& mat TORCH_CHECK(mat_b.stride(0) == 1, "mat_a must be a column major tensor"); TORCH_CHECK(mat_a.size(1) == mat_b.size(0), "mat_a and mat_b shapes cannot be multiplied"); - TORCH_CHECK(mat_a.size(1) % 16 == 0, "mat_a.size(1) must be multiple of 16 for memory alignment"); -// TORCH_CHECK(mat_b.size(0) % 16 == 0, "mat_b.size(0) must be multiple of 16 for memory alignment"); 
-//TODO: % 8 - TORCH_CHECK(mat_b.size(1) % 16 == 0, "mat_b.size(1) must be multiple of 16 for memory alignment"); // out.stride(0) + TORCH_CHECK((mat_a.size(1) * mat_a.element_size()) % 16 == 0, "mat_a must be multiple of 16 bytes for memory alignment"); + TORCH_CHECK((mat_b.size(0) * mat_b.element_size()) % 16 == 0, "mat_b must be multiple of 16 bytes for memory alignment"); TORCH_CHECK(mat_a.scalar_type() == torch::kFloat8_e4m3fn, "mat_a must be Float8_e4m3fn"); TORCH_CHECK(mat_b.scalar_type() == torch::kFloat8_e4m3fn, "mat_b must be Float8_e4m3fn"); TORCH_CHECK(out_dtype == torch::kHalf || out_dtype == torch::kBFloat16, "out_dtype must be Half or BFloat16"); @@ -548,21 +552,22 @@ torch::Tensor fp8_scaled_mm(const torch::Tensor& mat_a, const torch::Tensor& mat } torch::Tensor out = torch::empty({mat_a.size(0), mat_b.size(1)}, mat_a.options().dtype(out_dtype)); + TORCH_CHECK((out.size(1) * out.element_size()) % 16 == 0, "out must be multiple of 16 bytes for memory alignment"); auto sm_version = getSMVersion(); if (sm_version >= 90) { - if (out_dtype == torch::kBFloat16) { - s90_dispatch_shape(out, mat_a, mat_b, scales_a, scales_b, bias); - } else { - s90_dispatch_shape(out, mat_a, mat_b, scales_a, scales_b, bias); - } + if (out_dtype == torch::kBFloat16) { + sm90_dispatch_shape(out, mat_a, mat_b, scales_a, scales_b, bias); + } else { + sm90_dispatch_shape(out, mat_a, mat_b, scales_a, scales_b, bias); + } } else if (sm_version == 89) { - if (out_dtype == torch::kBFloat16) { - s89_dispatch_shape(out, mat_a, mat_b, scales_a, scales_b, bias); - } else { - s89_dispatch_shape(out, mat_a, mat_b, scales_a, scales_b, bias); - } + if (out_dtype == torch::kBFloat16) { + sm89_dispatch_shape(out, mat_a, mat_b, scales_a, scales_b, bias); + } else { + sm89_dispatch_shape(out, mat_a, mat_b, scales_a, scales_b, bias); + } } else { TORCH_CHECK_NOT_IMPLEMENTED(false, "No implemented int8_scaled_mm for current compute capability: ", sm_version); } diff --git a/sgl-kernel/tests/test_fp8_gemm.py b/sgl-kernel/tests/test_fp8_gemm.py index c303bef1d1ed..2a474d7ea17e 100644 --- a/sgl-kernel/tests/test_fp8_gemm.py +++ b/sgl-kernel/tests/test_fp8_gemm.py @@ -1,7 +1,6 @@ import unittest import torch -from vllm._custom_ops import cutlass_scaled_mm as vllm_scaled_mm from vllm._custom_ops import scaled_fp8_quant as vllm_scaled_fp8_quant from sgl_kernel import fp8_scaled_mm @@ -13,6 +12,8 @@ def torch_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias): temp1 = o * scale_a.view(-1, 1) temp2 = temp1 * scale_b.view(1, -1) final = temp2.to(out_dtype) + if bias is not None: + final = final + bias.view(1, -1) return final @@ -22,10 +23,9 @@ def _test_accuracy_once(self, M, N, K, with_bias, out_dtype, device): a = torch.randn((M, K), device=device) b = torch.randn((N, K), device=device) - scale_a = torch.randn((M,), device="cuda", dtype=torch.float32) * 0.01 - scale_b = torch.randn((N,), device="cuda", dtype=torch.float32) * 0.01 + scale_a = torch.randn((M,), device="cuda", dtype=torch.float32) * 0.001 + scale_b = torch.randn((N,), device="cuda", dtype=torch.float32) * 0.001 if with_bias: - # bias = torch.ones((N,), device="cuda", dtype=out_dtype) * 10 bias = torch.randn((N,), device="cuda", dtype=out_dtype) else: bias = None @@ -34,11 +34,9 @@ def _test_accuracy_once(self, M, N, K, with_bias, out_dtype, device): b_fp8 = b_fp8.t() a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a) o = torch_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, out_dtype, bias) - # o1 = vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, 
out_dtype, bias) o1 = fp8_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, out_dtype, bias) - max_val = max(o.abs().max().item(), o1.abs().max().item()) - rtol = 4e-3 - atol = max_val * rtol + rtol = 0.01 + atol = 0.1 torch.testing.assert_close(o, o1, rtol=rtol, atol=atol) print(f"M={M}, N={N}, K={K}, with_bias={with_bias}, out_dtype={out_dtype}: OK") @@ -46,13 +44,8 @@ def test_accuracy(self): Ms = [1, 128, 512, 1024, 4096] Ns = [16, 128, 512, 1024, 4096] Ks = [512, 1024, 4096, 8192, 16384] - # Ms = [128] - # Ns = [512] - # Ks = [4096] - # bias_opts = [True, False] - bias_opts = [False] + bias_opts = [True, False] out_dtypes = [torch.bfloat16, torch.float16] - # out_dtypes = [torch.float16] for M in Ms: for N in Ns: for K in Ks: From ecc90a484fb6a150d4a76b760baad4640b2ae064 Mon Sep 17 00:00:00 2001 From: yych0745 <1398089567@qq.com> Date: Fri, 10 Jan 2025 17:22:15 +0800 Subject: [PATCH 058/248] opitmize --- sgl-kernel/benchmark/bench_fp8_gemm.py | 89 +++++++++- .../benchmark/bench_int8_res/results.html | 3 + sgl-kernel/benchmark/best_fp8_configs.json | 42 +++++ sgl-kernel/setup.py | 38 ++++- sgl-kernel/src/sgl-kernel/__init__.py | 2 + .../src/sgl-kernel/csrc/fp8_gemm_kernel.cu | 159 ++++++++++++++++-- .../src/sgl-kernel/csrc/sgl_kernel_ops.cu | 15 +- sgl-kernel/src/sgl-kernel/ops/__init__.py | 22 ++- 8 files changed, 338 insertions(+), 32 deletions(-) create mode 100644 sgl-kernel/benchmark/bench_int8_res/results.html create mode 100644 sgl-kernel/benchmark/best_fp8_configs.json diff --git a/sgl-kernel/benchmark/bench_fp8_gemm.py b/sgl-kernel/benchmark/bench_fp8_gemm.py index d4bc2fdb91a3..65efce4417c0 100644 --- a/sgl-kernel/benchmark/bench_fp8_gemm.py +++ b/sgl-kernel/benchmark/bench_fp8_gemm.py @@ -4,8 +4,9 @@ from vllm._custom_ops import cutlass_scaled_mm as vllm_scaled_mm from vllm._custom_ops import scaled_fp8_quant as vllm_scaled_fp8_quant - - +from sgl_kernel import fp8_scaled_mm as sgl_scaled_mm +from sgl_kernel import fp8_scaled_mm_profile as sgl_scaled_mm_profile +import time def to_int8(tensor: torch.Tensor) -> torch.Tensor: return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8) @@ -16,16 +17,18 @@ def to_int8(tensor: torch.Tensor) -> torch.Tensor: x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048], x_log=False, line_arg="provider", - line_vals=["vllm-fp8", "torch-fp8"], - line_names=["vllm-fp8", "torch-fp8"], - styles=[("green", "-"), ("blue", "-")], + # line_vals=["vllm-fp8", "torch-fp8", "sglang-fp8"], + # line_names=["vllm-fp8", "torch-fp8", "sglang-fp8"], + line_vals=["vllm-fp8", "sglang-fp8", "sglang-fp8-profile"], + line_names=["vllm-fp8", "sglang-fp8", "sglang-fp8-profile"], + styles=[("green", "-"), ("blue", "-"), ("red", "-")], ylabel="GB/s", plot_name="int8 scaled matmul", args={}, ) ) def benchmark(batch_size, provider): - M, N, K = batch_size, 8192, 21760 + M, N, K = batch_size, 4096, 8192 a = torch.ones((M, K), device="cuda") * 5.0 b = torch.ones((N, K), device="cuda") * 5.0 scale_a = torch.randn((M,), device="cuda", dtype=torch.float32) @@ -64,6 +67,80 @@ def benchmark(batch_size, provider): except RuntimeError as e: print("Error details:", e) raise + if provider == "sglang-fp8": + ms, min_ms, max_ms = triton.testing.do_bench( + lambda: sgl_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, torch.bfloat16), + quantiles=quantiles, + ) + if provider == "sglang-fp8-profile": + best_configs = [] + times = [] + valid_configs = [] + best_config_info = {} # 新增:用于存储每个输入规模的最优配置信息 + + try: + sgl_scaled_mm_profile(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, 
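Every provider branch in this benchmark follows the same pattern: wrap one kernel call in a lambda, hand it to triton.testing.do_bench, and convert the returned millisecond quantiles with the gbps lambda at the end of the function. Restated as a self-contained helper; the quantiles list is an assumption (the benchmark defines quantiles earlier in the file), and elem_size=4 stands in for a.element_size() of the float32 input:

import torch
import triton
from sgl_kernel import fp8_scaled_mm as sgl_scaled_mm

def bench_sglang_fp8(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, M, N, K, elem_size=4):
    quantiles = [0.5, 0.2, 0.8]  # assumed median / 20th / 80th percentile
    ms, min_ms, max_ms = triton.testing.do_bench(
        lambda: sgl_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, torch.bfloat16),
        quantiles=quantiles,
    )
    gbps = lambda t: (2 * M * N * K + M * N) * elem_size * 1e-9 / (t * 1e-3)
    return gbps(ms), gbps(max_ms), gbps(min_ms)  # same conversion the benchmark returns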
torch.bfloat16, bias=None, config_id=35) + except RuntimeError as e: + print(f"Skip config_id 35 due to error: {e}") + + for config_id in range(1, 7): + try: + torch.cuda.synchronize() + start = time.time() + sgl_scaled_mm_profile(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, + torch.bfloat16, bias=None, config_id=config_id) + torch.cuda.synchronize() + end = time.time() + times.append(end - start) + valid_configs.append(config_id) + print(f"config_id: {config_id}, time: {end - start}") + except RuntimeError as e: + print(f"Skip config_id {config_id} due to error: {e}") + continue + + if not valid_configs: + print("No valid config found") + return 0, 0, 0 + + min_time = float('inf') + best_config = None + for i, config_id in enumerate(valid_configs): + if times[i] < min_time: + min_time = times[i] + best_config = config_id + + # 记录当前输入规模的最优配置 + best_config_info[f"M{M}_N{N}_K{K}"] = { + "best_config": best_config, + "time": min_time, + "batch_size": batch_size + } + + # 将最优配置信息保存到文件 + import json + config_file = "best_fp8_configs.json" + try: + with open(config_file, "r") as f: + existing_configs = json.load(f) + except FileNotFoundError: + existing_configs = {} + + existing_configs.update(best_config_info) + with open(config_file, "w") as f: + json.dump(existing_configs, f, indent=4) + + print(f"Best config for batch_size={batch_size}: config_id={best_config}, time={min_time:.6f}s") + + # 使用最佳配置进行基准测试 + try: + ms, min_ms, max_ms = triton.testing.do_bench( + lambda: sgl_scaled_mm_profile(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, torch.bfloat16, bias=None, config_id=best_config), + quantiles=quantiles, + ) + except RuntimeError as e: + print("Error details:", e) + print(f"config_id is not valid {best_config}") + ms, min_ms, max_ms = 1, 1, 1 gbps = lambda ms: (2 * M * N * K + M * N) * a.element_size() * 1e-9 / (ms * 1e-3) return gbps(ms), gbps(max_ms), gbps(min_ms) diff --git a/sgl-kernel/benchmark/bench_int8_res/results.html b/sgl-kernel/benchmark/bench_int8_res/results.html new file mode 100644 index 000000000000..f8f21993bfa1 --- /dev/null +++ b/sgl-kernel/benchmark/bench_int8_res/results.html @@ -0,0 +1,3 @@ + + + diff --git a/sgl-kernel/benchmark/best_fp8_configs.json b/sgl-kernel/benchmark/best_fp8_configs.json new file mode 100644 index 000000000000..cff052cfd253 --- /dev/null +++ b/sgl-kernel/benchmark/best_fp8_configs.json @@ -0,0 +1,42 @@ +{ + "M1_N4096_K8192": { + "best_config": 6, + "time": 6.532669067382812e-05, + "batch_size": 1 + }, + "M16_N4096_K8192": { + "best_config": 6, + "time": 6.699562072753906e-05, + "batch_size": 16 + }, + "M64_N4096_K8192": { + "best_config": 6, + "time": 6.67572021484375e-05, + "batch_size": 64 + }, + "M128_N4096_K8192": { + "best_config": 6, + "time": 6.699562072753906e-05, + "batch_size": 128 + }, + "M256_N4096_K8192": { + "best_config": 6, + "time": 6.842613220214844e-05, + "batch_size": 256 + }, + "M512_N4096_K8192": { + "best_config": 6, + "time": 0.00012421607971191406, + "batch_size": 512 + }, + "M1024_N4096_K8192": { + "best_config": 6, + "time": 0.00023627281188964844, + "batch_size": 1024 + }, + "M2048_N4096_K8192": { + "best_config": 6, + "time": 0.00045871734619140625, + "batch_size": 2048 + } +} \ No newline at end of file diff --git a/sgl-kernel/setup.py b/sgl-kernel/setup.py index 3a60f6ba0a6b..aaa0a53dc899 100644 --- a/sgl-kernel/setup.py +++ b/sgl-kernel/setup.py @@ -2,6 +2,9 @@ from setuptools import setup from torch.utils.cpp_extension import BuildExtension, CUDAExtension +import os +import sys +import multiprocessing root = 
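The sglang-fp8-profile sweep above times each configuration with a single synchronized call via time.time() (its Chinese comments mark where the best configuration per input size is recorded and written to best_fp8_configs.json). A single timed call is noisy and includes one-off launch effects, so a warmed-up, averaged measurement usually separates the configurations more reliably. A hedged alternative using CUDA events, with the same sgl_kernel import the benchmark already uses:

import torch
from sgl_kernel import fp8_scaled_mm_profile as sgl_scaled_mm_profile

def time_config(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, config_id, iters=20):
    # one warmup call, then an averaged CUDA-event measurement in milliseconds
    sgl_scaled_mm_profile(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8,
                          torch.bfloat16, bias=None, config_id=config_id)
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    torch.cuda.synchronize()
    start.record()
    for _ in range(iters):
        sgl_scaled_mm_profile(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8,
                              torch.bfloat16, bias=None, config_id=config_id)
    end.record()
    torch.cuda.synchronize()
    return start.elapsed_time(end) / iters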
Path(__file__).parent.resolve() @@ -23,19 +26,32 @@ def update_wheel_platform_tag(): cutlass = root / "3rdparty" / "cutlass" +nlohmann = root / "3rdparty" / "nlohmann" + include_dirs = [ cutlass.resolve() / "include", cutlass.resolve() / "tools" / "util" / "include", root / "src" / "sgl-kernel" / "csrc", + nlohmann.resolve(), ] + +# nvcc_flags = [ +# "-O3", +# "-Xcompiler", +# "-fPIC", +# "-gencode=arch=compute_75,code=sm_75", +# "-gencode=arch=compute_80,code=sm_80", +# "-gencode=arch=compute_89,code=sm_89", +# "-gencode=arch=compute_90,code=sm_90", +# "-U__CUDA_NO_HALF_OPERATORS__", +# "-U__CUDA_NO_HALF2_OPERATORS__", +# ] nvcc_flags = [ "-O3", "-Xcompiler", "-fPIC", - "-gencode=arch=compute_75,code=sm_75", - "-gencode=arch=compute_80,code=sm_80", + # 只保留需要的架构 "-gencode=arch=compute_89,code=sm_89", - "-gencode=arch=compute_90,code=sm_90", "-U__CUDA_NO_HALF_OPERATORS__", "-U__CUDA_NO_HALF2_OPERATORS__", ] @@ -49,7 +65,7 @@ def update_wheel_platform_tag(): "src/sgl-kernel/csrc/trt_reduce_internal.cu", "src/sgl-kernel/csrc/trt_reduce_kernel.cu", "src/sgl-kernel/csrc/moe_align_kernel.cu", - "src/sgl-kernel/csrc/int8_gemm_kernel.cu", + # "src/sgl-kernel/csrc/int8_gemm_kernel.cu", "src/sgl-kernel/csrc/fp8_gemm_kernel.cu", "src/sgl-kernel/csrc/sgl_kernel_ops.cu", ], @@ -63,6 +79,20 @@ def update_wheel_platform_tag(): ), ] +def set_parallel_jobs(): + if sys.platform == 'win32': + num_cores = int(os.environ.get('NUMBER_OF_PROCESSORS', 4)) + else: + num_cores = len(os.sched_getaffinity(0)) if hasattr(os, 'sched_getaffinity') else os.cpu_count() + + # 限制并行度为核心数的1/4或更少 + num_jobs = max(1, num_cores // 2) + os.environ['MAX_JOBS'] = str(num_jobs) + + # 设置CUDA编译的并行任务数 + os.environ['CUDA_NVCC_THREADS'] = str(num_jobs) + return num_jobs +set_parallel_jobs() setup( name="sgl-kernel", version=get_version(), diff --git a/sgl-kernel/src/sgl-kernel/__init__.py b/sgl-kernel/src/sgl-kernel/__init__.py index 2a4a2bd51771..06894c3358ef 100644 --- a/sgl-kernel/src/sgl-kernel/__init__.py +++ b/sgl-kernel/src/sgl-kernel/__init__.py @@ -4,6 +4,7 @@ init_custom_reduce, int8_scaled_mm, fp8_scaled_mm, + fp8_scaled_mm_profile, moe_align_block_size, ) @@ -14,4 +15,5 @@ "custom_reduce", "int8_scaled_mm", "fp8_scaled_mm", + "fp8_scaled_mm_profile", ] diff --git a/sgl-kernel/src/sgl-kernel/csrc/fp8_gemm_kernel.cu b/sgl-kernel/src/sgl-kernel/csrc/fp8_gemm_kernel.cu index ef88110e9258..914d1cb4df83 100644 --- a/sgl-kernel/src/sgl-kernel/csrc/fp8_gemm_kernel.cu +++ b/sgl-kernel/src/sgl-kernel/csrc/fp8_gemm_kernel.cu @@ -33,7 +33,10 @@ using namespace cute; template + typename WarpShape, int Stages, bool WithBias, + typename FP8MathOperator = cutlass::arch::OpMultiplyAdd, + template typename EpilogueVisitor = cutlass::epilogue::threadblock::Sm80EVT, + typename ThreadblockSwizzle = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>> struct DeviceGemmFp8RowwiseSm89 { static_assert(std::is_same_v, "ElementType must be FP8(e4m3)"); @@ -97,8 +100,8 @@ struct DeviceGemmFp8RowwiseSm89 using GemmKernel = typename cutlass::gemm::kernel::DefaultGemmWithVisitor::GemmKernel; + WarpShape, InstructionShape, EpilogueOp, ThreadblockSwizzle, + Stages, FP8MathOperator, EVTEpilogueStages>::GemmKernel; using Gemm = cutlass::gemm::device::GemmUniversalAdapter; }; @@ -509,16 +512,16 @@ void sm90_dispatch_shape(torch::Tensor& out, const torch::Tensor& a, const torch uint32_t const mp2 = std::max(static_cast(64), next_pow_2(m)); // next power of 2 - if (mp2 <= 64) { - // m in [1, 64] - return sm90_dispatch_bias, Shape<_1, _8, _1>>(out, a, b, 
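This commit narrows the compiled -gencode list to sm_89 and caps build parallelism via MAX_JOBS and CUDA_NVCC_THREADS (the Chinese comments in setup.py say, roughly, "keep only the required architecture" and "limit parallelism to a quarter of the cores or fewer", although the code divides the core count by two). Since the FP8 paths also check the SM version at runtime, a small illustrative guard, not part of the patch, can surface a hardware mismatch before a kernel dispatch error does:

import torch

def assert_fp8_capable(profile: bool = False) -> int:
    major, minor = torch.cuda.get_device_capability()
    sm = major * 10 + minor
    if profile and sm != 89:
        raise RuntimeError(f"fp8_scaled_mm_profile only dispatches on SM 8.9 in this patch, got SM {sm}")
    if not profile and sm != 89 and sm < 90:
        raise RuntimeError(f"fp8_scaled_mm requires SM 8.9 or SM 9.0+, got SM {sm}")
    return sm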
scales_a, scales_b, bias); - } else if (mp2 <= 128) { - // m in (64, 128] - return sm90_dispatch_bias, Shape<_2, _1, _1>>(out, a, b, scales_a, scales_b, bias); - } else { - // m in (128, inf) - return sm90_dispatch_bias, Shape<_2, _1, _1>>(out, a, b, scales_a, scales_b, bias); - } + // if (mp2 <= 64) { + // // m in [1, 64] + // return sm90_dispatch_bias, Shape<_1, _8, _1>>(out, a, b, scales_a, scales_b, bias); + // } else if (mp2 <= 128) { + // // m in (64, 128] + // return sm90_dispatch_bias, Shape<_2, _1, _1>>(out, a, b, scales_a, scales_b, bias); + // } else { + // // m in (128, inf) + // return sm90_dispatch_bias, Shape<_2, _1, _1>>(out, a, b, scales_a, scales_b, bias); + // } } torch::Tensor fp8_scaled_mm(const torch::Tensor& mat_a, const torch::Tensor& mat_b, const torch::Tensor& scales_a, @@ -574,3 +577,133 @@ torch::Tensor fp8_scaled_mm(const torch::Tensor& mat_a, const torch::Tensor& mat return out; } + + +#define DISPATCH_FP8_GEMM_CONFIG(TB_M, TB_N, TB_K, WP_M, WP_N, WP_K, STAGES) \ + sm89_dispatch_bias, \ + cutlass::gemm::GemmShape, STAGES>(out, mat_a, mat_b, scales_a, scales_b, bias) +// 定义一个宏来生成一组配置的所有stages +#define DISPATCH_FP8_GEMM_GROUP(GROUP_ID, CTA_M, CTA_N, CTA_K, WARP_M, WARP_N, WARP_K, BASE_CASE) \ + case BASE_CASE: DISPATCH_FP8_GEMM_CONFIG(CTA_M, CTA_N, CTA_K, WARP_M, WARP_N, WARP_K, 2); break; \ + case BASE_CASE + 1: DISPATCH_FP8_GEMM_CONFIG(CTA_M, CTA_N, CTA_K, WARP_M, WARP_N, WARP_K, 3); break; \ + case BASE_CASE + 2: DISPATCH_FP8_GEMM_CONFIG(CTA_M, CTA_N, CTA_K, WARP_M, WARP_N, WARP_K, 4); break; \ + case BASE_CASE + 3: DISPATCH_FP8_GEMM_CONFIG(CTA_M, CTA_N, CTA_K, WARP_M, WARP_N, WARP_K, 5); break; \ + case BASE_CASE + 4: DISPATCH_FP8_GEMM_CONFIG(CTA_M, CTA_N, CTA_K, WARP_M, WARP_N, WARP_K, 6); break; \ + case BASE_CASE + 5: DISPATCH_FP8_GEMM_CONFIG(CTA_M, CTA_N, CTA_K, WARP_M, WARP_N, WARP_K, 7); break; + +template +void sm89_dispatch_shape_profile(torch::Tensor& out, const torch::Tensor& mat_a, const torch::Tensor& mat_b, + const torch::Tensor& scales_a, const torch::Tensor& scales_b, + const c10::optional& bias, + int config_id) { + switch(config_id) { + case 1: + DISPATCH_FP8_GEMM_CONFIG(32, 64, 128, 16, 64, 64, 5); + case 2: + DISPATCH_FP8_GEMM_CONFIG(16, 64, 128, 16, 64, 64, 5); + case 3: + DISPATCH_FP8_GEMM_CONFIG(64, 64, 128, 32, 64, 64, 5); + case 4: + DISPATCH_FP8_GEMM_CONFIG(64, 128, 64, 32, 64, 64, 5); + case 5: + DISPATCH_FP8_GEMM_CONFIG(128, 128, 64, 64, 32, 64, 2); + case 6: + DISPATCH_FP8_GEMM_CONFIG(64, 128, 64, 32, 64, 64, 6); + // // Group 1: CtaShape32x128x64_WarpShape32x32x64 + // DISPATCH_FP8_GEMM_GROUP(1, 32, 128, 64, 32, 32, 64, 1) + + // // Group 2: CtaShape64x128x64_WarpShape32x64x64 + // DISPATCH_FP8_GEMM_GROUP(2, 64, 128, 64, 32, 64, 64, 7) + + // // Group 3: CtaShape64x64x128_WarpShape32x64x64 + // DISPATCH_FP8_GEMM_GROUP(3, 64, 64, 128, 32, 64, 64, 13) + + // // Group 4: CtaShape64x128x64_WarpShape64x32x64 + // DISPATCH_FP8_GEMM_GROUP(4, 64, 128, 64, 64, 32, 64, 19) + + // // Group 5: CtaShape128x64x64_WarpShape64x32x64 + // DISPATCH_FP8_GEMM_GROUP(5, 128, 64, 64, 64, 32, 64, 25) + + // // Group 6: CtaShape128x128x64_WarpShape64x32x64 + // DISPATCH_FP8_GEMM_GROUP(6, 128, 128, 64, 64, 32, 64, 31) + + // // Group 7: CtaShape128x128x64_WarpShape64x64x64 + // DISPATCH_FP8_GEMM_GROUP(7, 128, 128, 64, 64, 64, 64, 37) + + // // Group 8: CtaShape128x128x64_WarpShape128x32x64 + // DISPATCH_FP8_GEMM_GROUP(8, 128, 128, 64, 128, 32, 64, 43) + + // // Group 9: CtaShape128x256x64_WarpShape64x64x64 + // DISPATCH_FP8_GEMM_GROUP(9, 128, 256, 64, 64, 
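Two things stand out in the hunk above. First, the body of sm90_dispatch_shape is commented out in this commit, so only the SM89 code paths are exercised. Second, as transcribed here the config_id cases are not separated by break or return statements, unlike the DISPATCH_FP8_GEMM_GROUP macro (whose Chinese comment says it generates all stage variants for one configuration group), so an early exit per case is presumably intended. The six shapes selected by config_id, restated as data for reference:

# (threadblock shape, warp shape, pipeline stages) per config_id in sm89_dispatch_shape_profile
SM89_FP8_PROFILE_CONFIGS = {
    1: dict(cta=(32, 64, 128),  warp=(16, 64, 64), stages=5),
    2: dict(cta=(16, 64, 128),  warp=(16, 64, 64), stages=5),
    3: dict(cta=(64, 64, 128),  warp=(32, 64, 64), stages=5),
    4: dict(cta=(64, 128, 64),  warp=(32, 64, 64), stages=5),
    5: dict(cta=(128, 128, 64), warp=(64, 32, 64), stages=2),
    6: dict(cta=(64, 128, 64),  warp=(32, 64, 64), stages=6),
}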
64, 64, 49) + + // // Group 10: CtaShape256x128x64_WarpShape64x64x64 + // DISPATCH_FP8_GEMM_GROUP(10, 256, 128, 64, 64, 64, 64, 55) + + // // Group 11: CtaShape128x64x128_WarpShape64x32x128 + // DISPATCH_FP8_GEMM_GROUP(11, 128, 64, 128, 64, 32, 128, 61) + + // // Group 12: CtaShape16x256x128_WarpShape16x64x128 + // DISPATCH_FP8_GEMM_GROUP(12, 16, 256, 128, 16, 64, 128, 67) + + // // Group 13: CtaShape16x64x128_WarpShape16x64x64 + // DISPATCH_FP8_GEMM_GROUP(13, 16, 64, 128, 16, 64, 64, 73) + + // // Group 14: CtaShape16x128x64_WarpShape16x64x64 + // DISPATCH_FP8_GEMM_GROUP(14, 16, 128, 64, 16, 64, 64, 79) + + // // Group 15: CtaShape32x64x128_WarpShape16x64x64 + // DISPATCH_FP8_GEMM_GROUP(15, 32, 64, 128, 16, 64, 64, 85) + } +} +torch::Tensor fp8_scaled_mm_profile(const torch::Tensor& mat_a, const torch::Tensor& mat_b, + const torch::Tensor& scales_a, const torch::Tensor& scales_b, + const torch::Dtype& out_dtype, const c10::optional& bias, + int config_id) { + + // 基本检查 + TORCH_CHECK(mat_a.is_cuda(), "mat_a must be a CUDA tensor"); + TORCH_CHECK(mat_b.is_cuda(), "mat_b must be a CUDA tensor"); + TORCH_CHECK(mat_a.dim() == 2, "mat_a must be a 2D tensor"); + TORCH_CHECK(mat_b.dim() == 2, "mat_b must be a 2D tensor"); + TORCH_CHECK(mat_a.stride(1) == 1, "mat_a must be a row major tensor"); + TORCH_CHECK(mat_b.stride(0) == 1, "mat_a must be a column major tensor"); + TORCH_CHECK(mat_a.size(1) == mat_b.size(0), "mat_a and mat_b shapes cannot be multiplied"); + + TORCH_CHECK((mat_a.size(1) * mat_a.element_size()) % 16 == 0, "mat_a must be multiple of 16 bytes for memory alignment"); + TORCH_CHECK((mat_b.size(0) * mat_b.element_size()) % 16 == 0, "mat_b must be multiple of 16 bytes for memory alignment"); + TORCH_CHECK(mat_a.scalar_type() == torch::kFloat8_e4m3fn, "mat_a must be Float8_e4m3fn"); + TORCH_CHECK(mat_b.scalar_type() == torch::kFloat8_e4m3fn, "mat_b must be Float8_e4m3fn"); + TORCH_CHECK(out_dtype == torch::kHalf || out_dtype == torch::kBFloat16, "out_dtype must be Half or BFloat16"); + + // 检查scales + TORCH_CHECK(scales_a.numel() == mat_a.size(0), "size of scales_a is not matched"); + TORCH_CHECK(scales_b.numel() == mat_b.size(1), "size of scales_b is not matched"); + TORCH_CHECK(scales_a.is_contiguous(), "scales_a must be contiguous"); + TORCH_CHECK(scales_b.is_contiguous(), "scales_b must be contiguous"); + TORCH_CHECK(scales_a.scalar_type() == torch::kFloat32, "scales_a must be Float32"); + TORCH_CHECK(scales_b.scalar_type() == torch::kFloat32, "scales_b must be Float32"); + + // 检查bias + if (bias) { + TORCH_CHECK(bias->numel() == mat_b.size(1), "size of bias is not matched"); + TORCH_CHECK(bias->is_contiguous(), "bias must be contiguous"); + TORCH_CHECK(bias->dtype() == out_dtype, "bias dtype must match output dtype"); + } + + torch::Tensor out = torch::empty({mat_a.size(0), mat_b.size(1)}, mat_a.options().dtype(out_dtype)); + TORCH_CHECK((out.size(1) * out.element_size()) % 16 == 0, "out must be multiple of 16 bytes for memory alignment"); + + auto sm_version = getSMVersion(); + + if (sm_version == 89) { + if (out_dtype == torch::kBFloat16) { + sm89_dispatch_shape_profile(out, mat_a, mat_b, scales_a, scales_b, bias, config_id); + } else { + sm89_dispatch_shape_profile(out, mat_a, mat_b, scales_a, scales_b, bias, config_id); + } + } else { + TORCH_CHECK_NOT_IMPLEMENTED(false, "FP8 operations require SM89 GPU architecture"); + } + + return out; +} \ No newline at end of file diff --git a/sgl-kernel/src/sgl-kernel/csrc/sgl_kernel_ops.cu 
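fp8_scaled_mm_profile takes an explicit config_id, and the benchmark stores its per-shape winners in best_fp8_configs.json under keys of the form "M{m}_N{n}_K{k}". The patch does not include a consumer for that file; a hypothetical one (the file path and fallback config below are assumptions) could look like this:

import json
import torch
from sgl_kernel import fp8_scaled_mm_profile

def lookup_best_config(m, n, k, path="best_fp8_configs.json", default=6):
    # hypothetical helper: map a GEMM shape to the profiled best config_id
    try:
        with open(path) as f:
            table = json.load(f)
    except FileNotFoundError:
        return default
    entry = table.get(f"M{m}_N{n}_K{k}")
    return entry["best_config"] if entry else default

# e.g. cfg = lookup_best_config(a_fp8.size(0), b_fp8.size(1), a_fp8.size(1))
#      out = fp8_scaled_mm_profile(a_fp8, b_fp8, scale_a, scale_b, torch.bfloat16, None, cfg)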
b/sgl-kernel/src/sgl-kernel/csrc/sgl_kernel_ops.cu index b12d324cc62b..4673a13271d0 100644 --- a/sgl-kernel/src/sgl-kernel/csrc/sgl_kernel_ops.cu +++ b/sgl-kernel/src/sgl-kernel/csrc/sgl_kernel_ops.cu @@ -13,14 +13,19 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, int64_t b torch::Tensor token_cnts_buffer, torch::Tensor cumsum_buffer); // int8_scaled_mm -torch::Tensor int8_scaled_mm(const torch::Tensor& mat_a, const torch::Tensor& mat_b, const torch::Tensor& scales_a, - const torch::Tensor& scales_b, const torch::Dtype& out_dtype, - const c10::optional& bias); +// torch::Tensor int8_scaled_mm(const torch::Tensor& mat_a, const torch::Tensor& mat_b, const torch::Tensor& scales_a, +// const torch::Tensor& scales_b, const torch::Dtype& out_dtype, +// const c10::optional& bias); torch::Tensor fp8_scaled_mm(const torch::Tensor& mat_a, const torch::Tensor& mat_b, const torch::Tensor& scales_a, const torch::Tensor& scales_b, const torch::Dtype& out_dtype, const c10::optional& bias); +torch::Tensor fp8_scaled_mm_profile(const torch::Tensor& mat_a, const torch::Tensor& mat_b, + const torch::Tensor& scales_a, const torch::Tensor& scales_b, + const torch::Dtype& out_dtype, const c10::optional& bias, + int config_id); + PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { // trt_reduce m.def("init_custom_ar", &init_custom_ar, "init custom allreduce meta (CUDA)"); @@ -29,7 +34,9 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { // moe_align_block_size m.def("moe_align_block_size", &moe_align_block_size, "MOE Align Block Size (CUDA)"); // int8_scaled_mm - m.def("int8_scaled_mm", &int8_scaled_mm, "INT8 scaled matmul (CUDA)"); + // m.def("int8_scaled_mm", &int8_scaled_mm, "INT8 scaled matmul (CUDA)"); // fp8_scaled_mm m.def("fp8_scaled_mm", &fp8_scaled_mm, "FP8 scaled matmul (CUDA)"); + // fp8_scaled_mm_profile + m.def("fp8_scaled_mm_profile", &fp8_scaled_mm_profile, "FP8 scaled matmul profile (CUDA)"); } diff --git a/sgl-kernel/src/sgl-kernel/ops/__init__.py b/sgl-kernel/src/sgl-kernel/ops/__init__.py index f339997b027f..8b36c1738cde 100644 --- a/sgl-kernel/src/sgl-kernel/ops/__init__.py +++ b/sgl-kernel/src/sgl-kernel/ops/__init__.py @@ -1,10 +1,10 @@ from sgl_kernel.ops._kernels import all_reduce as _all_reduce from sgl_kernel.ops._kernels import dispose as _dispose from sgl_kernel.ops._kernels import init_custom_ar as _init_custom_ar -from sgl_kernel.ops._kernels import int8_scaled_mm as _int8_scaled_mm +# from sgl_kernel.ops._kernels import int8_scaled_mm as _int8_scaled_mm from sgl_kernel.ops._kernels import fp8_scaled_mm as _fp8_scaled_mm from sgl_kernel.ops._kernels import moe_align_block_size as _moe_align_block_size - +from sgl_kernel.ops._kernels import fp8_scaled_mm_profile as _fp8_scaled_mm_profile def init_custom_reduce(rank_id, num_devices, buffers, barrier_in, barrier_out): return _init_custom_ar(rank_id, num_devices, buffers, barrier_in, barrier_out) @@ -41,7 +41,18 @@ def moe_align_block_size( def int8_scaled_mm(mat_a, mat_b, scales_a, scales_b, out_dtype, bias=None): - return _int8_scaled_mm( + return None + # return _int8_scaled_mm( + # mat_a, + # mat_b, + # scales_a, + # scales_b, + # out_dtype, + # bias, + # ) + +def fp8_scaled_mm(mat_a, mat_b, scales_a, scales_b, out_dtype, bias=None): + return _fp8_scaled_mm( mat_a, mat_b, scales_a, @@ -50,12 +61,13 @@ def int8_scaled_mm(mat_a, mat_b, scales_a, scales_b, out_dtype, bias=None): bias, ) -def fp8_scaled_mm(mat_a, mat_b, scales_a, scales_b, out_dtype, bias=None): - return _fp8_scaled_mm( +def fp8_scaled_mm_profile(mat_a, 
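With the binding changes above, the extension's Python surface after this commit exposes fp8_scaled_mm and fp8_scaled_mm_profile, while int8_scaled_mm is kept as a name but stubbed to return None and its kernel source is no longer compiled. A minimal import-level sanity check, illustrative only:

import torch
import sgl_kernel

assert callable(sgl_kernel.fp8_scaled_mm)
assert callable(sgl_kernel.fp8_scaled_mm_profile)
# int8_scaled_mm is a temporary stub in this commit and returns None without touching its arguments
assert sgl_kernel.int8_scaled_mm(None, None, None, None, torch.bfloat16) is None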
mat_b, scales_a, scales_b, out_dtype, bias, config_id): + return _fp8_scaled_mm_profile( mat_a, mat_b, scales_a, scales_b, out_dtype, bias, + config_id, ) From 349795099e106bb9155a62f373f1e7c2bd85bab3 Mon Sep 17 00:00:00 2001 From: yych0745 <1398089567@qq.com> Date: Mon, 13 Jan 2025 19:38:03 +0800 Subject: [PATCH 059/248] add config_profile for sm_89 --- sgl-kernel/3rdparty/nlohmann/json.hpp | 25420 ++++++++++++++++ sgl-kernel/3rdparty/nlohmann/json_fwd.hpp | 187 + sgl-kernel/benchmark/89_fp8_bf16.json | 10 + ...fp8_bf16_256\350\247\243\345\206\263.json" | 10 + ...4096,device=NVIDIA_L40,dtype=bfloat16.json | 11 + ...=4096,device=NVIDIA_L40,dtype=float16.json | 11 + sgl-kernel/benchmark/bench_fp8_gemm.py | 142 +- .../benchmark/bench_fp8_res/results.html | 1 + sgl-kernel/benchmark/best_fp8_configs.json | 42 - sgl-kernel/outp | 0 sgl-kernel/setup.py | 13 +- .../src/sgl-kernel/csrc/fp8_gemm_kernel.cu | 320 +- .../src/sgl-kernel/csrc/sgl_kernel_ops.cu | 7 +- sgl-kernel/src/sgl-kernel/csrc/utils.hpp | 99 +- sgl-kernel/src/sgl-kernel/ops/__init__.py | 8 +- 15 files changed, 26041 insertions(+), 240 deletions(-) create mode 100644 sgl-kernel/3rdparty/nlohmann/json.hpp create mode 100644 sgl-kernel/3rdparty/nlohmann/json_fwd.hpp create mode 100644 sgl-kernel/benchmark/89_fp8_bf16.json create mode 100644 "sgl-kernel/benchmark/89_fp8_bf16_256\350\247\243\345\206\263.json" create mode 100644 sgl-kernel/benchmark/N=8192,K=4096,device=NVIDIA_L40,dtype=bfloat16.json create mode 100644 sgl-kernel/benchmark/N=8192,K=4096,device=NVIDIA_L40,dtype=float16.json create mode 100644 sgl-kernel/benchmark/bench_fp8_res/results.html delete mode 100644 sgl-kernel/benchmark/best_fp8_configs.json create mode 100644 sgl-kernel/outp diff --git a/sgl-kernel/3rdparty/nlohmann/json.hpp b/sgl-kernel/3rdparty/nlohmann/json.hpp new file mode 100644 index 000000000000..9be8b892e3dc --- /dev/null +++ b/sgl-kernel/3rdparty/nlohmann/json.hpp @@ -0,0 +1,25420 @@ +// __ _____ _____ _____ +// __| | __| | | | JSON for Modern C++ +// | | |__ | | | | | | version 3.11.3 +// |_____|_____|_____|_|___| https://github.com/nlohmann/json +// +// SPDX-FileCopyrightText: 2013 - 2024 Niels Lohmann +// SPDX-License-Identifier: MIT + +/****************************************************************************\ + * Note on documentation: The source files contain links to the online * + * documentation of the public API at https://json.nlohmann.me. This URL * + * contains the most recent documentation and should also be applicable to * + * previous versions; documentation for deprecated functions is not * + * removed, but marked deprecated. See "Generate documentation" section in * + * file docs/README.md. 
* +\****************************************************************************/ + +#ifndef INCLUDE_NLOHMANN_JSON_HPP_ +#define INCLUDE_NLOHMANN_JSON_HPP_ + +#include // all_of, find, for_each +#include // nullptr_t, ptrdiff_t, size_t +#include // hash, less +#include // initializer_list +#ifndef JSON_NO_IO + #include // istream, ostream +#endif // JSON_NO_IO +#include // random_access_iterator_tag +#include // unique_ptr +#include // string, stoi, to_string +#include // declval, forward, move, pair, swap +#include // vector + +// #include +// __ _____ _____ _____ +// __| | __| | | | JSON for Modern C++ +// | | |__ | | | | | | version 3.11.3 +// |_____|_____|_____|_|___| https://github.com/nlohmann/json +// +// SPDX-FileCopyrightText: 2013 - 2024 Niels Lohmann +// SPDX-License-Identifier: MIT + + + +#include + +// #include +// __ _____ _____ _____ +// __| | __| | | | JSON for Modern C++ +// | | |__ | | | | | | version 3.11.3 +// |_____|_____|_____|_|___| https://github.com/nlohmann/json +// +// SPDX-FileCopyrightText: 2013 - 2024 Niels Lohmann +// SPDX-License-Identifier: MIT + + + +// This file contains all macro definitions affecting or depending on the ABI + +#ifndef JSON_SKIP_LIBRARY_VERSION_CHECK + #if defined(NLOHMANN_JSON_VERSION_MAJOR) && defined(NLOHMANN_JSON_VERSION_MINOR) && defined(NLOHMANN_JSON_VERSION_PATCH) + #if NLOHMANN_JSON_VERSION_MAJOR != 3 || NLOHMANN_JSON_VERSION_MINOR != 11 || NLOHMANN_JSON_VERSION_PATCH != 3 + #warning "Already included a different version of the library!" + #endif + #endif +#endif + +#define NLOHMANN_JSON_VERSION_MAJOR 3 // NOLINT(modernize-macro-to-enum) +#define NLOHMANN_JSON_VERSION_MINOR 11 // NOLINT(modernize-macro-to-enum) +#define NLOHMANN_JSON_VERSION_PATCH 3 // NOLINT(modernize-macro-to-enum) + +#ifndef JSON_DIAGNOSTICS + #define JSON_DIAGNOSTICS 0 +#endif + +#ifndef JSON_DIAGNOSTIC_POSITIONS + #define JSON_DIAGNOSTIC_POSITIONS 0 +#endif + +#ifndef JSON_USE_LEGACY_DISCARDED_VALUE_COMPARISON + #define JSON_USE_LEGACY_DISCARDED_VALUE_COMPARISON 0 +#endif + +#if JSON_DIAGNOSTICS + #define NLOHMANN_JSON_ABI_TAG_DIAGNOSTICS _diag +#else + #define NLOHMANN_JSON_ABI_TAG_DIAGNOSTICS +#endif + +#if JSON_DIAGNOSTIC_POSITIONS + #define NLOHMANN_JSON_ABI_TAG_DIAGNOSTIC_POSITIONS _dp +#else + #define NLOHMANN_JSON_ABI_TAG_DIAGNOSTIC_POSITIONS +#endif + +#if JSON_USE_LEGACY_DISCARDED_VALUE_COMPARISON + #define NLOHMANN_JSON_ABI_TAG_LEGACY_DISCARDED_VALUE_COMPARISON _ldvcmp +#else + #define NLOHMANN_JSON_ABI_TAG_LEGACY_DISCARDED_VALUE_COMPARISON +#endif + +#ifndef NLOHMANN_JSON_NAMESPACE_NO_VERSION + #define NLOHMANN_JSON_NAMESPACE_NO_VERSION 0 +#endif + +// Construct the namespace ABI tags component +#define NLOHMANN_JSON_ABI_TAGS_CONCAT_EX(a, b, c) json_abi ## a ## b ## c +#define NLOHMANN_JSON_ABI_TAGS_CONCAT(a, b, c) \ + NLOHMANN_JSON_ABI_TAGS_CONCAT_EX(a, b, c) + +#define NLOHMANN_JSON_ABI_TAGS \ + NLOHMANN_JSON_ABI_TAGS_CONCAT( \ + NLOHMANN_JSON_ABI_TAG_DIAGNOSTICS, \ + NLOHMANN_JSON_ABI_TAG_LEGACY_DISCARDED_VALUE_COMPARISON, \ + NLOHMANN_JSON_ABI_TAG_DIAGNOSTIC_POSITIONS) + +// Construct the namespace version component +#define NLOHMANN_JSON_NAMESPACE_VERSION_CONCAT_EX(major, minor, patch) \ + _v ## major ## _ ## minor ## _ ## patch +#define NLOHMANN_JSON_NAMESPACE_VERSION_CONCAT(major, minor, patch) \ + NLOHMANN_JSON_NAMESPACE_VERSION_CONCAT_EX(major, minor, patch) + +#if NLOHMANN_JSON_NAMESPACE_NO_VERSION +#define NLOHMANN_JSON_NAMESPACE_VERSION +#else +#define NLOHMANN_JSON_NAMESPACE_VERSION \ + 
NLOHMANN_JSON_NAMESPACE_VERSION_CONCAT(NLOHMANN_JSON_VERSION_MAJOR, \ + NLOHMANN_JSON_VERSION_MINOR, \ + NLOHMANN_JSON_VERSION_PATCH) +#endif + +// Combine namespace components +#define NLOHMANN_JSON_NAMESPACE_CONCAT_EX(a, b) a ## b +#define NLOHMANN_JSON_NAMESPACE_CONCAT(a, b) \ + NLOHMANN_JSON_NAMESPACE_CONCAT_EX(a, b) + +#ifndef NLOHMANN_JSON_NAMESPACE +#define NLOHMANN_JSON_NAMESPACE \ + nlohmann::NLOHMANN_JSON_NAMESPACE_CONCAT( \ + NLOHMANN_JSON_ABI_TAGS, \ + NLOHMANN_JSON_NAMESPACE_VERSION) +#endif + +#ifndef NLOHMANN_JSON_NAMESPACE_BEGIN +#define NLOHMANN_JSON_NAMESPACE_BEGIN \ + namespace nlohmann \ + { \ + inline namespace NLOHMANN_JSON_NAMESPACE_CONCAT( \ + NLOHMANN_JSON_ABI_TAGS, \ + NLOHMANN_JSON_NAMESPACE_VERSION) \ + { +#endif + +#ifndef NLOHMANN_JSON_NAMESPACE_END +#define NLOHMANN_JSON_NAMESPACE_END \ + } /* namespace (inline namespace) NOLINT(readability/namespace) */ \ + } // namespace nlohmann +#endif + +// #include +// __ _____ _____ _____ +// __| | __| | | | JSON for Modern C++ +// | | |__ | | | | | | version 3.11.3 +// |_____|_____|_____|_|___| https://github.com/nlohmann/json +// +// SPDX-FileCopyrightText: 2013 - 2024 Niels Lohmann +// SPDX-License-Identifier: MIT + + + +#include // transform +#include // array +#include // forward_list +#include // inserter, front_inserter, end +#include // map +#ifdef JSON_HAS_CPP_17 + #include // optional +#endif +#include // string +#include // tuple, make_tuple +#include // is_arithmetic, is_same, is_enum, underlying_type, is_convertible +#include // unordered_map +#include // pair, declval +#include // valarray + +// #include +// __ _____ _____ _____ +// __| | __| | | | JSON for Modern C++ +// | | |__ | | | | | | version 3.11.3 +// |_____|_____|_____|_|___| https://github.com/nlohmann/json +// +// SPDX-FileCopyrightText: 2013 - 2024 Niels Lohmann +// SPDX-License-Identifier: MIT + + + +#include // nullptr_t +#include // exception +#if JSON_DIAGNOSTICS + #include // accumulate +#endif +#include // runtime_error +#include // to_string +#include // vector + +// #include +// __ _____ _____ _____ +// __| | __| | | | JSON for Modern C++ +// | | |__ | | | | | | version 3.11.3 +// |_____|_____|_____|_|___| https://github.com/nlohmann/json +// +// SPDX-FileCopyrightText: 2013 - 2024 Niels Lohmann +// SPDX-License-Identifier: MIT + + + +#include // array +#include // size_t +#include // uint8_t +#include // string + +// #include +// __ _____ _____ _____ +// __| | __| | | | JSON for Modern C++ +// | | |__ | | | | | | version 3.11.3 +// |_____|_____|_____|_|___| https://github.com/nlohmann/json +// +// SPDX-FileCopyrightText: 2013 - 2024 Niels Lohmann +// SPDX-License-Identifier: MIT + + + +#include // declval, pair +// #include +// __ _____ _____ _____ +// __| | __| | | | JSON for Modern C++ +// | | |__ | | | | | | version 3.11.3 +// |_____|_____|_____|_|___| https://github.com/nlohmann/json +// +// SPDX-FileCopyrightText: 2013 - 2024 Niels Lohmann +// SPDX-License-Identifier: MIT + + + +#include + +// #include +// __ _____ _____ _____ +// __| | __| | | | JSON for Modern C++ +// | | |__ | | | | | | version 3.11.3 +// |_____|_____|_____|_|___| https://github.com/nlohmann/json +// +// SPDX-FileCopyrightText: 2013 - 2024 Niels Lohmann +// SPDX-License-Identifier: MIT + + + +// #include + + +NLOHMANN_JSON_NAMESPACE_BEGIN +namespace detail +{ + +template struct make_void +{ + using type = void; +}; +template using void_t = typename make_void::type; + +} // namespace detail +NLOHMANN_JSON_NAMESPACE_END + + +NLOHMANN_JSON_NAMESPACE_BEGIN 
+namespace detail +{ + +// https://en.cppreference.com/w/cpp/experimental/is_detected +struct nonesuch +{ + nonesuch() = delete; + ~nonesuch() = delete; + nonesuch(nonesuch const&) = delete; + nonesuch(nonesuch const&&) = delete; + void operator=(nonesuch const&) = delete; + void operator=(nonesuch&&) = delete; +}; + +template class Op, + class... Args> +struct detector +{ + using value_t = std::false_type; + using type = Default; +}; + +template class Op, class... Args> +struct detector>, Op, Args...> +{ + using value_t = std::true_type; + using type = Op; +}; + +template class Op, class... Args> +using is_detected = typename detector::value_t; + +template class Op, class... Args> +struct is_detected_lazy : is_detected { }; + +template class Op, class... Args> +using detected_t = typename detector::type; + +template class Op, class... Args> +using detected_or = detector; + +template class Op, class... Args> +using detected_or_t = typename detected_or::type; + +template class Op, class... Args> +using is_detected_exact = std::is_same>; + +template class Op, class... Args> +using is_detected_convertible = + std::is_convertible, To>; + +} // namespace detail +NLOHMANN_JSON_NAMESPACE_END + +// #include + + +// __ _____ _____ _____ +// __| | __| | | | JSON for Modern C++ +// | | |__ | | | | | | version 3.11.3 +// |_____|_____|_____|_|___| https://github.com/nlohmann/json +// +// SPDX-FileCopyrightText: 2013 - 2024 Niels Lohmann +// SPDX-FileCopyrightText: 2016 - 2021 Evan Nemerson +// SPDX-License-Identifier: MIT + +/* Hedley - https://nemequ.github.io/hedley + * Created by Evan Nemerson + */ + +#if !defined(JSON_HEDLEY_VERSION) || (JSON_HEDLEY_VERSION < 15) +#if defined(JSON_HEDLEY_VERSION) + #undef JSON_HEDLEY_VERSION +#endif +#define JSON_HEDLEY_VERSION 15 + +#if defined(JSON_HEDLEY_STRINGIFY_EX) + #undef JSON_HEDLEY_STRINGIFY_EX +#endif +#define JSON_HEDLEY_STRINGIFY_EX(x) #x + +#if defined(JSON_HEDLEY_STRINGIFY) + #undef JSON_HEDLEY_STRINGIFY +#endif +#define JSON_HEDLEY_STRINGIFY(x) JSON_HEDLEY_STRINGIFY_EX(x) + +#if defined(JSON_HEDLEY_CONCAT_EX) + #undef JSON_HEDLEY_CONCAT_EX +#endif +#define JSON_HEDLEY_CONCAT_EX(a,b) a##b + +#if defined(JSON_HEDLEY_CONCAT) + #undef JSON_HEDLEY_CONCAT +#endif +#define JSON_HEDLEY_CONCAT(a,b) JSON_HEDLEY_CONCAT_EX(a,b) + +#if defined(JSON_HEDLEY_CONCAT3_EX) + #undef JSON_HEDLEY_CONCAT3_EX +#endif +#define JSON_HEDLEY_CONCAT3_EX(a,b,c) a##b##c + +#if defined(JSON_HEDLEY_CONCAT3) + #undef JSON_HEDLEY_CONCAT3 +#endif +#define JSON_HEDLEY_CONCAT3(a,b,c) JSON_HEDLEY_CONCAT3_EX(a,b,c) + +#if defined(JSON_HEDLEY_VERSION_ENCODE) + #undef JSON_HEDLEY_VERSION_ENCODE +#endif +#define JSON_HEDLEY_VERSION_ENCODE(major,minor,revision) (((major) * 1000000) + ((minor) * 1000) + (revision)) + +#if defined(JSON_HEDLEY_VERSION_DECODE_MAJOR) + #undef JSON_HEDLEY_VERSION_DECODE_MAJOR +#endif +#define JSON_HEDLEY_VERSION_DECODE_MAJOR(version) ((version) / 1000000) + +#if defined(JSON_HEDLEY_VERSION_DECODE_MINOR) + #undef JSON_HEDLEY_VERSION_DECODE_MINOR +#endif +#define JSON_HEDLEY_VERSION_DECODE_MINOR(version) (((version) % 1000000) / 1000) + +#if defined(JSON_HEDLEY_VERSION_DECODE_REVISION) + #undef JSON_HEDLEY_VERSION_DECODE_REVISION +#endif +#define JSON_HEDLEY_VERSION_DECODE_REVISION(version) ((version) % 1000) + +#if defined(JSON_HEDLEY_GNUC_VERSION) + #undef JSON_HEDLEY_GNUC_VERSION +#endif +#if defined(__GNUC__) && defined(__GNUC_PATCHLEVEL__) + #define JSON_HEDLEY_GNUC_VERSION JSON_HEDLEY_VERSION_ENCODE(__GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__) +#elif 
defined(__GNUC__) + #define JSON_HEDLEY_GNUC_VERSION JSON_HEDLEY_VERSION_ENCODE(__GNUC__, __GNUC_MINOR__, 0) +#endif + +#if defined(JSON_HEDLEY_GNUC_VERSION_CHECK) + #undef JSON_HEDLEY_GNUC_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_GNUC_VERSION) + #define JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_GNUC_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_MSVC_VERSION) + #undef JSON_HEDLEY_MSVC_VERSION +#endif +#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 140000000) && !defined(__ICL) + #define JSON_HEDLEY_MSVC_VERSION JSON_HEDLEY_VERSION_ENCODE(_MSC_FULL_VER / 10000000, (_MSC_FULL_VER % 10000000) / 100000, (_MSC_FULL_VER % 100000) / 100) +#elif defined(_MSC_FULL_VER) && !defined(__ICL) + #define JSON_HEDLEY_MSVC_VERSION JSON_HEDLEY_VERSION_ENCODE(_MSC_FULL_VER / 1000000, (_MSC_FULL_VER % 1000000) / 10000, (_MSC_FULL_VER % 10000) / 10) +#elif defined(_MSC_VER) && !defined(__ICL) + #define JSON_HEDLEY_MSVC_VERSION JSON_HEDLEY_VERSION_ENCODE(_MSC_VER / 100, _MSC_VER % 100, 0) +#endif + +#if defined(JSON_HEDLEY_MSVC_VERSION_CHECK) + #undef JSON_HEDLEY_MSVC_VERSION_CHECK +#endif +#if !defined(JSON_HEDLEY_MSVC_VERSION) + #define JSON_HEDLEY_MSVC_VERSION_CHECK(major,minor,patch) (0) +#elif defined(_MSC_VER) && (_MSC_VER >= 1400) + #define JSON_HEDLEY_MSVC_VERSION_CHECK(major,minor,patch) (_MSC_FULL_VER >= ((major * 10000000) + (minor * 100000) + (patch))) +#elif defined(_MSC_VER) && (_MSC_VER >= 1200) + #define JSON_HEDLEY_MSVC_VERSION_CHECK(major,minor,patch) (_MSC_FULL_VER >= ((major * 1000000) + (minor * 10000) + (patch))) +#else + #define JSON_HEDLEY_MSVC_VERSION_CHECK(major,minor,patch) (_MSC_VER >= ((major * 100) + (minor))) +#endif + +#if defined(JSON_HEDLEY_INTEL_VERSION) + #undef JSON_HEDLEY_INTEL_VERSION +#endif +#if defined(__INTEL_COMPILER) && defined(__INTEL_COMPILER_UPDATE) && !defined(__ICL) + #define JSON_HEDLEY_INTEL_VERSION JSON_HEDLEY_VERSION_ENCODE(__INTEL_COMPILER / 100, __INTEL_COMPILER % 100, __INTEL_COMPILER_UPDATE) +#elif defined(__INTEL_COMPILER) && !defined(__ICL) + #define JSON_HEDLEY_INTEL_VERSION JSON_HEDLEY_VERSION_ENCODE(__INTEL_COMPILER / 100, __INTEL_COMPILER % 100, 0) +#endif + +#if defined(JSON_HEDLEY_INTEL_VERSION_CHECK) + #undef JSON_HEDLEY_INTEL_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_INTEL_VERSION) + #define JSON_HEDLEY_INTEL_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_INTEL_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_INTEL_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_INTEL_CL_VERSION) + #undef JSON_HEDLEY_INTEL_CL_VERSION +#endif +#if defined(__INTEL_COMPILER) && defined(__INTEL_COMPILER_UPDATE) && defined(__ICL) + #define JSON_HEDLEY_INTEL_CL_VERSION JSON_HEDLEY_VERSION_ENCODE(__INTEL_COMPILER, __INTEL_COMPILER_UPDATE, 0) +#endif + +#if defined(JSON_HEDLEY_INTEL_CL_VERSION_CHECK) + #undef JSON_HEDLEY_INTEL_CL_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_INTEL_CL_VERSION) + #define JSON_HEDLEY_INTEL_CL_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_INTEL_CL_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_INTEL_CL_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_PGI_VERSION) + #undef JSON_HEDLEY_PGI_VERSION +#endif +#if defined(__PGI) && defined(__PGIC__) && defined(__PGIC_MINOR__) && defined(__PGIC_PATCHLEVEL__) + #define JSON_HEDLEY_PGI_VERSION 
JSON_HEDLEY_VERSION_ENCODE(__PGIC__, __PGIC_MINOR__, __PGIC_PATCHLEVEL__) +#endif + +#if defined(JSON_HEDLEY_PGI_VERSION_CHECK) + #undef JSON_HEDLEY_PGI_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_PGI_VERSION) + #define JSON_HEDLEY_PGI_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_PGI_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_PGI_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_SUNPRO_VERSION) + #undef JSON_HEDLEY_SUNPRO_VERSION +#endif +#if defined(__SUNPRO_C) && (__SUNPRO_C > 0x1000) + #define JSON_HEDLEY_SUNPRO_VERSION JSON_HEDLEY_VERSION_ENCODE((((__SUNPRO_C >> 16) & 0xf) * 10) + ((__SUNPRO_C >> 12) & 0xf), (((__SUNPRO_C >> 8) & 0xf) * 10) + ((__SUNPRO_C >> 4) & 0xf), (__SUNPRO_C & 0xf) * 10) +#elif defined(__SUNPRO_C) + #define JSON_HEDLEY_SUNPRO_VERSION JSON_HEDLEY_VERSION_ENCODE((__SUNPRO_C >> 8) & 0xf, (__SUNPRO_C >> 4) & 0xf, (__SUNPRO_C) & 0xf) +#elif defined(__SUNPRO_CC) && (__SUNPRO_CC > 0x1000) + #define JSON_HEDLEY_SUNPRO_VERSION JSON_HEDLEY_VERSION_ENCODE((((__SUNPRO_CC >> 16) & 0xf) * 10) + ((__SUNPRO_CC >> 12) & 0xf), (((__SUNPRO_CC >> 8) & 0xf) * 10) + ((__SUNPRO_CC >> 4) & 0xf), (__SUNPRO_CC & 0xf) * 10) +#elif defined(__SUNPRO_CC) + #define JSON_HEDLEY_SUNPRO_VERSION JSON_HEDLEY_VERSION_ENCODE((__SUNPRO_CC >> 8) & 0xf, (__SUNPRO_CC >> 4) & 0xf, (__SUNPRO_CC) & 0xf) +#endif + +#if defined(JSON_HEDLEY_SUNPRO_VERSION_CHECK) + #undef JSON_HEDLEY_SUNPRO_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_SUNPRO_VERSION) + #define JSON_HEDLEY_SUNPRO_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_SUNPRO_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_SUNPRO_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_EMSCRIPTEN_VERSION) + #undef JSON_HEDLEY_EMSCRIPTEN_VERSION +#endif +#if defined(__EMSCRIPTEN__) + #define JSON_HEDLEY_EMSCRIPTEN_VERSION JSON_HEDLEY_VERSION_ENCODE(__EMSCRIPTEN_major__, __EMSCRIPTEN_minor__, __EMSCRIPTEN_tiny__) +#endif + +#if defined(JSON_HEDLEY_EMSCRIPTEN_VERSION_CHECK) + #undef JSON_HEDLEY_EMSCRIPTEN_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_EMSCRIPTEN_VERSION) + #define JSON_HEDLEY_EMSCRIPTEN_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_EMSCRIPTEN_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_EMSCRIPTEN_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_ARM_VERSION) + #undef JSON_HEDLEY_ARM_VERSION +#endif +#if defined(__CC_ARM) && defined(__ARMCOMPILER_VERSION) + #define JSON_HEDLEY_ARM_VERSION JSON_HEDLEY_VERSION_ENCODE(__ARMCOMPILER_VERSION / 1000000, (__ARMCOMPILER_VERSION % 1000000) / 10000, (__ARMCOMPILER_VERSION % 10000) / 100) +#elif defined(__CC_ARM) && defined(__ARMCC_VERSION) + #define JSON_HEDLEY_ARM_VERSION JSON_HEDLEY_VERSION_ENCODE(__ARMCC_VERSION / 1000000, (__ARMCC_VERSION % 1000000) / 10000, (__ARMCC_VERSION % 10000) / 100) +#endif + +#if defined(JSON_HEDLEY_ARM_VERSION_CHECK) + #undef JSON_HEDLEY_ARM_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_ARM_VERSION) + #define JSON_HEDLEY_ARM_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_ARM_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_ARM_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_IBM_VERSION) + #undef JSON_HEDLEY_IBM_VERSION +#endif +#if defined(__ibmxl__) + #define JSON_HEDLEY_IBM_VERSION JSON_HEDLEY_VERSION_ENCODE(__ibmxl_version__, __ibmxl_release__, __ibmxl_modification__) +#elif defined(__xlC__) && 
defined(__xlC_ver__) + #define JSON_HEDLEY_IBM_VERSION JSON_HEDLEY_VERSION_ENCODE(__xlC__ >> 8, __xlC__ & 0xff, (__xlC_ver__ >> 8) & 0xff) +#elif defined(__xlC__) + #define JSON_HEDLEY_IBM_VERSION JSON_HEDLEY_VERSION_ENCODE(__xlC__ >> 8, __xlC__ & 0xff, 0) +#endif + +#if defined(JSON_HEDLEY_IBM_VERSION_CHECK) + #undef JSON_HEDLEY_IBM_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_IBM_VERSION) + #define JSON_HEDLEY_IBM_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_IBM_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_IBM_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_TI_VERSION) + #undef JSON_HEDLEY_TI_VERSION +#endif +#if \ + defined(__TI_COMPILER_VERSION__) && \ + ( \ + defined(__TMS470__) || defined(__TI_ARM__) || \ + defined(__MSP430__) || \ + defined(__TMS320C2000__) \ + ) +#if (__TI_COMPILER_VERSION__ >= 16000000) + #define JSON_HEDLEY_TI_VERSION JSON_HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) +#endif +#endif + +#if defined(JSON_HEDLEY_TI_VERSION_CHECK) + #undef JSON_HEDLEY_TI_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_TI_VERSION) + #define JSON_HEDLEY_TI_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TI_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_TI_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_TI_CL2000_VERSION) + #undef JSON_HEDLEY_TI_CL2000_VERSION +#endif +#if defined(__TI_COMPILER_VERSION__) && defined(__TMS320C2000__) + #define JSON_HEDLEY_TI_CL2000_VERSION JSON_HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) +#endif + +#if defined(JSON_HEDLEY_TI_CL2000_VERSION_CHECK) + #undef JSON_HEDLEY_TI_CL2000_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_TI_CL2000_VERSION) + #define JSON_HEDLEY_TI_CL2000_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TI_CL2000_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_TI_CL2000_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_TI_CL430_VERSION) + #undef JSON_HEDLEY_TI_CL430_VERSION +#endif +#if defined(__TI_COMPILER_VERSION__) && defined(__MSP430__) + #define JSON_HEDLEY_TI_CL430_VERSION JSON_HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) +#endif + +#if defined(JSON_HEDLEY_TI_CL430_VERSION_CHECK) + #undef JSON_HEDLEY_TI_CL430_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_TI_CL430_VERSION) + #define JSON_HEDLEY_TI_CL430_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TI_CL430_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_TI_CL430_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_TI_ARMCL_VERSION) + #undef JSON_HEDLEY_TI_ARMCL_VERSION +#endif +#if defined(__TI_COMPILER_VERSION__) && (defined(__TMS470__) || defined(__TI_ARM__)) + #define JSON_HEDLEY_TI_ARMCL_VERSION JSON_HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) +#endif + +#if defined(JSON_HEDLEY_TI_ARMCL_VERSION_CHECK) + #undef JSON_HEDLEY_TI_ARMCL_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_TI_ARMCL_VERSION) + #define JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TI_ARMCL_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define 
JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_TI_CL6X_VERSION) + #undef JSON_HEDLEY_TI_CL6X_VERSION +#endif +#if defined(__TI_COMPILER_VERSION__) && defined(__TMS320C6X__) + #define JSON_HEDLEY_TI_CL6X_VERSION JSON_HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) +#endif + +#if defined(JSON_HEDLEY_TI_CL6X_VERSION_CHECK) + #undef JSON_HEDLEY_TI_CL6X_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_TI_CL6X_VERSION) + #define JSON_HEDLEY_TI_CL6X_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TI_CL6X_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_TI_CL6X_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_TI_CL7X_VERSION) + #undef JSON_HEDLEY_TI_CL7X_VERSION +#endif +#if defined(__TI_COMPILER_VERSION__) && defined(__C7000__) + #define JSON_HEDLEY_TI_CL7X_VERSION JSON_HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) +#endif + +#if defined(JSON_HEDLEY_TI_CL7X_VERSION_CHECK) + #undef JSON_HEDLEY_TI_CL7X_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_TI_CL7X_VERSION) + #define JSON_HEDLEY_TI_CL7X_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TI_CL7X_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_TI_CL7X_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_TI_CLPRU_VERSION) + #undef JSON_HEDLEY_TI_CLPRU_VERSION +#endif +#if defined(__TI_COMPILER_VERSION__) && defined(__PRU__) + #define JSON_HEDLEY_TI_CLPRU_VERSION JSON_HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) +#endif + +#if defined(JSON_HEDLEY_TI_CLPRU_VERSION_CHECK) + #undef JSON_HEDLEY_TI_CLPRU_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_TI_CLPRU_VERSION) + #define JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TI_CLPRU_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_CRAY_VERSION) + #undef JSON_HEDLEY_CRAY_VERSION +#endif +#if defined(_CRAYC) + #if defined(_RELEASE_PATCHLEVEL) + #define JSON_HEDLEY_CRAY_VERSION JSON_HEDLEY_VERSION_ENCODE(_RELEASE_MAJOR, _RELEASE_MINOR, _RELEASE_PATCHLEVEL) + #else + #define JSON_HEDLEY_CRAY_VERSION JSON_HEDLEY_VERSION_ENCODE(_RELEASE_MAJOR, _RELEASE_MINOR, 0) + #endif +#endif + +#if defined(JSON_HEDLEY_CRAY_VERSION_CHECK) + #undef JSON_HEDLEY_CRAY_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_CRAY_VERSION) + #define JSON_HEDLEY_CRAY_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_CRAY_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_CRAY_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_IAR_VERSION) + #undef JSON_HEDLEY_IAR_VERSION +#endif +#if defined(__IAR_SYSTEMS_ICC__) + #if __VER__ > 1000 + #define JSON_HEDLEY_IAR_VERSION JSON_HEDLEY_VERSION_ENCODE((__VER__ / 1000000), ((__VER__ / 1000) % 1000), (__VER__ % 1000)) + #else + #define JSON_HEDLEY_IAR_VERSION JSON_HEDLEY_VERSION_ENCODE(__VER__ / 100, __VER__ % 100, 0) + #endif +#endif + +#if defined(JSON_HEDLEY_IAR_VERSION_CHECK) + #undef JSON_HEDLEY_IAR_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_IAR_VERSION) + #define JSON_HEDLEY_IAR_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_IAR_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, 
minor, patch)) +#else + #define JSON_HEDLEY_IAR_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_TINYC_VERSION) + #undef JSON_HEDLEY_TINYC_VERSION +#endif +#if defined(__TINYC__) + #define JSON_HEDLEY_TINYC_VERSION JSON_HEDLEY_VERSION_ENCODE(__TINYC__ / 1000, (__TINYC__ / 100) % 10, __TINYC__ % 100) +#endif + +#if defined(JSON_HEDLEY_TINYC_VERSION_CHECK) + #undef JSON_HEDLEY_TINYC_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_TINYC_VERSION) + #define JSON_HEDLEY_TINYC_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TINYC_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_TINYC_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_DMC_VERSION) + #undef JSON_HEDLEY_DMC_VERSION +#endif +#if defined(__DMC__) + #define JSON_HEDLEY_DMC_VERSION JSON_HEDLEY_VERSION_ENCODE(__DMC__ >> 8, (__DMC__ >> 4) & 0xf, __DMC__ & 0xf) +#endif + +#if defined(JSON_HEDLEY_DMC_VERSION_CHECK) + #undef JSON_HEDLEY_DMC_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_DMC_VERSION) + #define JSON_HEDLEY_DMC_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_DMC_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_DMC_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_COMPCERT_VERSION) + #undef JSON_HEDLEY_COMPCERT_VERSION +#endif +#if defined(__COMPCERT_VERSION__) + #define JSON_HEDLEY_COMPCERT_VERSION JSON_HEDLEY_VERSION_ENCODE(__COMPCERT_VERSION__ / 10000, (__COMPCERT_VERSION__ / 100) % 100, __COMPCERT_VERSION__ % 100) +#endif + +#if defined(JSON_HEDLEY_COMPCERT_VERSION_CHECK) + #undef JSON_HEDLEY_COMPCERT_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_COMPCERT_VERSION) + #define JSON_HEDLEY_COMPCERT_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_COMPCERT_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_COMPCERT_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_PELLES_VERSION) + #undef JSON_HEDLEY_PELLES_VERSION +#endif +#if defined(__POCC__) + #define JSON_HEDLEY_PELLES_VERSION JSON_HEDLEY_VERSION_ENCODE(__POCC__ / 100, __POCC__ % 100, 0) +#endif + +#if defined(JSON_HEDLEY_PELLES_VERSION_CHECK) + #undef JSON_HEDLEY_PELLES_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_PELLES_VERSION) + #define JSON_HEDLEY_PELLES_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_PELLES_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_PELLES_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_MCST_LCC_VERSION) + #undef JSON_HEDLEY_MCST_LCC_VERSION +#endif +#if defined(__LCC__) && defined(__LCC_MINOR__) + #define JSON_HEDLEY_MCST_LCC_VERSION JSON_HEDLEY_VERSION_ENCODE(__LCC__ / 100, __LCC__ % 100, __LCC_MINOR__) +#endif + +#if defined(JSON_HEDLEY_MCST_LCC_VERSION_CHECK) + #undef JSON_HEDLEY_MCST_LCC_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_MCST_LCC_VERSION) + #define JSON_HEDLEY_MCST_LCC_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_MCST_LCC_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_MCST_LCC_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_GCC_VERSION) + #undef JSON_HEDLEY_GCC_VERSION +#endif +#if \ + defined(JSON_HEDLEY_GNUC_VERSION) && \ + !defined(__clang__) && \ + !defined(JSON_HEDLEY_INTEL_VERSION) && \ + !defined(JSON_HEDLEY_PGI_VERSION) && \ + !defined(JSON_HEDLEY_ARM_VERSION) && \ + !defined(JSON_HEDLEY_CRAY_VERSION) && \ + !defined(JSON_HEDLEY_TI_VERSION) && \ + !defined(JSON_HEDLEY_TI_ARMCL_VERSION) && \ + 
!defined(JSON_HEDLEY_TI_CL430_VERSION) && \ + !defined(JSON_HEDLEY_TI_CL2000_VERSION) && \ + !defined(JSON_HEDLEY_TI_CL6X_VERSION) && \ + !defined(JSON_HEDLEY_TI_CL7X_VERSION) && \ + !defined(JSON_HEDLEY_TI_CLPRU_VERSION) && \ + !defined(__COMPCERT__) && \ + !defined(JSON_HEDLEY_MCST_LCC_VERSION) + #define JSON_HEDLEY_GCC_VERSION JSON_HEDLEY_GNUC_VERSION +#endif + +#if defined(JSON_HEDLEY_GCC_VERSION_CHECK) + #undef JSON_HEDLEY_GCC_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_GCC_VERSION) + #define JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_GCC_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_HAS_ATTRIBUTE) + #undef JSON_HEDLEY_HAS_ATTRIBUTE +#endif +#if \ + defined(__has_attribute) && \ + ( \ + (!defined(JSON_HEDLEY_IAR_VERSION) || JSON_HEDLEY_IAR_VERSION_CHECK(8,5,9)) \ + ) +# define JSON_HEDLEY_HAS_ATTRIBUTE(attribute) __has_attribute(attribute) +#else +# define JSON_HEDLEY_HAS_ATTRIBUTE(attribute) (0) +#endif + +#if defined(JSON_HEDLEY_GNUC_HAS_ATTRIBUTE) + #undef JSON_HEDLEY_GNUC_HAS_ATTRIBUTE +#endif +#if defined(__has_attribute) + #define JSON_HEDLEY_GNUC_HAS_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_HAS_ATTRIBUTE(attribute) +#else + #define JSON_HEDLEY_GNUC_HAS_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_GCC_HAS_ATTRIBUTE) + #undef JSON_HEDLEY_GCC_HAS_ATTRIBUTE +#endif +#if defined(__has_attribute) + #define JSON_HEDLEY_GCC_HAS_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_HAS_ATTRIBUTE(attribute) +#else + #define JSON_HEDLEY_GCC_HAS_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_HAS_CPP_ATTRIBUTE) + #undef JSON_HEDLEY_HAS_CPP_ATTRIBUTE +#endif +#if \ + defined(__has_cpp_attribute) && \ + defined(__cplusplus) && \ + (!defined(JSON_HEDLEY_SUNPRO_VERSION) || JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,15,0)) + #define JSON_HEDLEY_HAS_CPP_ATTRIBUTE(attribute) __has_cpp_attribute(attribute) +#else + #define JSON_HEDLEY_HAS_CPP_ATTRIBUTE(attribute) (0) +#endif + +#if defined(JSON_HEDLEY_HAS_CPP_ATTRIBUTE_NS) + #undef JSON_HEDLEY_HAS_CPP_ATTRIBUTE_NS +#endif +#if !defined(__cplusplus) || !defined(__has_cpp_attribute) + #define JSON_HEDLEY_HAS_CPP_ATTRIBUTE_NS(ns,attribute) (0) +#elif \ + !defined(JSON_HEDLEY_PGI_VERSION) && \ + !defined(JSON_HEDLEY_IAR_VERSION) && \ + (!defined(JSON_HEDLEY_SUNPRO_VERSION) || JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,15,0)) && \ + (!defined(JSON_HEDLEY_MSVC_VERSION) || JSON_HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + #define JSON_HEDLEY_HAS_CPP_ATTRIBUTE_NS(ns,attribute) JSON_HEDLEY_HAS_CPP_ATTRIBUTE(ns::attribute) +#else + #define JSON_HEDLEY_HAS_CPP_ATTRIBUTE_NS(ns,attribute) (0) +#endif + +#if defined(JSON_HEDLEY_GNUC_HAS_CPP_ATTRIBUTE) + #undef JSON_HEDLEY_GNUC_HAS_CPP_ATTRIBUTE +#endif +#if defined(__has_cpp_attribute) && defined(__cplusplus) + #define JSON_HEDLEY_GNUC_HAS_CPP_ATTRIBUTE(attribute,major,minor,patch) __has_cpp_attribute(attribute) +#else + #define JSON_HEDLEY_GNUC_HAS_CPP_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_GCC_HAS_CPP_ATTRIBUTE) + #undef JSON_HEDLEY_GCC_HAS_CPP_ATTRIBUTE +#endif +#if defined(__has_cpp_attribute) && defined(__cplusplus) + #define JSON_HEDLEY_GCC_HAS_CPP_ATTRIBUTE(attribute,major,minor,patch) __has_cpp_attribute(attribute) +#else + #define 
JSON_HEDLEY_GCC_HAS_CPP_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_HAS_BUILTIN) + #undef JSON_HEDLEY_HAS_BUILTIN +#endif +#if defined(__has_builtin) + #define JSON_HEDLEY_HAS_BUILTIN(builtin) __has_builtin(builtin) +#else + #define JSON_HEDLEY_HAS_BUILTIN(builtin) (0) +#endif + +#if defined(JSON_HEDLEY_GNUC_HAS_BUILTIN) + #undef JSON_HEDLEY_GNUC_HAS_BUILTIN +#endif +#if defined(__has_builtin) + #define JSON_HEDLEY_GNUC_HAS_BUILTIN(builtin,major,minor,patch) __has_builtin(builtin) +#else + #define JSON_HEDLEY_GNUC_HAS_BUILTIN(builtin,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_GCC_HAS_BUILTIN) + #undef JSON_HEDLEY_GCC_HAS_BUILTIN +#endif +#if defined(__has_builtin) + #define JSON_HEDLEY_GCC_HAS_BUILTIN(builtin,major,minor,patch) __has_builtin(builtin) +#else + #define JSON_HEDLEY_GCC_HAS_BUILTIN(builtin,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_HAS_FEATURE) + #undef JSON_HEDLEY_HAS_FEATURE +#endif +#if defined(__has_feature) + #define JSON_HEDLEY_HAS_FEATURE(feature) __has_feature(feature) +#else + #define JSON_HEDLEY_HAS_FEATURE(feature) (0) +#endif + +#if defined(JSON_HEDLEY_GNUC_HAS_FEATURE) + #undef JSON_HEDLEY_GNUC_HAS_FEATURE +#endif +#if defined(__has_feature) + #define JSON_HEDLEY_GNUC_HAS_FEATURE(feature,major,minor,patch) __has_feature(feature) +#else + #define JSON_HEDLEY_GNUC_HAS_FEATURE(feature,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_GCC_HAS_FEATURE) + #undef JSON_HEDLEY_GCC_HAS_FEATURE +#endif +#if defined(__has_feature) + #define JSON_HEDLEY_GCC_HAS_FEATURE(feature,major,minor,patch) __has_feature(feature) +#else + #define JSON_HEDLEY_GCC_HAS_FEATURE(feature,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_HAS_EXTENSION) + #undef JSON_HEDLEY_HAS_EXTENSION +#endif +#if defined(__has_extension) + #define JSON_HEDLEY_HAS_EXTENSION(extension) __has_extension(extension) +#else + #define JSON_HEDLEY_HAS_EXTENSION(extension) (0) +#endif + +#if defined(JSON_HEDLEY_GNUC_HAS_EXTENSION) + #undef JSON_HEDLEY_GNUC_HAS_EXTENSION +#endif +#if defined(__has_extension) + #define JSON_HEDLEY_GNUC_HAS_EXTENSION(extension,major,minor,patch) __has_extension(extension) +#else + #define JSON_HEDLEY_GNUC_HAS_EXTENSION(extension,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_GCC_HAS_EXTENSION) + #undef JSON_HEDLEY_GCC_HAS_EXTENSION +#endif +#if defined(__has_extension) + #define JSON_HEDLEY_GCC_HAS_EXTENSION(extension,major,minor,patch) __has_extension(extension) +#else + #define JSON_HEDLEY_GCC_HAS_EXTENSION(extension,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_HAS_DECLSPEC_ATTRIBUTE) + #undef JSON_HEDLEY_HAS_DECLSPEC_ATTRIBUTE +#endif +#if defined(__has_declspec_attribute) + #define JSON_HEDLEY_HAS_DECLSPEC_ATTRIBUTE(attribute) __has_declspec_attribute(attribute) +#else + #define JSON_HEDLEY_HAS_DECLSPEC_ATTRIBUTE(attribute) (0) +#endif + +#if defined(JSON_HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE) + #undef JSON_HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE +#endif +#if defined(__has_declspec_attribute) + #define JSON_HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE(attribute,major,minor,patch) __has_declspec_attribute(attribute) +#else + #define 
JSON_HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE) + #undef JSON_HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE +#endif +#if defined(__has_declspec_attribute) + #define JSON_HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE(attribute,major,minor,patch) __has_declspec_attribute(attribute) +#else + #define JSON_HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_HAS_WARNING) + #undef JSON_HEDLEY_HAS_WARNING +#endif +#if defined(__has_warning) + #define JSON_HEDLEY_HAS_WARNING(warning) __has_warning(warning) +#else + #define JSON_HEDLEY_HAS_WARNING(warning) (0) +#endif + +#if defined(JSON_HEDLEY_GNUC_HAS_WARNING) + #undef JSON_HEDLEY_GNUC_HAS_WARNING +#endif +#if defined(__has_warning) + #define JSON_HEDLEY_GNUC_HAS_WARNING(warning,major,minor,patch) __has_warning(warning) +#else + #define JSON_HEDLEY_GNUC_HAS_WARNING(warning,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_GCC_HAS_WARNING) + #undef JSON_HEDLEY_GCC_HAS_WARNING +#endif +#if defined(__has_warning) + #define JSON_HEDLEY_GCC_HAS_WARNING(warning,major,minor,patch) __has_warning(warning) +#else + #define JSON_HEDLEY_GCC_HAS_WARNING(warning,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) +#endif + +#if \ + (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) || \ + defined(__clang__) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(3,0,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) || \ + JSON_HEDLEY_PGI_VERSION_CHECK(18,4,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,7,0) || \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(2,0,1) || \ + JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,1,0) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,0,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ + JSON_HEDLEY_CRAY_VERSION_CHECK(5,0,0) || \ + JSON_HEDLEY_TINYC_VERSION_CHECK(0,9,17) || \ + JSON_HEDLEY_SUNPRO_VERSION_CHECK(8,0,0) || \ + (JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) && defined(__C99_PRAGMA_OPERATOR)) + #define JSON_HEDLEY_PRAGMA(value) _Pragma(#value) +#elif JSON_HEDLEY_MSVC_VERSION_CHECK(15,0,0) + #define JSON_HEDLEY_PRAGMA(value) __pragma(value) +#else + #define JSON_HEDLEY_PRAGMA(value) +#endif + +#if defined(JSON_HEDLEY_DIAGNOSTIC_PUSH) + #undef JSON_HEDLEY_DIAGNOSTIC_PUSH +#endif +#if defined(JSON_HEDLEY_DIAGNOSTIC_POP) + #undef JSON_HEDLEY_DIAGNOSTIC_POP +#endif +#if defined(__clang__) + #define JSON_HEDLEY_DIAGNOSTIC_PUSH _Pragma("clang diagnostic push") + #define JSON_HEDLEY_DIAGNOSTIC_POP _Pragma("clang diagnostic pop") +#elif JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_PUSH _Pragma("warning(push)") + #define JSON_HEDLEY_DIAGNOSTIC_POP _Pragma("warning(pop)") +#elif JSON_HEDLEY_GCC_VERSION_CHECK(4,6,0) + #define JSON_HEDLEY_DIAGNOSTIC_PUSH _Pragma("GCC diagnostic push") + #define JSON_HEDLEY_DIAGNOSTIC_POP _Pragma("GCC diagnostic pop") +#elif \ + JSON_HEDLEY_MSVC_VERSION_CHECK(15,0,0) || \ + JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) + #define JSON_HEDLEY_DIAGNOSTIC_PUSH __pragma(warning(push)) + #define JSON_HEDLEY_DIAGNOSTIC_POP __pragma(warning(pop)) +#elif JSON_HEDLEY_ARM_VERSION_CHECK(5,6,0) + #define JSON_HEDLEY_DIAGNOSTIC_PUSH _Pragma("push") + #define 
JSON_HEDLEY_DIAGNOSTIC_POP _Pragma("pop") +#elif \ + JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,4,0) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(8,1,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) + #define JSON_HEDLEY_DIAGNOSTIC_PUSH _Pragma("diag_push") + #define JSON_HEDLEY_DIAGNOSTIC_POP _Pragma("diag_pop") +#elif JSON_HEDLEY_PELLES_VERSION_CHECK(2,90,0) + #define JSON_HEDLEY_DIAGNOSTIC_PUSH _Pragma("warning(push)") + #define JSON_HEDLEY_DIAGNOSTIC_POP _Pragma("warning(pop)") +#else + #define JSON_HEDLEY_DIAGNOSTIC_PUSH + #define JSON_HEDLEY_DIAGNOSTIC_POP +#endif + +/* JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_ is for + HEDLEY INTERNAL USE ONLY. API subject to change without notice. */ +#if defined(JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_) + #undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_ +#endif +#if defined(__cplusplus) +# if JSON_HEDLEY_HAS_WARNING("-Wc++98-compat") +# if JSON_HEDLEY_HAS_WARNING("-Wc++17-extensions") +# if JSON_HEDLEY_HAS_WARNING("-Wc++1z-extensions") +# define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(xpr) \ + JSON_HEDLEY_DIAGNOSTIC_PUSH \ + _Pragma("clang diagnostic ignored \"-Wc++98-compat\"") \ + _Pragma("clang diagnostic ignored \"-Wc++17-extensions\"") \ + _Pragma("clang diagnostic ignored \"-Wc++1z-extensions\"") \ + xpr \ + JSON_HEDLEY_DIAGNOSTIC_POP +# else +# define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(xpr) \ + JSON_HEDLEY_DIAGNOSTIC_PUSH \ + _Pragma("clang diagnostic ignored \"-Wc++98-compat\"") \ + _Pragma("clang diagnostic ignored \"-Wc++17-extensions\"") \ + xpr \ + JSON_HEDLEY_DIAGNOSTIC_POP +# endif +# else +# define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(xpr) \ + JSON_HEDLEY_DIAGNOSTIC_PUSH \ + _Pragma("clang diagnostic ignored \"-Wc++98-compat\"") \ + xpr \ + JSON_HEDLEY_DIAGNOSTIC_POP +# endif +# endif +#endif +#if !defined(JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(x) x +#endif + +#if defined(JSON_HEDLEY_CONST_CAST) + #undef JSON_HEDLEY_CONST_CAST +#endif +#if defined(__cplusplus) +# define JSON_HEDLEY_CONST_CAST(T, expr) (const_cast<T>(expr)) +#elif \ + JSON_HEDLEY_HAS_WARNING("-Wcast-qual") || \ + JSON_HEDLEY_GCC_VERSION_CHECK(4,6,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) +# define JSON_HEDLEY_CONST_CAST(T, expr) (__extension__ ({ \ + JSON_HEDLEY_DIAGNOSTIC_PUSH \ + JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL \ + ((T) (expr)); \ + JSON_HEDLEY_DIAGNOSTIC_POP \ + })) +#else +# define JSON_HEDLEY_CONST_CAST(T, expr) ((T) (expr)) +#endif + +#if defined(JSON_HEDLEY_REINTERPRET_CAST) + #undef JSON_HEDLEY_REINTERPRET_CAST +#endif +#if defined(__cplusplus) + #define JSON_HEDLEY_REINTERPRET_CAST(T, expr) (reinterpret_cast<T>(expr)) +#else + #define JSON_HEDLEY_REINTERPRET_CAST(T, expr) ((T) (expr)) +#endif + +#if defined(JSON_HEDLEY_STATIC_CAST) + #undef JSON_HEDLEY_STATIC_CAST +#endif +#if defined(__cplusplus) + #define JSON_HEDLEY_STATIC_CAST(T, expr) (static_cast<T>(expr)) +#else + #define JSON_HEDLEY_STATIC_CAST(T, expr) ((T) (expr)) +#endif + +#if defined(JSON_HEDLEY_CPP_CAST) + #undef JSON_HEDLEY_CPP_CAST +#endif +#if defined(__cplusplus) +# if JSON_HEDLEY_HAS_WARNING("-Wold-style-cast") +# define JSON_HEDLEY_CPP_CAST(T, expr) \ + JSON_HEDLEY_DIAGNOSTIC_PUSH \ + _Pragma("clang diagnostic ignored \"-Wold-style-cast\"") \ + ((T) (expr)) \ + JSON_HEDLEY_DIAGNOSTIC_POP +# elif 
JSON_HEDLEY_IAR_VERSION_CHECK(8,3,0) +# define JSON_HEDLEY_CPP_CAST(T, expr) \ + JSON_HEDLEY_DIAGNOSTIC_PUSH \ + _Pragma("diag_suppress=Pe137") \ + JSON_HEDLEY_DIAGNOSTIC_POP +# else +# define JSON_HEDLEY_CPP_CAST(T, expr) ((T) (expr)) +# endif +#else +# define JSON_HEDLEY_CPP_CAST(T, expr) (expr) +#endif + +#if defined(JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED) + #undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED +#endif +#if JSON_HEDLEY_HAS_WARNING("-Wdeprecated-declarations") + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"") +#elif JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("warning(disable:1478 1786)") +#elif JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED __pragma(warning(disable:1478 1786)) +#elif JSON_HEDLEY_PGI_VERSION_CHECK(20,7,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress 1215,1216,1444,1445") +#elif JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress 1215,1444") +#elif JSON_HEDLEY_GCC_VERSION_CHECK(4,3,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"") +#elif JSON_HEDLEY_MSVC_VERSION_CHECK(15,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED __pragma(warning(disable:4996)) +#elif JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress 1215,1444") +#elif \ + JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress 1291,1718") +#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,13,0) && !defined(__cplusplus) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("error_messages(off,E_DEPRECATED_ATT,E_DEPRECATED_ATT_MESS)") +#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,13,0) && defined(__cplusplus) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("error_messages(off,symdeprecated,symdeprecated2)") +#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress=Pe1444,Pe1215") +#elif JSON_HEDLEY_PELLES_VERSION_CHECK(2,90,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("warn(disable:2241)") +#else + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED +#endif + +#if defined(JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS) + #undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS +#endif +#if JSON_HEDLEY_HAS_WARNING("-Wunknown-pragmas") + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("clang diagnostic ignored \"-Wunknown-pragmas\"") +#elif JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("warning(disable:161)") +#elif 
JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS __pragma(warning(disable:161)) +#elif JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress 1675") +#elif JSON_HEDLEY_GCC_VERSION_CHECK(4,3,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("GCC diagnostic ignored \"-Wunknown-pragmas\"") +#elif JSON_HEDLEY_MSVC_VERSION_CHECK(15,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS __pragma(warning(disable:4068)) +#elif \ + JSON_HEDLEY_TI_VERSION_CHECK(16,9,0) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(8,0,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,3,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress 163") +#elif JSON_HEDLEY_TI_CL6X_VERSION_CHECK(8,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress 163") +#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress=Pe161") +#elif JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress 161") +#else + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS +#endif + +#if defined(JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES) + #undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES +#endif +#if JSON_HEDLEY_HAS_WARNING("-Wunknown-attributes") + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("clang diagnostic ignored \"-Wunknown-attributes\"") +#elif JSON_HEDLEY_GCC_VERSION_CHECK(4,6,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"") +#elif JSON_HEDLEY_INTEL_VERSION_CHECK(17,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("warning(disable:1292)") +#elif JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES __pragma(warning(disable:1292)) +#elif JSON_HEDLEY_MSVC_VERSION_CHECK(19,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES __pragma(warning(disable:5030)) +#elif JSON_HEDLEY_PGI_VERSION_CHECK(20,7,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress 1097,1098") +#elif JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress 1097") +#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,14,0) && defined(__cplusplus) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("error_messages(off,attrskipunsup)") +#elif \ + JSON_HEDLEY_TI_VERSION_CHECK(18,1,0) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(8,3,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress 1173") +#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress=Pe1097") +#elif JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress 1097") +#else + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES +#endif + +#if defined(JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL) + #undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL +#endif +#if JSON_HEDLEY_HAS_WARNING("-Wcast-qual") + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL _Pragma("clang diagnostic ignored 
\"-Wcast-qual\"") +#elif JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL _Pragma("warning(disable:2203 2331)") +#elif JSON_HEDLEY_GCC_VERSION_CHECK(3,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL _Pragma("GCC diagnostic ignored \"-Wcast-qual\"") +#else + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL +#endif + +#if defined(JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION) + #undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION +#endif +#if JSON_HEDLEY_HAS_WARNING("-Wunused-function") + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION _Pragma("clang diagnostic ignored \"-Wunused-function\"") +#elif JSON_HEDLEY_GCC_VERSION_CHECK(3,4,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION _Pragma("GCC diagnostic ignored \"-Wunused-function\"") +#elif JSON_HEDLEY_MSVC_VERSION_CHECK(1,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION __pragma(warning(disable:4505)) +#elif JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION _Pragma("diag_suppress 3142") +#else + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION +#endif + +#if defined(JSON_HEDLEY_DEPRECATED) + #undef JSON_HEDLEY_DEPRECATED +#endif +#if defined(JSON_HEDLEY_DEPRECATED_FOR) + #undef JSON_HEDLEY_DEPRECATED_FOR +#endif +#if \ + JSON_HEDLEY_MSVC_VERSION_CHECK(14,0,0) || \ + JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) + #define JSON_HEDLEY_DEPRECATED(since) __declspec(deprecated("Since " # since)) + #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement) __declspec(deprecated("Since " #since "; use " #replacement)) +#elif \ + (JSON_HEDLEY_HAS_EXTENSION(attribute_deprecated_with_message) && !defined(JSON_HEDLEY_IAR_VERSION)) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(4,5,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(5,6,0) || \ + JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,13,0) || \ + JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0) || \ + JSON_HEDLEY_TI_VERSION_CHECK(18,1,0) || \ + JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(18,1,0) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(8,3,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,3,0) || \ + JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) + #define JSON_HEDLEY_DEPRECATED(since) __attribute__((__deprecated__("Since " #since))) + #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement) __attribute__((__deprecated__("Since " #since "; use " #replacement))) +#elif defined(__cplusplus) && (__cplusplus >= 201402L) + #define JSON_HEDLEY_DEPRECATED(since) JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[deprecated("Since " #since)]]) + #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement) JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[deprecated("Since " #since "; use " #replacement)]]) +#elif \ + JSON_HEDLEY_HAS_ATTRIBUTE(deprecated) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(3,1,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + 
JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ + JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) || \ + JSON_HEDLEY_IAR_VERSION_CHECK(8,10,0) + #define JSON_HEDLEY_DEPRECATED(since) __attribute__((__deprecated__)) + #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement) __attribute__((__deprecated__)) +#elif \ + JSON_HEDLEY_MSVC_VERSION_CHECK(13,10,0) || \ + JSON_HEDLEY_PELLES_VERSION_CHECK(6,50,0) || \ + JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) + #define JSON_HEDLEY_DEPRECATED(since) __declspec(deprecated) + #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement) __declspec(deprecated) +#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) + #define JSON_HEDLEY_DEPRECATED(since) _Pragma("deprecated") + #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement) _Pragma("deprecated") +#else + #define JSON_HEDLEY_DEPRECATED(since) + #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement) +#endif + +#if defined(JSON_HEDLEY_UNAVAILABLE) + #undef JSON_HEDLEY_UNAVAILABLE +#endif +#if \ + JSON_HEDLEY_HAS_ATTRIBUTE(warning) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(4,3,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) + #define JSON_HEDLEY_UNAVAILABLE(available_since) __attribute__((__warning__("Not available until " #available_since))) +#else + #define JSON_HEDLEY_UNAVAILABLE(available_since) +#endif + +#if defined(JSON_HEDLEY_WARN_UNUSED_RESULT) + #undef JSON_HEDLEY_WARN_UNUSED_RESULT +#endif +#if defined(JSON_HEDLEY_WARN_UNUSED_RESULT_MSG) + #undef JSON_HEDLEY_WARN_UNUSED_RESULT_MSG +#endif +#if \ + JSON_HEDLEY_HAS_ATTRIBUTE(warn_unused_result) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(3,4,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ + (JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,15,0) && defined(__cplusplus)) || \ + JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0) || \ + JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) + #define JSON_HEDLEY_WARN_UNUSED_RESULT __attribute__((__warn_unused_result__)) + #define JSON_HEDLEY_WARN_UNUSED_RESULT_MSG(msg) __attribute__((__warn_unused_result__)) +#elif (JSON_HEDLEY_HAS_CPP_ATTRIBUTE(nodiscard) >= 201907L) + #define JSON_HEDLEY_WARN_UNUSED_RESULT JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard]]) + #define JSON_HEDLEY_WARN_UNUSED_RESULT_MSG(msg) JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard(msg)]]) +#elif JSON_HEDLEY_HAS_CPP_ATTRIBUTE(nodiscard) + #define JSON_HEDLEY_WARN_UNUSED_RESULT JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard]]) + #define JSON_HEDLEY_WARN_UNUSED_RESULT_MSG(msg) JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard]]) +#elif defined(_Check_return_) /* SAL */ + #define JSON_HEDLEY_WARN_UNUSED_RESULT _Check_return_ + #define JSON_HEDLEY_WARN_UNUSED_RESULT_MSG(msg) _Check_return_ +#else 
+ #define JSON_HEDLEY_WARN_UNUSED_RESULT + #define JSON_HEDLEY_WARN_UNUSED_RESULT_MSG(msg) +#endif + +#if defined(JSON_HEDLEY_SENTINEL) + #undef JSON_HEDLEY_SENTINEL +#endif +#if \ + JSON_HEDLEY_HAS_ATTRIBUTE(sentinel) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(4,0,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(5,4,0) || \ + JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) + #define JSON_HEDLEY_SENTINEL(position) __attribute__((__sentinel__(position))) +#else + #define JSON_HEDLEY_SENTINEL(position) +#endif + +#if defined(JSON_HEDLEY_NO_RETURN) + #undef JSON_HEDLEY_NO_RETURN +#endif +#if JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) + #define JSON_HEDLEY_NO_RETURN __noreturn +#elif \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) + #define JSON_HEDLEY_NO_RETURN __attribute__((__noreturn__)) +#elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L + #define JSON_HEDLEY_NO_RETURN _Noreturn +#elif defined(__cplusplus) && (__cplusplus >= 201103L) + #define JSON_HEDLEY_NO_RETURN JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[noreturn]]) +#elif \ + JSON_HEDLEY_HAS_ATTRIBUTE(noreturn) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(3,2,0) || \ + JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ + JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ + JSON_HEDLEY_IAR_VERSION_CHECK(8,10,0) + #define JSON_HEDLEY_NO_RETURN __attribute__((__noreturn__)) +#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,10,0) + #define JSON_HEDLEY_NO_RETURN _Pragma("does_not_return") +#elif \ + JSON_HEDLEY_MSVC_VERSION_CHECK(13,10,0) || \ + JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) + #define JSON_HEDLEY_NO_RETURN __declspec(noreturn) +#elif JSON_HEDLEY_TI_CL6X_VERSION_CHECK(6,0,0) && defined(__cplusplus) + #define JSON_HEDLEY_NO_RETURN _Pragma("FUNC_NEVER_RETURNS;") +#elif JSON_HEDLEY_COMPCERT_VERSION_CHECK(3,2,0) + #define JSON_HEDLEY_NO_RETURN __attribute((noreturn)) +#elif JSON_HEDLEY_PELLES_VERSION_CHECK(9,0,0) + #define JSON_HEDLEY_NO_RETURN __declspec(noreturn) +#else + #define JSON_HEDLEY_NO_RETURN +#endif + +#if defined(JSON_HEDLEY_NO_ESCAPE) + #undef JSON_HEDLEY_NO_ESCAPE +#endif +#if JSON_HEDLEY_HAS_ATTRIBUTE(noescape) + #define JSON_HEDLEY_NO_ESCAPE __attribute__((__noescape__)) +#else + #define JSON_HEDLEY_NO_ESCAPE +#endif + +#if defined(JSON_HEDLEY_UNREACHABLE) + #undef JSON_HEDLEY_UNREACHABLE +#endif +#if defined(JSON_HEDLEY_UNREACHABLE_RETURN) + #undef JSON_HEDLEY_UNREACHABLE_RETURN +#endif +#if defined(JSON_HEDLEY_ASSUME) + #undef JSON_HEDLEY_ASSUME +#endif +#if \ + JSON_HEDLEY_MSVC_VERSION_CHECK(13,10,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) + #define JSON_HEDLEY_ASSUME(expr) __assume(expr) +#elif JSON_HEDLEY_HAS_BUILTIN(__builtin_assume) + #define 
JSON_HEDLEY_ASSUME(expr) __builtin_assume(expr) +#elif \ + JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,2,0) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(4,0,0) + #if defined(__cplusplus) + #define JSON_HEDLEY_ASSUME(expr) std::_nassert(expr) + #else + #define JSON_HEDLEY_ASSUME(expr) _nassert(expr) + #endif +#endif +#if \ + (JSON_HEDLEY_HAS_BUILTIN(__builtin_unreachable) && (!defined(JSON_HEDLEY_ARM_VERSION))) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(4,5,0) || \ + JSON_HEDLEY_PGI_VERSION_CHECK(18,10,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_IBM_VERSION_CHECK(13,1,5) || \ + JSON_HEDLEY_CRAY_VERSION_CHECK(10,0,0) || \ + JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) + #define JSON_HEDLEY_UNREACHABLE() __builtin_unreachable() +#elif defined(JSON_HEDLEY_ASSUME) + #define JSON_HEDLEY_UNREACHABLE() JSON_HEDLEY_ASSUME(0) +#endif +#if !defined(JSON_HEDLEY_ASSUME) + #if defined(JSON_HEDLEY_UNREACHABLE) + #define JSON_HEDLEY_ASSUME(expr) JSON_HEDLEY_STATIC_CAST(void, ((expr) ? 1 : (JSON_HEDLEY_UNREACHABLE(), 1))) + #else + #define JSON_HEDLEY_ASSUME(expr) JSON_HEDLEY_STATIC_CAST(void, expr) + #endif +#endif +#if defined(JSON_HEDLEY_UNREACHABLE) + #if \ + JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,2,0) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(4,0,0) + #define JSON_HEDLEY_UNREACHABLE_RETURN(value) return (JSON_HEDLEY_STATIC_CAST(void, JSON_HEDLEY_ASSUME(0)), (value)) + #else + #define JSON_HEDLEY_UNREACHABLE_RETURN(value) JSON_HEDLEY_UNREACHABLE() + #endif +#else + #define JSON_HEDLEY_UNREACHABLE_RETURN(value) return (value) +#endif +#if !defined(JSON_HEDLEY_UNREACHABLE) + #define JSON_HEDLEY_UNREACHABLE() JSON_HEDLEY_ASSUME(0) +#endif + +JSON_HEDLEY_DIAGNOSTIC_PUSH +#if JSON_HEDLEY_HAS_WARNING("-Wpedantic") + #pragma clang diagnostic ignored "-Wpedantic" +#endif +#if JSON_HEDLEY_HAS_WARNING("-Wc++98-compat-pedantic") && defined(__cplusplus) + #pragma clang diagnostic ignored "-Wc++98-compat-pedantic" +#endif +#if JSON_HEDLEY_GCC_HAS_WARNING("-Wvariadic-macros",4,0,0) + #if defined(__clang__) + #pragma clang diagnostic ignored "-Wvariadic-macros" + #elif defined(JSON_HEDLEY_GCC_VERSION) + #pragma GCC diagnostic ignored "-Wvariadic-macros" + #endif +#endif +#if defined(JSON_HEDLEY_NON_NULL) + #undef JSON_HEDLEY_NON_NULL +#endif +#if \ + JSON_HEDLEY_HAS_ATTRIBUTE(nonnull) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(3,3,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) + #define JSON_HEDLEY_NON_NULL(...) __attribute__((__nonnull__(__VA_ARGS__))) +#else + #define JSON_HEDLEY_NON_NULL(...) 
+#endif +JSON_HEDLEY_DIAGNOSTIC_POP + +#if defined(JSON_HEDLEY_PRINTF_FORMAT) + #undef JSON_HEDLEY_PRINTF_FORMAT +#endif +#if defined(__MINGW32__) && JSON_HEDLEY_GCC_HAS_ATTRIBUTE(format,4,4,0) && !defined(__USE_MINGW_ANSI_STDIO) + #define JSON_HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) __attribute__((__format__(ms_printf, string_idx, first_to_check))) +#elif defined(__MINGW32__) && JSON_HEDLEY_GCC_HAS_ATTRIBUTE(format,4,4,0) && defined(__USE_MINGW_ANSI_STDIO) + #define JSON_HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) __attribute__((__format__(gnu_printf, string_idx, first_to_check))) +#elif \ + JSON_HEDLEY_HAS_ATTRIBUTE(format) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(3,1,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(5,6,0) || \ + JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ + JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ + JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) + #define JSON_HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) __attribute__((__format__(__printf__, string_idx, first_to_check))) +#elif JSON_HEDLEY_PELLES_VERSION_CHECK(6,0,0) + #define JSON_HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) __declspec(vaformat(printf,string_idx,first_to_check)) +#else + #define JSON_HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) +#endif + +#if defined(JSON_HEDLEY_CONSTEXPR) + #undef JSON_HEDLEY_CONSTEXPR +#endif +#if defined(__cplusplus) + #if __cplusplus >= 201103L + #define JSON_HEDLEY_CONSTEXPR JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(constexpr) + #endif +#endif +#if !defined(JSON_HEDLEY_CONSTEXPR) + #define JSON_HEDLEY_CONSTEXPR +#endif + +#if defined(JSON_HEDLEY_PREDICT) + #undef JSON_HEDLEY_PREDICT +#endif +#if defined(JSON_HEDLEY_LIKELY) + #undef JSON_HEDLEY_LIKELY +#endif +#if defined(JSON_HEDLEY_UNLIKELY) + #undef JSON_HEDLEY_UNLIKELY +#endif +#if defined(JSON_HEDLEY_UNPREDICTABLE) + #undef JSON_HEDLEY_UNPREDICTABLE +#endif +#if JSON_HEDLEY_HAS_BUILTIN(__builtin_unpredictable) + #define JSON_HEDLEY_UNPREDICTABLE(expr) __builtin_unpredictable((expr)) +#endif +#if \ + (JSON_HEDLEY_HAS_BUILTIN(__builtin_expect_with_probability) && !defined(JSON_HEDLEY_PGI_VERSION)) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(9,0,0) || \ + JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) +# define JSON_HEDLEY_PREDICT(expr, value, probability) __builtin_expect_with_probability( (expr), (value), (probability)) +# define JSON_HEDLEY_PREDICT_TRUE(expr, probability) __builtin_expect_with_probability(!!(expr), 1 , (probability)) +# define JSON_HEDLEY_PREDICT_FALSE(expr, probability) __builtin_expect_with_probability(!!(expr), 0 , (probability)) +# define JSON_HEDLEY_LIKELY(expr) __builtin_expect (!!(expr), 1 ) +# define JSON_HEDLEY_UNLIKELY(expr) __builtin_expect (!!(expr), 0 ) +#elif \ + (JSON_HEDLEY_HAS_BUILTIN(__builtin_expect) && !defined(JSON_HEDLEY_INTEL_CL_VERSION)) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(3,0,0) || \ + 
JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + (JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,15,0) && defined(__cplusplus)) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ + JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,7,0) || \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(3,1,0) || \ + JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,1,0) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(6,1,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ + JSON_HEDLEY_TINYC_VERSION_CHECK(0,9,27) || \ + JSON_HEDLEY_CRAY_VERSION_CHECK(8,1,0) || \ + JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) +# define JSON_HEDLEY_PREDICT(expr, expected, probability) \ + (((probability) >= 0.9) ? __builtin_expect((expr), (expected)) : (JSON_HEDLEY_STATIC_CAST(void, expected), (expr))) +# define JSON_HEDLEY_PREDICT_TRUE(expr, probability) \ + (__extension__ ({ \ + double hedley_probability_ = (probability); \ + ((hedley_probability_ >= 0.9) ? __builtin_expect(!!(expr), 1) : ((hedley_probability_ <= 0.1) ? __builtin_expect(!!(expr), 0) : !!(expr))); \ + })) +# define JSON_HEDLEY_PREDICT_FALSE(expr, probability) \ + (__extension__ ({ \ + double hedley_probability_ = (probability); \ + ((hedley_probability_ >= 0.9) ? __builtin_expect(!!(expr), 0) : ((hedley_probability_ <= 0.1) ? __builtin_expect(!!(expr), 1) : !!(expr))); \ + })) +# define JSON_HEDLEY_LIKELY(expr) __builtin_expect(!!(expr), 1) +# define JSON_HEDLEY_UNLIKELY(expr) __builtin_expect(!!(expr), 0) +#else +# define JSON_HEDLEY_PREDICT(expr, expected, probability) (JSON_HEDLEY_STATIC_CAST(void, expected), (expr)) +# define JSON_HEDLEY_PREDICT_TRUE(expr, probability) (!!(expr)) +# define JSON_HEDLEY_PREDICT_FALSE(expr, probability) (!!(expr)) +# define JSON_HEDLEY_LIKELY(expr) (!!(expr)) +# define JSON_HEDLEY_UNLIKELY(expr) (!!(expr)) +#endif +#if !defined(JSON_HEDLEY_UNPREDICTABLE) + #define JSON_HEDLEY_UNPREDICTABLE(expr) JSON_HEDLEY_PREDICT(expr, 1, 0.5) +#endif + +#if defined(JSON_HEDLEY_MALLOC) + #undef JSON_HEDLEY_MALLOC +#endif +#if \ + JSON_HEDLEY_HAS_ATTRIBUTE(malloc) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(3,1,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + JSON_HEDLEY_IBM_VERSION_CHECK(12,1,0) || \ + JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ + JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) + #define JSON_HEDLEY_MALLOC __attribute__((__malloc__)) +#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,10,0) + #define JSON_HEDLEY_MALLOC _Pragma("returns_new_memory") +#elif \ + JSON_HEDLEY_MSVC_VERSION_CHECK(14,0,0) || \ + JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) + #define JSON_HEDLEY_MALLOC __declspec(restrict) +#else + #define JSON_HEDLEY_MALLOC +#endif + +#if defined(JSON_HEDLEY_PURE) + #undef JSON_HEDLEY_PURE +#endif +#if \ 
+ JSON_HEDLEY_HAS_ATTRIBUTE(pure) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(2,96,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ + JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ + JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0) || \ + JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) +# define JSON_HEDLEY_PURE __attribute__((__pure__)) +#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,10,0) +# define JSON_HEDLEY_PURE _Pragma("does_not_write_global_data") +#elif defined(__cplusplus) && \ + ( \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(2,0,1) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(4,0,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) \ + ) +# define JSON_HEDLEY_PURE _Pragma("FUNC_IS_PURE;") +#else +# define JSON_HEDLEY_PURE +#endif + +#if defined(JSON_HEDLEY_CONST) + #undef JSON_HEDLEY_CONST +#endif +#if \ + JSON_HEDLEY_HAS_ATTRIBUTE(const) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(2,5,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ + JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ + JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0) || \ + JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) + #define JSON_HEDLEY_CONST __attribute__((__const__)) +#elif \ + JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,10,0) + #define JSON_HEDLEY_CONST _Pragma("no_side_effect") +#else + #define JSON_HEDLEY_CONST JSON_HEDLEY_PURE +#endif + +#if defined(JSON_HEDLEY_RESTRICT) + #undef JSON_HEDLEY_RESTRICT +#endif +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) && !defined(__cplusplus) + #define JSON_HEDLEY_RESTRICT restrict +#elif \ + JSON_HEDLEY_GCC_VERSION_CHECK(3,1,0) || \ + JSON_HEDLEY_MSVC_VERSION_CHECK(14,0,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ + JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0) || \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,2,4) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(8,1,0) 
|| \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + (JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,14,0) && defined(__cplusplus)) || \ + JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) || \ + defined(__clang__) || \ + JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) + #define JSON_HEDLEY_RESTRICT __restrict +#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,3,0) && !defined(__cplusplus) + #define JSON_HEDLEY_RESTRICT _Restrict +#else + #define JSON_HEDLEY_RESTRICT +#endif + +#if defined(JSON_HEDLEY_INLINE) + #undef JSON_HEDLEY_INLINE +#endif +#if \ + (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) || \ + (defined(__cplusplus) && (__cplusplus >= 199711L)) + #define JSON_HEDLEY_INLINE inline +#elif \ + defined(JSON_HEDLEY_GCC_VERSION) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(6,2,0) + #define JSON_HEDLEY_INLINE __inline__ +#elif \ + JSON_HEDLEY_MSVC_VERSION_CHECK(12,0,0) || \ + JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,1,0) || \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(3,1,0) || \ + JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,2,0) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(8,0,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ + JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) + #define JSON_HEDLEY_INLINE __inline +#else + #define JSON_HEDLEY_INLINE +#endif + +#if defined(JSON_HEDLEY_ALWAYS_INLINE) + #undef JSON_HEDLEY_ALWAYS_INLINE +#endif +#if \ + JSON_HEDLEY_HAS_ATTRIBUTE(always_inline) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(4,0,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ + JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ + JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) || \ + JSON_HEDLEY_IAR_VERSION_CHECK(8,10,0) +# define JSON_HEDLEY_ALWAYS_INLINE __attribute__((__always_inline__)) JSON_HEDLEY_INLINE +#elif \ + JSON_HEDLEY_MSVC_VERSION_CHECK(12,0,0) || \ + JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) +# define JSON_HEDLEY_ALWAYS_INLINE __forceinline +#elif defined(__cplusplus) && \ + ( \ + JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(6,1,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) \ + ) +# define JSON_HEDLEY_ALWAYS_INLINE _Pragma("FUNC_ALWAYS_INLINE;") +#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) +# define JSON_HEDLEY_ALWAYS_INLINE _Pragma("inline=forced") +#else +# define JSON_HEDLEY_ALWAYS_INLINE JSON_HEDLEY_INLINE +#endif + +#if defined(JSON_HEDLEY_NEVER_INLINE) + #undef JSON_HEDLEY_NEVER_INLINE +#endif +#if \ + JSON_HEDLEY_HAS_ATTRIBUTE(noinline) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(4,0,0) || \ + 
JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ + JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ + JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) || \ + JSON_HEDLEY_IAR_VERSION_CHECK(8,10,0) + #define JSON_HEDLEY_NEVER_INLINE __attribute__((__noinline__)) +#elif \ + JSON_HEDLEY_MSVC_VERSION_CHECK(13,10,0) || \ + JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) + #define JSON_HEDLEY_NEVER_INLINE __declspec(noinline) +#elif JSON_HEDLEY_PGI_VERSION_CHECK(10,2,0) + #define JSON_HEDLEY_NEVER_INLINE _Pragma("noinline") +#elif JSON_HEDLEY_TI_CL6X_VERSION_CHECK(6,0,0) && defined(__cplusplus) + #define JSON_HEDLEY_NEVER_INLINE _Pragma("FUNC_CANNOT_INLINE;") +#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) + #define JSON_HEDLEY_NEVER_INLINE _Pragma("inline=never") +#elif JSON_HEDLEY_COMPCERT_VERSION_CHECK(3,2,0) + #define JSON_HEDLEY_NEVER_INLINE __attribute((noinline)) +#elif JSON_HEDLEY_PELLES_VERSION_CHECK(9,0,0) + #define JSON_HEDLEY_NEVER_INLINE __declspec(noinline) +#else + #define JSON_HEDLEY_NEVER_INLINE +#endif + +#if defined(JSON_HEDLEY_PRIVATE) + #undef JSON_HEDLEY_PRIVATE +#endif +#if defined(JSON_HEDLEY_PUBLIC) + #undef JSON_HEDLEY_PUBLIC +#endif +#if defined(JSON_HEDLEY_IMPORT) + #undef JSON_HEDLEY_IMPORT +#endif +#if defined(_WIN32) || defined(__CYGWIN__) +# define JSON_HEDLEY_PRIVATE +# define JSON_HEDLEY_PUBLIC __declspec(dllexport) +# define JSON_HEDLEY_IMPORT __declspec(dllimport) +#else +# if \ + JSON_HEDLEY_HAS_ATTRIBUTE(visibility) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(3,3,0) || \ + JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + JSON_HEDLEY_IBM_VERSION_CHECK(13,1,0) || \ + ( \ + defined(__TI_EABI__) && \ + ( \ + (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) \ + ) \ + ) || \ + JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) +# define JSON_HEDLEY_PRIVATE __attribute__((__visibility__("hidden"))) +# define JSON_HEDLEY_PUBLIC __attribute__((__visibility__("default"))) +# else +# define JSON_HEDLEY_PRIVATE +# define JSON_HEDLEY_PUBLIC +# endif +# define JSON_HEDLEY_IMPORT extern +#endif + +#if defined(JSON_HEDLEY_NO_THROW) + #undef JSON_HEDLEY_NO_THROW +#endif +#if \ + JSON_HEDLEY_HAS_ATTRIBUTE(nothrow) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(3,3,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) + #define JSON_HEDLEY_NO_THROW __attribute__((__nothrow__)) +#elif \ + JSON_HEDLEY_MSVC_VERSION_CHECK(13,1,0) || \ + JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) + #define JSON_HEDLEY_NO_THROW __declspec(nothrow) +#else + #define JSON_HEDLEY_NO_THROW +#endif + +#if 
defined(JSON_HEDLEY_FALL_THROUGH) + #undef JSON_HEDLEY_FALL_THROUGH +#endif +#if \ + JSON_HEDLEY_HAS_ATTRIBUTE(fallthrough) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(7,0,0) || \ + JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) + #define JSON_HEDLEY_FALL_THROUGH __attribute__((__fallthrough__)) +#elif JSON_HEDLEY_HAS_CPP_ATTRIBUTE_NS(clang,fallthrough) + #define JSON_HEDLEY_FALL_THROUGH JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[clang::fallthrough]]) +#elif JSON_HEDLEY_HAS_CPP_ATTRIBUTE(fallthrough) + #define JSON_HEDLEY_FALL_THROUGH JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[fallthrough]]) +#elif defined(__fallthrough) /* SAL */ + #define JSON_HEDLEY_FALL_THROUGH __fallthrough +#else + #define JSON_HEDLEY_FALL_THROUGH +#endif + +#if defined(JSON_HEDLEY_RETURNS_NON_NULL) + #undef JSON_HEDLEY_RETURNS_NON_NULL +#endif +#if \ + JSON_HEDLEY_HAS_ATTRIBUTE(returns_nonnull) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(4,9,0) || \ + JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) + #define JSON_HEDLEY_RETURNS_NON_NULL __attribute__((__returns_nonnull__)) +#elif defined(_Ret_notnull_) /* SAL */ + #define JSON_HEDLEY_RETURNS_NON_NULL _Ret_notnull_ +#else + #define JSON_HEDLEY_RETURNS_NON_NULL +#endif + +#if defined(JSON_HEDLEY_ARRAY_PARAM) + #undef JSON_HEDLEY_ARRAY_PARAM +#endif +#if \ + defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) && \ + !defined(__STDC_NO_VLA__) && \ + !defined(__cplusplus) && \ + !defined(JSON_HEDLEY_PGI_VERSION) && \ + !defined(JSON_HEDLEY_TINYC_VERSION) + #define JSON_HEDLEY_ARRAY_PARAM(name) (name) +#else + #define JSON_HEDLEY_ARRAY_PARAM(name) +#endif + +#if defined(JSON_HEDLEY_IS_CONSTANT) + #undef JSON_HEDLEY_IS_CONSTANT +#endif +#if defined(JSON_HEDLEY_REQUIRE_CONSTEXPR) + #undef JSON_HEDLEY_REQUIRE_CONSTEXPR +#endif +/* JSON_HEDLEY_IS_CONSTEXPR_ is for + HEDLEY INTERNAL USE ONLY. API subject to change without notice. */ +#if defined(JSON_HEDLEY_IS_CONSTEXPR_) + #undef JSON_HEDLEY_IS_CONSTEXPR_ +#endif +#if \ + JSON_HEDLEY_HAS_BUILTIN(__builtin_constant_p) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(3,4,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_TINYC_VERSION_CHECK(0,9,19) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + JSON_HEDLEY_IBM_VERSION_CHECK(13,1,0) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(6,1,0) || \ + (JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,10,0) && !defined(__cplusplus)) || \ + JSON_HEDLEY_CRAY_VERSION_CHECK(8,1,0) || \ + JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) + #define JSON_HEDLEY_IS_CONSTANT(expr) __builtin_constant_p(expr) +#endif +#if !defined(__cplusplus) +# if \ + JSON_HEDLEY_HAS_BUILTIN(__builtin_types_compatible_p) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(3,4,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_IBM_VERSION_CHECK(13,1,0) || \ + JSON_HEDLEY_CRAY_VERSION_CHECK(8,1,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(5,4,0) || \ + JSON_HEDLEY_TINYC_VERSION_CHECK(0,9,24) +#if defined(__INTPTR_TYPE__) + #define JSON_HEDLEY_IS_CONSTEXPR_(expr) __builtin_types_compatible_p(__typeof__((1 ? (void*) ((__INTPTR_TYPE__) ((expr) * 0)) : (int*) 0)), int*) +#else + #include <stdint.h> + #define JSON_HEDLEY_IS_CONSTEXPR_(expr) __builtin_types_compatible_p(__typeof__((1 ? 
(void*) ((intptr_t) ((expr) * 0)) : (int*) 0)), int*) +#endif +# elif \ + ( \ + defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && \ + !defined(JSON_HEDLEY_SUNPRO_VERSION) && \ + !defined(JSON_HEDLEY_PGI_VERSION) && \ + !defined(JSON_HEDLEY_IAR_VERSION)) || \ + (JSON_HEDLEY_HAS_EXTENSION(c_generic_selections) && !defined(JSON_HEDLEY_IAR_VERSION)) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(4,9,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(17,0,0) || \ + JSON_HEDLEY_IBM_VERSION_CHECK(12,1,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(5,3,0) +#if defined(__INTPTR_TYPE__) + #define JSON_HEDLEY_IS_CONSTEXPR_(expr) _Generic((1 ? (void*) ((__INTPTR_TYPE__) ((expr) * 0)) : (int*) 0), int*: 1, void*: 0) +#else + #include <stdint.h> + #define JSON_HEDLEY_IS_CONSTEXPR_(expr) _Generic((1 ? (void*) ((intptr_t) * 0) : (int*) 0), int*: 1, void*: 0) +#endif +# elif \ + defined(JSON_HEDLEY_GCC_VERSION) || \ + defined(JSON_HEDLEY_INTEL_VERSION) || \ + defined(JSON_HEDLEY_TINYC_VERSION) || \ + defined(JSON_HEDLEY_TI_ARMCL_VERSION) || \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(18,12,0) || \ + defined(JSON_HEDLEY_TI_CL2000_VERSION) || \ + defined(JSON_HEDLEY_TI_CL6X_VERSION) || \ + defined(JSON_HEDLEY_TI_CL7X_VERSION) || \ + defined(JSON_HEDLEY_TI_CLPRU_VERSION) || \ + defined(__clang__) +# define JSON_HEDLEY_IS_CONSTEXPR_(expr) ( \ + sizeof(void) != \ + sizeof(*( \ + 1 ? \ + ((void*) ((expr) * 0L) ) : \ +((struct { char v[sizeof(void) * 2]; } *) 1) \ + ) \ + ) \ + ) +# endif +#endif +#if defined(JSON_HEDLEY_IS_CONSTEXPR_) + #if !defined(JSON_HEDLEY_IS_CONSTANT) + #define JSON_HEDLEY_IS_CONSTANT(expr) JSON_HEDLEY_IS_CONSTEXPR_(expr) + #endif + #define JSON_HEDLEY_REQUIRE_CONSTEXPR(expr) (JSON_HEDLEY_IS_CONSTEXPR_(expr) ? (expr) : (-1)) +#else + #if !defined(JSON_HEDLEY_IS_CONSTANT) + #define JSON_HEDLEY_IS_CONSTANT(expr) (0) + #endif + #define JSON_HEDLEY_REQUIRE_CONSTEXPR(expr) (expr) +#endif + +#if defined(JSON_HEDLEY_BEGIN_C_DECLS) + #undef JSON_HEDLEY_BEGIN_C_DECLS +#endif +#if defined(JSON_HEDLEY_END_C_DECLS) + #undef JSON_HEDLEY_END_C_DECLS +#endif +#if defined(JSON_HEDLEY_C_DECL) + #undef JSON_HEDLEY_C_DECL +#endif +#if defined(__cplusplus) + #define JSON_HEDLEY_BEGIN_C_DECLS extern "C" { + #define JSON_HEDLEY_END_C_DECLS } + #define JSON_HEDLEY_C_DECL extern "C" +#else + #define JSON_HEDLEY_BEGIN_C_DECLS + #define JSON_HEDLEY_END_C_DECLS + #define JSON_HEDLEY_C_DECL +#endif + +#if defined(JSON_HEDLEY_STATIC_ASSERT) + #undef JSON_HEDLEY_STATIC_ASSERT +#endif +#if \ + !defined(__cplusplus) && ( \ + (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) || \ + (JSON_HEDLEY_HAS_FEATURE(c_static_assert) && !defined(JSON_HEDLEY_INTEL_CL_VERSION)) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(6,0,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + defined(_Static_assert) \ + ) +# define JSON_HEDLEY_STATIC_ASSERT(expr, message) _Static_assert(expr, message) +#elif \ + (defined(__cplusplus) && (__cplusplus >= 201103L)) || \ + JSON_HEDLEY_MSVC_VERSION_CHECK(16,0,0) || \ + JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) +# define JSON_HEDLEY_STATIC_ASSERT(expr, message) JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(static_assert(expr, message)) +#else +# define JSON_HEDLEY_STATIC_ASSERT(expr, message) +#endif + +#if defined(JSON_HEDLEY_NULL) + #undef JSON_HEDLEY_NULL +#endif +#if defined(__cplusplus) + #if __cplusplus >= 201103L + #define JSON_HEDLEY_NULL JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(nullptr) + #elif defined(NULL) + #define JSON_HEDLEY_NULL NULL + #else + #define JSON_HEDLEY_NULL 
JSON_HEDLEY_STATIC_CAST(void*, 0) + #endif +#elif defined(NULL) + #define JSON_HEDLEY_NULL NULL +#else + #define JSON_HEDLEY_NULL ((void*) 0) +#endif + +#if defined(JSON_HEDLEY_MESSAGE) + #undef JSON_HEDLEY_MESSAGE +#endif +#if JSON_HEDLEY_HAS_WARNING("-Wunknown-pragmas") +# define JSON_HEDLEY_MESSAGE(msg) \ + JSON_HEDLEY_DIAGNOSTIC_PUSH \ + JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS \ + JSON_HEDLEY_PRAGMA(message msg) \ + JSON_HEDLEY_DIAGNOSTIC_POP +#elif \ + JSON_HEDLEY_GCC_VERSION_CHECK(4,4,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) +# define JSON_HEDLEY_MESSAGE(msg) JSON_HEDLEY_PRAGMA(message msg) +#elif JSON_HEDLEY_CRAY_VERSION_CHECK(5,0,0) +# define JSON_HEDLEY_MESSAGE(msg) JSON_HEDLEY_PRAGMA(_CRI message msg) +#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) +# define JSON_HEDLEY_MESSAGE(msg) JSON_HEDLEY_PRAGMA(message(msg)) +#elif JSON_HEDLEY_PELLES_VERSION_CHECK(2,0,0) +# define JSON_HEDLEY_MESSAGE(msg) JSON_HEDLEY_PRAGMA(message(msg)) +#else +# define JSON_HEDLEY_MESSAGE(msg) +#endif + +#if defined(JSON_HEDLEY_WARNING) + #undef JSON_HEDLEY_WARNING +#endif +#if JSON_HEDLEY_HAS_WARNING("-Wunknown-pragmas") +# define JSON_HEDLEY_WARNING(msg) \ + JSON_HEDLEY_DIAGNOSTIC_PUSH \ + JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS \ + JSON_HEDLEY_PRAGMA(clang warning msg) \ + JSON_HEDLEY_DIAGNOSTIC_POP +#elif \ + JSON_HEDLEY_GCC_VERSION_CHECK(4,8,0) || \ + JSON_HEDLEY_PGI_VERSION_CHECK(18,4,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) +# define JSON_HEDLEY_WARNING(msg) JSON_HEDLEY_PRAGMA(GCC warning msg) +#elif \ + JSON_HEDLEY_MSVC_VERSION_CHECK(15,0,0) || \ + JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) +# define JSON_HEDLEY_WARNING(msg) JSON_HEDLEY_PRAGMA(message(msg)) +#else +# define JSON_HEDLEY_WARNING(msg) JSON_HEDLEY_MESSAGE(msg) +#endif + +#if defined(JSON_HEDLEY_REQUIRE) + #undef JSON_HEDLEY_REQUIRE +#endif +#if defined(JSON_HEDLEY_REQUIRE_MSG) + #undef JSON_HEDLEY_REQUIRE_MSG +#endif +#if JSON_HEDLEY_HAS_ATTRIBUTE(diagnose_if) +# if JSON_HEDLEY_HAS_WARNING("-Wgcc-compat") +# define JSON_HEDLEY_REQUIRE(expr) \ + JSON_HEDLEY_DIAGNOSTIC_PUSH \ + _Pragma("clang diagnostic ignored \"-Wgcc-compat\"") \ + __attribute__((diagnose_if(!(expr), #expr, "error"))) \ + JSON_HEDLEY_DIAGNOSTIC_POP +# define JSON_HEDLEY_REQUIRE_MSG(expr,msg) \ + JSON_HEDLEY_DIAGNOSTIC_PUSH \ + _Pragma("clang diagnostic ignored \"-Wgcc-compat\"") \ + __attribute__((diagnose_if(!(expr), msg, "error"))) \ + JSON_HEDLEY_DIAGNOSTIC_POP +# else +# define JSON_HEDLEY_REQUIRE(expr) __attribute__((diagnose_if(!(expr), #expr, "error"))) +# define JSON_HEDLEY_REQUIRE_MSG(expr,msg) __attribute__((diagnose_if(!(expr), msg, "error"))) +# endif +#else +# define JSON_HEDLEY_REQUIRE(expr) +# define JSON_HEDLEY_REQUIRE_MSG(expr,msg) +#endif + +#if defined(JSON_HEDLEY_FLAGS) + #undef JSON_HEDLEY_FLAGS +#endif +#if JSON_HEDLEY_HAS_ATTRIBUTE(flag_enum) && (!defined(__cplusplus) || JSON_HEDLEY_HAS_WARNING("-Wbitfield-enum-conversion")) + #define JSON_HEDLEY_FLAGS __attribute__((__flag_enum__)) +#else + #define JSON_HEDLEY_FLAGS +#endif + +#if defined(JSON_HEDLEY_FLAGS_CAST) + #undef JSON_HEDLEY_FLAGS_CAST +#endif +#if JSON_HEDLEY_INTEL_VERSION_CHECK(19,0,0) +# define JSON_HEDLEY_FLAGS_CAST(T, expr) (__extension__ ({ \ + JSON_HEDLEY_DIAGNOSTIC_PUSH \ + _Pragma("warning(disable:188)") \ + ((T) (expr)); \ + JSON_HEDLEY_DIAGNOSTIC_POP \ + })) +#else +# define JSON_HEDLEY_FLAGS_CAST(T, expr) JSON_HEDLEY_STATIC_CAST(T, expr) +#endif + +#if defined(JSON_HEDLEY_EMPTY_BASES) + #undef JSON_HEDLEY_EMPTY_BASES +#endif +#if \ + 
(JSON_HEDLEY_MSVC_VERSION_CHECK(19,0,23918) && !JSON_HEDLEY_MSVC_VERSION_CHECK(20,0,0)) || \ + JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) + #define JSON_HEDLEY_EMPTY_BASES __declspec(empty_bases) +#else + #define JSON_HEDLEY_EMPTY_BASES +#endif + +/* Remaining macros are deprecated. */ + +#if defined(JSON_HEDLEY_GCC_NOT_CLANG_VERSION_CHECK) + #undef JSON_HEDLEY_GCC_NOT_CLANG_VERSION_CHECK +#endif +#if defined(__clang__) + #define JSON_HEDLEY_GCC_NOT_CLANG_VERSION_CHECK(major,minor,patch) (0) +#else + #define JSON_HEDLEY_GCC_NOT_CLANG_VERSION_CHECK(major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_CLANG_HAS_ATTRIBUTE) + #undef JSON_HEDLEY_CLANG_HAS_ATTRIBUTE +#endif +#define JSON_HEDLEY_CLANG_HAS_ATTRIBUTE(attribute) JSON_HEDLEY_HAS_ATTRIBUTE(attribute) + +#if defined(JSON_HEDLEY_CLANG_HAS_CPP_ATTRIBUTE) + #undef JSON_HEDLEY_CLANG_HAS_CPP_ATTRIBUTE +#endif +#define JSON_HEDLEY_CLANG_HAS_CPP_ATTRIBUTE(attribute) JSON_HEDLEY_HAS_CPP_ATTRIBUTE(attribute) + +#if defined(JSON_HEDLEY_CLANG_HAS_BUILTIN) + #undef JSON_HEDLEY_CLANG_HAS_BUILTIN +#endif +#define JSON_HEDLEY_CLANG_HAS_BUILTIN(builtin) JSON_HEDLEY_HAS_BUILTIN(builtin) + +#if defined(JSON_HEDLEY_CLANG_HAS_FEATURE) + #undef JSON_HEDLEY_CLANG_HAS_FEATURE +#endif +#define JSON_HEDLEY_CLANG_HAS_FEATURE(feature) JSON_HEDLEY_HAS_FEATURE(feature) + +#if defined(JSON_HEDLEY_CLANG_HAS_EXTENSION) + #undef JSON_HEDLEY_CLANG_HAS_EXTENSION +#endif +#define JSON_HEDLEY_CLANG_HAS_EXTENSION(extension) JSON_HEDLEY_HAS_EXTENSION(extension) + +#if defined(JSON_HEDLEY_CLANG_HAS_DECLSPEC_DECLSPEC_ATTRIBUTE) + #undef JSON_HEDLEY_CLANG_HAS_DECLSPEC_DECLSPEC_ATTRIBUTE +#endif +#define JSON_HEDLEY_CLANG_HAS_DECLSPEC_ATTRIBUTE(attribute) JSON_HEDLEY_HAS_DECLSPEC_ATTRIBUTE(attribute) + +#if defined(JSON_HEDLEY_CLANG_HAS_WARNING) + #undef JSON_HEDLEY_CLANG_HAS_WARNING +#endif +#define JSON_HEDLEY_CLANG_HAS_WARNING(warning) JSON_HEDLEY_HAS_WARNING(warning) + +#endif /* !defined(JSON_HEDLEY_VERSION) || (JSON_HEDLEY_VERSION < X) */ + + +// This file contains all internal macro definitions (except those affecting ABI) +// You MUST include macro_unscope.hpp at the end of json.hpp to undef all of them + +// #include + + +// exclude unsupported compilers +#if !defined(JSON_SKIP_UNSUPPORTED_COMPILER_CHECK) + #if defined(__clang__) + #if (__clang_major__ * 10000 + __clang_minor__ * 100 + __clang_patchlevel__) < 30400 + #error "unsupported Clang version - see https://github.com/nlohmann/json#supported-compilers" + #endif + #elif defined(__GNUC__) && !(defined(__ICC) || defined(__INTEL_COMPILER)) + #if (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) < 40800 + #error "unsupported GCC version - see https://github.com/nlohmann/json#supported-compilers" + #endif + #endif +#endif + +// C++ language standard detection +// if the user manually specified the used c++ version this is skipped +#if !defined(JSON_HAS_CPP_20) && !defined(JSON_HAS_CPP_17) && !defined(JSON_HAS_CPP_14) && !defined(JSON_HAS_CPP_11) + #if (defined(__cplusplus) && __cplusplus >= 202002L) || (defined(_MSVC_LANG) && _MSVC_LANG >= 202002L) + #define JSON_HAS_CPP_20 + #define JSON_HAS_CPP_17 + #define JSON_HAS_CPP_14 + #elif (defined(__cplusplus) && __cplusplus >= 201703L) || (defined(_HAS_CXX17) && _HAS_CXX17 == 1) // fix for issue #464 + #define JSON_HAS_CPP_17 + #define JSON_HAS_CPP_14 + #elif (defined(__cplusplus) && __cplusplus >= 201402L) || (defined(_HAS_CXX14) && _HAS_CXX14 == 1) + #define JSON_HAS_CPP_14 + #endif + // the cpp 11 flag is 
always specified because it is the minimal required version + #define JSON_HAS_CPP_11 +#endif + +#ifdef __has_include + #if __has_include() + #include + #endif +#endif + +#if !defined(JSON_HAS_FILESYSTEM) && !defined(JSON_HAS_EXPERIMENTAL_FILESYSTEM) + #ifdef JSON_HAS_CPP_17 + #if defined(__cpp_lib_filesystem) + #define JSON_HAS_FILESYSTEM 1 + #elif defined(__cpp_lib_experimental_filesystem) + #define JSON_HAS_EXPERIMENTAL_FILESYSTEM 1 + #elif !defined(__has_include) + #define JSON_HAS_EXPERIMENTAL_FILESYSTEM 1 + #elif __has_include() + #define JSON_HAS_FILESYSTEM 1 + #elif __has_include() + #define JSON_HAS_EXPERIMENTAL_FILESYSTEM 1 + #endif + + // std::filesystem does not work on MinGW GCC 8: https://sourceforge.net/p/mingw-w64/bugs/737/ + #if defined(__MINGW32__) && defined(__GNUC__) && __GNUC__ == 8 + #undef JSON_HAS_FILESYSTEM + #undef JSON_HAS_EXPERIMENTAL_FILESYSTEM + #endif + + // no filesystem support before GCC 8: https://en.cppreference.com/w/cpp/compiler_support + #if defined(__GNUC__) && !defined(__clang__) && __GNUC__ < 8 + #undef JSON_HAS_FILESYSTEM + #undef JSON_HAS_EXPERIMENTAL_FILESYSTEM + #endif + + // no filesystem support before Clang 7: https://en.cppreference.com/w/cpp/compiler_support + #if defined(__clang_major__) && __clang_major__ < 7 + #undef JSON_HAS_FILESYSTEM + #undef JSON_HAS_EXPERIMENTAL_FILESYSTEM + #endif + + // no filesystem support before MSVC 19.14: https://en.cppreference.com/w/cpp/compiler_support + #if defined(_MSC_VER) && _MSC_VER < 1914 + #undef JSON_HAS_FILESYSTEM + #undef JSON_HAS_EXPERIMENTAL_FILESYSTEM + #endif + + // no filesystem support before iOS 13 + #if defined(__IPHONE_OS_VERSION_MIN_REQUIRED) && __IPHONE_OS_VERSION_MIN_REQUIRED < 130000 + #undef JSON_HAS_FILESYSTEM + #undef JSON_HAS_EXPERIMENTAL_FILESYSTEM + #endif + + // no filesystem support before macOS Catalina + #if defined(__MAC_OS_X_VERSION_MIN_REQUIRED) && __MAC_OS_X_VERSION_MIN_REQUIRED < 101500 + #undef JSON_HAS_FILESYSTEM + #undef JSON_HAS_EXPERIMENTAL_FILESYSTEM + #endif + #endif +#endif + +#ifndef JSON_HAS_EXPERIMENTAL_FILESYSTEM + #define JSON_HAS_EXPERIMENTAL_FILESYSTEM 0 +#endif + +#ifndef JSON_HAS_FILESYSTEM + #define JSON_HAS_FILESYSTEM 0 +#endif + +#ifndef JSON_HAS_THREE_WAY_COMPARISON + #if defined(__cpp_impl_three_way_comparison) && __cpp_impl_three_way_comparison >= 201907L \ + && defined(__cpp_lib_three_way_comparison) && __cpp_lib_three_way_comparison >= 201907L + #define JSON_HAS_THREE_WAY_COMPARISON 1 + #else + #define JSON_HAS_THREE_WAY_COMPARISON 0 + #endif +#endif + +#ifndef JSON_HAS_RANGES + // ranges header shipping in GCC 11.1.0 (released 2021-04-27) has syntax error + #if defined(__GLIBCXX__) && __GLIBCXX__ == 20210427 + #define JSON_HAS_RANGES 0 + #elif defined(__cpp_lib_ranges) + #define JSON_HAS_RANGES 1 + #else + #define JSON_HAS_RANGES 0 + #endif +#endif + +#ifndef JSON_HAS_STATIC_RTTI + #if !defined(_HAS_STATIC_RTTI) || _HAS_STATIC_RTTI != 0 + #define JSON_HAS_STATIC_RTTI 1 + #else + #define JSON_HAS_STATIC_RTTI 0 + #endif +#endif + +#ifdef JSON_HAS_CPP_17 + #define JSON_INLINE_VARIABLE inline +#else + #define JSON_INLINE_VARIABLE +#endif + +#if JSON_HEDLEY_HAS_ATTRIBUTE(no_unique_address) + #define JSON_NO_UNIQUE_ADDRESS [[no_unique_address]] +#else + #define JSON_NO_UNIQUE_ADDRESS +#endif + +// disable documentation warnings on clang +#if defined(__clang__) + #pragma clang diagnostic push + #pragma clang diagnostic ignored "-Wdocumentation" + #pragma clang diagnostic ignored "-Wdocumentation-unknown-command" +#endif + +// allow disabling 
exceptions +#if (defined(__cpp_exceptions) || defined(__EXCEPTIONS) || defined(_CPPUNWIND)) && !defined(JSON_NOEXCEPTION) + #define JSON_THROW(exception) throw exception + #define JSON_TRY try + #define JSON_CATCH(exception) catch(exception) + #define JSON_INTERNAL_CATCH(exception) catch(exception) +#else + #include + #define JSON_THROW(exception) std::abort() + #define JSON_TRY if(true) + #define JSON_CATCH(exception) if(false) + #define JSON_INTERNAL_CATCH(exception) if(false) +#endif + +// override exception macros +#if defined(JSON_THROW_USER) + #undef JSON_THROW + #define JSON_THROW JSON_THROW_USER +#endif +#if defined(JSON_TRY_USER) + #undef JSON_TRY + #define JSON_TRY JSON_TRY_USER +#endif +#if defined(JSON_CATCH_USER) + #undef JSON_CATCH + #define JSON_CATCH JSON_CATCH_USER + #undef JSON_INTERNAL_CATCH + #define JSON_INTERNAL_CATCH JSON_CATCH_USER +#endif +#if defined(JSON_INTERNAL_CATCH_USER) + #undef JSON_INTERNAL_CATCH + #define JSON_INTERNAL_CATCH JSON_INTERNAL_CATCH_USER +#endif + +// allow overriding assert +#if !defined(JSON_ASSERT) + #include // assert + #define JSON_ASSERT(x) assert(x) +#endif + +// allow to access some private functions (needed by the test suite) +#if defined(JSON_TESTS_PRIVATE) + #define JSON_PRIVATE_UNLESS_TESTED public +#else + #define JSON_PRIVATE_UNLESS_TESTED private +#endif + +/*! +@brief macro to briefly define a mapping between an enum and JSON +@def NLOHMANN_JSON_SERIALIZE_ENUM +@since version 3.4.0 +*/ +#define NLOHMANN_JSON_SERIALIZE_ENUM(ENUM_TYPE, ...) \ + template \ + inline void to_json(BasicJsonType& j, const ENUM_TYPE& e) \ + { \ + /* NOLINTNEXTLINE(modernize-type-traits) we use C++11 */ \ + static_assert(std::is_enum::value, #ENUM_TYPE " must be an enum!"); \ + /* NOLINTNEXTLINE(modernize-avoid-c-arrays) we don't want to depend on */ \ + static const std::pair m[] = __VA_ARGS__; \ + auto it = std::find_if(std::begin(m), std::end(m), \ + [e](const std::pair& ej_pair) -> bool \ + { \ + return ej_pair.first == e; \ + }); \ + j = ((it != std::end(m)) ? it : std::begin(m))->second; \ + } \ + template \ + inline void from_json(const BasicJsonType& j, ENUM_TYPE& e) \ + { \ + /* NOLINTNEXTLINE(modernize-type-traits) we use C++11 */ \ + static_assert(std::is_enum::value, #ENUM_TYPE " must be an enum!"); \ + /* NOLINTNEXTLINE(modernize-avoid-c-arrays) we don't want to depend on */ \ + static const std::pair m[] = __VA_ARGS__; \ + auto it = std::find_if(std::begin(m), std::end(m), \ + [&j](const std::pair& ej_pair) -> bool \ + { \ + return ej_pair.second == j; \ + }); \ + e = ((it != std::end(m)) ? it : std::begin(m))->first; \ + } + +// Ugly macros to avoid uglier copy-paste when specializing basic_json. They +// may be removed in the future once the class is split. 
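Editorial note: for reference, a minimal user-side sketch of the NLOHMANN_JSON_SERIALIZE_ENUM macro defined above (the TaskState enum, its string names, and the include path are illustrative only; this snippet is not part of the vendored header):

#include <nlohmann/json.hpp>

enum class TaskState { stopped, running, invalid = -1 };

// The first pair acts as the fallback for unknown enum values / unknown strings.
NLOHMANN_JSON_SERIALIZE_ENUM(TaskState, {
    {TaskState::invalid, nullptr},
    {TaskState::stopped, "stopped"},
    {TaskState::running, "running"},
})

int main()
{
    nlohmann::json j = TaskState::running;                    // serializes to "running"
    TaskState s = j.get<TaskState>();                         // back to TaskState::running
    TaskState u = nlohmann::json("bogus").get<TaskState>();   // unknown string -> TaskState::invalid
    return (s == TaskState::running && u == TaskState::invalid) ? 0 : 1;
}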
+ +#define NLOHMANN_BASIC_JSON_TPL_DECLARATION \ + template class ObjectType, \ + template class ArrayType, \ + class StringType, class BooleanType, class NumberIntegerType, \ + class NumberUnsignedType, class NumberFloatType, \ + template class AllocatorType, \ + template class JSONSerializer, \ + class BinaryType, \ + class CustomBaseClass> + +#define NLOHMANN_BASIC_JSON_TPL \ + basic_json + +// Macros to simplify conversion from/to types + +#define NLOHMANN_JSON_EXPAND( x ) x +#define NLOHMANN_JSON_GET_MACRO(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41, _42, _43, _44, _45, _46, _47, _48, _49, _50, _51, _52, _53, _54, _55, _56, _57, _58, _59, _60, _61, _62, _63, _64, NAME,...) NAME +#define NLOHMANN_JSON_PASTE(...) NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_GET_MACRO(__VA_ARGS__, \ + NLOHMANN_JSON_PASTE64, \ + NLOHMANN_JSON_PASTE63, \ + NLOHMANN_JSON_PASTE62, \ + NLOHMANN_JSON_PASTE61, \ + NLOHMANN_JSON_PASTE60, \ + NLOHMANN_JSON_PASTE59, \ + NLOHMANN_JSON_PASTE58, \ + NLOHMANN_JSON_PASTE57, \ + NLOHMANN_JSON_PASTE56, \ + NLOHMANN_JSON_PASTE55, \ + NLOHMANN_JSON_PASTE54, \ + NLOHMANN_JSON_PASTE53, \ + NLOHMANN_JSON_PASTE52, \ + NLOHMANN_JSON_PASTE51, \ + NLOHMANN_JSON_PASTE50, \ + NLOHMANN_JSON_PASTE49, \ + NLOHMANN_JSON_PASTE48, \ + NLOHMANN_JSON_PASTE47, \ + NLOHMANN_JSON_PASTE46, \ + NLOHMANN_JSON_PASTE45, \ + NLOHMANN_JSON_PASTE44, \ + NLOHMANN_JSON_PASTE43, \ + NLOHMANN_JSON_PASTE42, \ + NLOHMANN_JSON_PASTE41, \ + NLOHMANN_JSON_PASTE40, \ + NLOHMANN_JSON_PASTE39, \ + NLOHMANN_JSON_PASTE38, \ + NLOHMANN_JSON_PASTE37, \ + NLOHMANN_JSON_PASTE36, \ + NLOHMANN_JSON_PASTE35, \ + NLOHMANN_JSON_PASTE34, \ + NLOHMANN_JSON_PASTE33, \ + NLOHMANN_JSON_PASTE32, \ + NLOHMANN_JSON_PASTE31, \ + NLOHMANN_JSON_PASTE30, \ + NLOHMANN_JSON_PASTE29, \ + NLOHMANN_JSON_PASTE28, \ + NLOHMANN_JSON_PASTE27, \ + NLOHMANN_JSON_PASTE26, \ + NLOHMANN_JSON_PASTE25, \ + NLOHMANN_JSON_PASTE24, \ + NLOHMANN_JSON_PASTE23, \ + NLOHMANN_JSON_PASTE22, \ + NLOHMANN_JSON_PASTE21, \ + NLOHMANN_JSON_PASTE20, \ + NLOHMANN_JSON_PASTE19, \ + NLOHMANN_JSON_PASTE18, \ + NLOHMANN_JSON_PASTE17, \ + NLOHMANN_JSON_PASTE16, \ + NLOHMANN_JSON_PASTE15, \ + NLOHMANN_JSON_PASTE14, \ + NLOHMANN_JSON_PASTE13, \ + NLOHMANN_JSON_PASTE12, \ + NLOHMANN_JSON_PASTE11, \ + NLOHMANN_JSON_PASTE10, \ + NLOHMANN_JSON_PASTE9, \ + NLOHMANN_JSON_PASTE8, \ + NLOHMANN_JSON_PASTE7, \ + NLOHMANN_JSON_PASTE6, \ + NLOHMANN_JSON_PASTE5, \ + NLOHMANN_JSON_PASTE4, \ + NLOHMANN_JSON_PASTE3, \ + NLOHMANN_JSON_PASTE2, \ + NLOHMANN_JSON_PASTE1)(__VA_ARGS__)) +#define NLOHMANN_JSON_PASTE2(func, v1) func(v1) +#define NLOHMANN_JSON_PASTE3(func, v1, v2) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE2(func, v2) +#define NLOHMANN_JSON_PASTE4(func, v1, v2, v3) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE3(func, v2, v3) +#define NLOHMANN_JSON_PASTE5(func, v1, v2, v3, v4) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE4(func, v2, v3, v4) +#define NLOHMANN_JSON_PASTE6(func, v1, v2, v3, v4, v5) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE5(func, v2, v3, v4, v5) +#define NLOHMANN_JSON_PASTE7(func, v1, v2, v3, v4, v5, v6) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE6(func, v2, v3, v4, v5, v6) +#define NLOHMANN_JSON_PASTE8(func, v1, v2, v3, v4, v5, v6, v7) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE7(func, v2, v3, v4, v5, v6, v7) +#define NLOHMANN_JSON_PASTE9(func, v1, v2, v3, v4, v5, v6, v7, v8) 
NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE8(func, v2, v3, v4, v5, v6, v7, v8) +#define NLOHMANN_JSON_PASTE10(func, v1, v2, v3, v4, v5, v6, v7, v8, v9) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE9(func, v2, v3, v4, v5, v6, v7, v8, v9) +#define NLOHMANN_JSON_PASTE11(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE10(func, v2, v3, v4, v5, v6, v7, v8, v9, v10) +#define NLOHMANN_JSON_PASTE12(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE11(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11) +#define NLOHMANN_JSON_PASTE13(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE12(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12) +#define NLOHMANN_JSON_PASTE14(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE13(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13) +#define NLOHMANN_JSON_PASTE15(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE14(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14) +#define NLOHMANN_JSON_PASTE16(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE15(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15) +#define NLOHMANN_JSON_PASTE17(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE16(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16) +#define NLOHMANN_JSON_PASTE18(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE17(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17) +#define NLOHMANN_JSON_PASTE19(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE18(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18) +#define NLOHMANN_JSON_PASTE20(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE19(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19) +#define NLOHMANN_JSON_PASTE21(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE20(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20) +#define NLOHMANN_JSON_PASTE22(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE21(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21) +#define NLOHMANN_JSON_PASTE23(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE22(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22) +#define NLOHMANN_JSON_PASTE24(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE23(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, 
v16, v17, v18, v19, v20, v21, v22, v23) +#define NLOHMANN_JSON_PASTE25(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE24(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24) +#define NLOHMANN_JSON_PASTE26(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE25(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25) +#define NLOHMANN_JSON_PASTE27(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE26(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26) +#define NLOHMANN_JSON_PASTE28(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE27(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27) +#define NLOHMANN_JSON_PASTE29(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE28(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28) +#define NLOHMANN_JSON_PASTE30(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE29(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29) +#define NLOHMANN_JSON_PASTE31(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE30(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30) +#define NLOHMANN_JSON_PASTE32(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE31(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31) +#define NLOHMANN_JSON_PASTE33(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE32(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32) +#define NLOHMANN_JSON_PASTE34(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE33(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, 
v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33) +#define NLOHMANN_JSON_PASTE35(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE34(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34) +#define NLOHMANN_JSON_PASTE36(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE35(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35) +#define NLOHMANN_JSON_PASTE37(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE36(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36) +#define NLOHMANN_JSON_PASTE38(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE37(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37) +#define NLOHMANN_JSON_PASTE39(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE38(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38) +#define NLOHMANN_JSON_PASTE40(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE39(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39) +#define NLOHMANN_JSON_PASTE41(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE40(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40) +#define NLOHMANN_JSON_PASTE42(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE41(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, 
v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41) +#define NLOHMANN_JSON_PASTE43(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE42(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42) +#define NLOHMANN_JSON_PASTE44(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE43(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43) +#define NLOHMANN_JSON_PASTE45(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE44(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44) +#define NLOHMANN_JSON_PASTE46(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE45(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45) +#define NLOHMANN_JSON_PASTE47(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE46(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46) +#define NLOHMANN_JSON_PASTE48(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE47(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47) +#define NLOHMANN_JSON_PASTE49(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48) NLOHMANN_JSON_PASTE2(func, v1) 
NLOHMANN_JSON_PASTE48(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48) +#define NLOHMANN_JSON_PASTE50(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE49(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49) +#define NLOHMANN_JSON_PASTE51(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE50(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50) +#define NLOHMANN_JSON_PASTE52(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE51(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51) +#define NLOHMANN_JSON_PASTE53(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE52(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52) +#define NLOHMANN_JSON_PASTE54(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE53(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53) +#define NLOHMANN_JSON_PASTE55(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE54(func, v2, v3, v4, v5, v6, 
v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54) +#define NLOHMANN_JSON_PASTE56(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE55(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55) +#define NLOHMANN_JSON_PASTE57(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE56(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56) +#define NLOHMANN_JSON_PASTE58(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE57(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57) +#define NLOHMANN_JSON_PASTE59(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE58(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58) +#define NLOHMANN_JSON_PASTE60(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE59(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59) +#define NLOHMANN_JSON_PASTE61(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, 
v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE60(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60) +#define NLOHMANN_JSON_PASTE62(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE61(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61) +#define NLOHMANN_JSON_PASTE63(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE62(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62) +#define NLOHMANN_JSON_PASTE64(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE63(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63) + +#define NLOHMANN_JSON_TO(v1) nlohmann_json_j[#v1] = nlohmann_json_t.v1; +#define NLOHMANN_JSON_FROM(v1) nlohmann_json_j.at(#v1).get_to(nlohmann_json_t.v1); +#define NLOHMANN_JSON_FROM_WITH_DEFAULT(v1) nlohmann_json_t.v1 = !nlohmann_json_j.is_null() ? nlohmann_json_j.value(#v1, nlohmann_json_default_obj.v1) : nlohmann_json_default_obj.v1; + +/*! +@brief macro +@def NLOHMANN_DEFINE_TYPE_INTRUSIVE +@since version 3.9.0 +*/ +#define NLOHMANN_DEFINE_TYPE_INTRUSIVE(Type, ...) \ + friend void to_json(nlohmann::json& nlohmann_json_j, const Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_TO, __VA_ARGS__)) } \ + friend void from_json(const nlohmann::json& nlohmann_json_j, Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_FROM, __VA_ARGS__)) } + +/*! 
+@brief macro +@def NLOHMANN_DEFINE_TYPE_INTRUSIVE_WITH_DEFAULT +@since version 3.11.0 +*/ +#define NLOHMANN_DEFINE_TYPE_INTRUSIVE_WITH_DEFAULT(Type, ...) \ + friend void to_json(nlohmann::json& nlohmann_json_j, const Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_TO, __VA_ARGS__)) } \ + friend void from_json(const nlohmann::json& nlohmann_json_j, Type& nlohmann_json_t) { const Type nlohmann_json_default_obj{}; NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_FROM_WITH_DEFAULT, __VA_ARGS__)) } + +/*! +@brief macro +@def NLOHMANN_DEFINE_TYPE_INTRUSIVE_ONLY_SERIALIZE +@since version 3.11.x +*/ +#define NLOHMANN_DEFINE_TYPE_INTRUSIVE_ONLY_SERIALIZE(Type, ...) \ + friend void to_json(nlohmann::json& nlohmann_json_j, const Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_TO, __VA_ARGS__)) } + +/*! +@brief macro +@def NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE +@since version 3.9.0 +*/ +#define NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Type, ...) \ + inline void to_json(nlohmann::json& nlohmann_json_j, const Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_TO, __VA_ARGS__)) } \ + inline void from_json(const nlohmann::json& nlohmann_json_j, Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_FROM, __VA_ARGS__)) } + +/*! +@brief macro +@def NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE_WITH_DEFAULT +@since version 3.11.0 +*/ +#define NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE_WITH_DEFAULT(Type, ...) \ + inline void to_json(nlohmann::json& nlohmann_json_j, const Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_TO, __VA_ARGS__)) } \ + inline void from_json(const nlohmann::json& nlohmann_json_j, Type& nlohmann_json_t) { const Type nlohmann_json_default_obj{}; NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_FROM_WITH_DEFAULT, __VA_ARGS__)) } + +/*! +@brief macro +@def NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE_ONLY_SERIALIZE +@since version 3.11.x +*/ +#define NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE_ONLY_SERIALIZE(Type, ...) \ + inline void to_json(nlohmann::json& nlohmann_json_j, const Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_TO, __VA_ARGS__)) } + +/*! +@brief macro +@def NLOHMANN_DEFINE_DERIVED_TYPE_INTRUSIVE +@since version 3.11.x +*/ +#define NLOHMANN_DEFINE_DERIVED_TYPE_INTRUSIVE(Type, BaseType, ...) \ + friend void to_json(nlohmann::json& nlohmann_json_j, const Type& nlohmann_json_t) { nlohmann::to_json(nlohmann_json_j, static_cast(nlohmann_json_t)); NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_TO, __VA_ARGS__)) } \ + friend void from_json(const nlohmann::json& nlohmann_json_j, Type& nlohmann_json_t) { nlohmann::from_json(nlohmann_json_j, static_cast(nlohmann_json_t)); NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_FROM, __VA_ARGS__)) } + +#define NLOHMANN_DEFINE_DERIVED_TYPE_INTRUSIVE_WITH_DEFAULT(Type, BaseType, ...) \ + friend void to_json(nlohmann::json& nlohmann_json_j, const Type& nlohmann_json_t) { nlohmann::to_json(nlohmann_json_j, static_cast(nlohmann_json_t)); NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_TO, __VA_ARGS__)) } \ + friend void from_json(const nlohmann::json& nlohmann_json_j, Type& nlohmann_json_t) { nlohmann::from_json(nlohmann_json_j, static_cast(nlohmann_json_t)); const Type nlohmann_json_default_obj{}; NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_FROM_WITH_DEFAULT, __VA_ARGS__)) } + +#define NLOHMANN_DEFINE_DERIVED_TYPE_INTRUSIVE_ONLY_SERIALIZE(Type, BaseType, ...) 
\ + friend void to_json(nlohmann::json& nlohmann_json_j, const Type& nlohmann_json_t) { nlohmann::to_json(nlohmann_json_j, static_cast(nlohmann_json_t)); NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_TO, __VA_ARGS__)) } \ + +/*! +@brief macro +@def NLOHMANN_DEFINE_DERIVED_TYPE_NON_INTRUSIVE +@since version 3.11.x +*/ +#define NLOHMANN_DEFINE_DERIVED_TYPE_NON_INTRUSIVE(Type, BaseType, ...) \ + inline void to_json(nlohmann::json& nlohmann_json_j, const Type& nlohmann_json_t) { nlohmann::to_json(nlohmann_json_j, static_cast(nlohmann_json_t)); NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_TO, __VA_ARGS__)) } \ + inline void from_json(const nlohmann::json& nlohmann_json_j, Type& nlohmann_json_t) { nlohmann::from_json(nlohmann_json_j, static_cast(nlohmann_json_t)); NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_FROM, __VA_ARGS__)) } + +#define NLOHMANN_DEFINE_DERIVED_TYPE_NON_INTRUSIVE_WITH_DEFAULT(Type, BaseType, ...) \ + inline void to_json(nlohmann::json& nlohmann_json_j, const Type& nlohmann_json_t) { nlohmann::to_json(nlohmann_json_j, static_cast(nlohmann_json_t)); NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_TO, __VA_ARGS__)) } \ + inline void from_json(const nlohmann::json& nlohmann_json_j, Type& nlohmann_json_t) { nlohmann::from_json(nlohmann_json_j, static_cast(nlohmann_json_t)); const Type nlohmann_json_default_obj{}; NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_FROM_WITH_DEFAULT, __VA_ARGS__)) } + +#define NLOHMANN_DEFINE_DERIVED_TYPE_NON_INTRUSIVE_ONLY_SERIALIZE(Type, BaseType, ...) \ + inline void to_json(nlohmann::json& nlohmann_json_j, const Type& nlohmann_json_t) { nlohmann::to_json(nlohmann_json_j, static_cast(nlohmann_json_t)); NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_TO, __VA_ARGS__)) } \ + +// inspired from https://stackoverflow.com/a/26745591 +// allows to call any std function as if (e.g. with begin): +// using std::begin; begin(x); +// +// it allows using the detected idiom to retrieve the return type +// of such an expression +#define NLOHMANN_CAN_CALL_STD_FUNC_IMPL(std_name) \ + namespace detail { \ + using std::std_name; \ + \ + template \ + using result_of_##std_name = decltype(std_name(std::declval()...)); \ + } \ + \ + namespace detail2 { \ + struct std_name##_tag \ + { \ + }; \ + \ + template \ + std_name##_tag std_name(T&&...); \ + \ + template \ + using result_of_##std_name = decltype(std_name(std::declval()...)); \ + \ + template \ + struct would_call_std_##std_name \ + { \ + static constexpr auto const value = ::nlohmann::detail:: \ + is_detected_exact::value; \ + }; \ + } /* namespace detail2 */ \ + \ + template \ + struct would_call_std_##std_name : detail2::would_call_std_##std_name \ + { \ + } + +#ifndef JSON_USE_IMPLICIT_CONVERSIONS + #define JSON_USE_IMPLICIT_CONVERSIONS 1 +#endif + +#if JSON_USE_IMPLICIT_CONVERSIONS + #define JSON_EXPLICIT +#else + #define JSON_EXPLICIT explicit +#endif + +#ifndef JSON_DISABLE_ENUM_SERIALIZATION + #define JSON_DISABLE_ENUM_SERIALIZATION 0 +#endif + +#ifndef JSON_USE_GLOBAL_UDLS + #define JSON_USE_GLOBAL_UDLS 1 +#endif + +#if JSON_HAS_THREE_WAY_COMPARISON + #include // partial_ordering +#endif + +NLOHMANN_JSON_NAMESPACE_BEGIN +namespace detail +{ + +/////////////////////////// +// JSON type enumeration // +/////////////////////////// + +/*! +@brief the JSON type enumeration + +This enumeration collects the different JSON types. 
It is internally used to +distinguish the stored values, and the functions @ref basic_json::is_null(), +@ref basic_json::is_object(), @ref basic_json::is_array(), +@ref basic_json::is_string(), @ref basic_json::is_boolean(), +@ref basic_json::is_number() (with @ref basic_json::is_number_integer(), +@ref basic_json::is_number_unsigned(), and @ref basic_json::is_number_float()), +@ref basic_json::is_discarded(), @ref basic_json::is_primitive(), and +@ref basic_json::is_structured() rely on it. + +@note There are three enumeration entries (number_integer, number_unsigned, and +number_float), because the library distinguishes these three types for numbers: +@ref basic_json::number_unsigned_t is used for unsigned integers, +@ref basic_json::number_integer_t is used for signed integers, and +@ref basic_json::number_float_t is used for floating-point numbers or to +approximate integers which do not fit in the limits of their respective type. + +@sa see @ref basic_json::basic_json(const value_t value_type) -- create a JSON +value with the default value for a given type + +@since version 1.0.0 +*/ +enum class value_t : std::uint8_t +{ + null, ///< null value + object, ///< object (unordered set of name/value pairs) + array, ///< array (ordered collection of values) + string, ///< string value + boolean, ///< boolean value + number_integer, ///< number value (signed integer) + number_unsigned, ///< number value (unsigned integer) + number_float, ///< number value (floating-point) + binary, ///< binary array (ordered collection of bytes) + discarded ///< discarded by the parser callback function +}; + +/*! +@brief comparison operator for JSON types + +Returns an ordering that is similar to Python: +- order: null < boolean < number < object < array < string < binary +- furthermore, each type is not smaller than itself +- discarded values are not comparable +- binary is represented as a b"" string in python and directly comparable to a + string; however, making a binary array directly comparable with a string would + be surprising behavior in a JSON file. 
+ +@since version 1.0.0 +*/ +#if JSON_HAS_THREE_WAY_COMPARISON + inline std::partial_ordering operator<=>(const value_t lhs, const value_t rhs) noexcept // *NOPAD* +#else + inline bool operator<(const value_t lhs, const value_t rhs) noexcept +#endif +{ + static constexpr std::array order = {{ + 0 /* null */, 3 /* object */, 4 /* array */, 5 /* string */, + 1 /* boolean */, 2 /* integer */, 2 /* unsigned */, 2 /* float */, + 6 /* binary */ + } + }; + + const auto l_index = static_cast(lhs); + const auto r_index = static_cast(rhs); +#if JSON_HAS_THREE_WAY_COMPARISON + if (l_index < order.size() && r_index < order.size()) + { + return order[l_index] <=> order[r_index]; // *NOPAD* + } + return std::partial_ordering::unordered; +#else + return l_index < order.size() && r_index < order.size() && order[l_index] < order[r_index]; +#endif +} + +// GCC selects the built-in operator< over an operator rewritten from +// a user-defined spaceship operator +// Clang, MSVC, and ICC select the rewritten candidate +// (see GCC bug https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105200) +#if JSON_HAS_THREE_WAY_COMPARISON && defined(__GNUC__) +inline bool operator<(const value_t lhs, const value_t rhs) noexcept +{ + return std::is_lt(lhs <=> rhs); // *NOPAD* +} +#endif + +} // namespace detail +NLOHMANN_JSON_NAMESPACE_END + +// #include +// __ _____ _____ _____ +// __| | __| | | | JSON for Modern C++ +// | | |__ | | | | | | version 3.11.3 +// |_____|_____|_____|_|___| https://github.com/nlohmann/json +// +// SPDX-FileCopyrightText: 2013 - 2024 Niels Lohmann +// SPDX-License-Identifier: MIT + + + +// #include + + +NLOHMANN_JSON_NAMESPACE_BEGIN +namespace detail +{ + +/*! +@brief replace all occurrences of a substring by another string + +@param[in,out] s the string to manipulate; changed so that all + occurrences of @a f are replaced with @a t +@param[in] f the substring to replace with @a t +@param[in] t the string to replace @a f + +@pre The search string @a f must not be empty. **This precondition is +enforced with an assertion.** + +@since version 2.0.0 +*/ +template +inline void replace_substring(StringType& s, const StringType& f, + const StringType& t) +{ + JSON_ASSERT(!f.empty()); + for (auto pos = s.find(f); // find first occurrence of f + pos != StringType::npos; // make sure f was found + s.replace(pos, f.size(), t), // replace with t, and + pos = s.find(f, pos + t.size())) // find next occurrence of f + {} +} + +/*! + * @brief string escaping as described in RFC 6901 (Sect. 4) + * @param[in] s string to escape + * @return escaped string + * + * Note the order of escaping "~" to "~0" and "/" to "~1" is important. + */ +template +inline StringType escape(StringType s) +{ + replace_substring(s, StringType{"~"}, StringType{"~0"}); + replace_substring(s, StringType{"/"}, StringType{"~1"}); + return s; +} + +/*! + * @brief string unescaping as described in RFC 6901 (Sect. 4) + * @param[in] s string to unescape + * @return unescaped string + * + * Note the order of escaping "~1" to "/" and "~0" to "~" is important. 
+ */ +template +static void unescape(StringType& s) +{ + replace_substring(s, StringType{"~1"}, StringType{"/"}); + replace_substring(s, StringType{"~0"}, StringType{"~"}); +} + +} // namespace detail +NLOHMANN_JSON_NAMESPACE_END + +// #include +// __ _____ _____ _____ +// __| | __| | | | JSON for Modern C++ +// | | |__ | | | | | | version 3.11.3 +// |_____|_____|_____|_|___| https://github.com/nlohmann/json +// +// SPDX-FileCopyrightText: 2013 - 2024 Niels Lohmann +// SPDX-License-Identifier: MIT + + + +#include // size_t + +// #include + + +NLOHMANN_JSON_NAMESPACE_BEGIN +namespace detail +{ + +/// struct to capture the start position of the current token +struct position_t +{ + /// the total number of characters read + std::size_t chars_read_total = 0; + /// the number of characters read in the current line + std::size_t chars_read_current_line = 0; + /// the number of lines read + std::size_t lines_read = 0; + + /// conversion to size_t to preserve SAX interface + constexpr operator size_t() const + { + return chars_read_total; + } +}; + +} // namespace detail +NLOHMANN_JSON_NAMESPACE_END + +// #include + +// #include +// __ _____ _____ _____ +// __| | __| | | | JSON for Modern C++ +// | | |__ | | | | | | version 3.11.3 +// |_____|_____|_____|_|___| https://github.com/nlohmann/json +// +// SPDX-FileCopyrightText: 2013 - 2024 Niels Lohmann +// SPDX-FileCopyrightText: 2018 The Abseil Authors +// SPDX-License-Identifier: MIT + + + +#include // array +#include // size_t +#include // conditional, enable_if, false_type, integral_constant, is_constructible, is_integral, is_same, remove_cv, remove_reference, true_type +#include // index_sequence, make_index_sequence, index_sequence_for + +// #include + + +NLOHMANN_JSON_NAMESPACE_BEGIN +namespace detail +{ + +template +using uncvref_t = typename std::remove_cv::type>::type; + +#ifdef JSON_HAS_CPP_14 + +// the following utilities are natively available in C++14 +using std::enable_if_t; +using std::index_sequence; +using std::make_index_sequence; +using std::index_sequence_for; + +#else + +// alias templates to reduce boilerplate +template +using enable_if_t = typename std::enable_if::type; + +// The following code is taken from https://github.com/abseil/abseil-cpp/blob/10cb35e459f5ecca5b2ff107635da0bfa41011b4/absl/utility/utility.h +// which is part of Google Abseil (https://github.com/abseil/abseil-cpp), licensed under the Apache License 2.0. + +//// START OF CODE FROM GOOGLE ABSEIL + +// integer_sequence +// +// Class template representing a compile-time integer sequence. An instantiation +// of `integer_sequence` has a sequence of integers encoded in its +// type through its template arguments (which is a common need when +// working with C++11 variadic templates). `absl::integer_sequence` is designed +// to be a drop-in replacement for C++14's `std::integer_sequence`. +// +// Example: +// +// template< class T, T... Ints > +// void user_function(integer_sequence); +// +// int main() +// { +// // user_function's `T` will be deduced to `int` and `Ints...` +// // will be deduced to `0, 1, 2, 3, 4`. +// user_function(make_integer_sequence()); +// } +template +struct integer_sequence +{ + using value_type = T; + static constexpr std::size_t size() noexcept + { + return sizeof...(Ints); + } +}; + +// index_sequence +// +// A helper template for an `integer_sequence` of `size_t`, +// `absl::index_sequence` is designed to be a drop-in replacement for C++14's +// `std::index_sequence`. 
+template +using index_sequence = integer_sequence; + +namespace utility_internal +{ + +template +struct Extend; + +// Note that SeqSize == sizeof...(Ints). It's passed explicitly for efficiency. +template +struct Extend, SeqSize, 0> +{ + using type = integer_sequence < T, Ints..., (Ints + SeqSize)... >; +}; + +template +struct Extend, SeqSize, 1> +{ + using type = integer_sequence < T, Ints..., (Ints + SeqSize)..., 2 * SeqSize >; +}; + +// Recursion helper for 'make_integer_sequence'. +// 'Gen::type' is an alias for 'integer_sequence'. +template +struct Gen +{ + using type = + typename Extend < typename Gen < T, N / 2 >::type, N / 2, N % 2 >::type; +}; + +template +struct Gen +{ + using type = integer_sequence; +}; + +} // namespace utility_internal + +// Compile-time sequences of integers + +// make_integer_sequence +// +// This template alias is equivalent to +// `integer_sequence`, and is designed to be a drop-in +// replacement for C++14's `std::make_integer_sequence`. +template +using make_integer_sequence = typename utility_internal::Gen::type; + +// make_index_sequence +// +// This template alias is equivalent to `index_sequence<0, 1, ..., N-1>`, +// and is designed to be a drop-in replacement for C++14's +// `std::make_index_sequence`. +template +using make_index_sequence = make_integer_sequence; + +// index_sequence_for +// +// Converts a typename pack into an index sequence of the same length, and +// is designed to be a drop-in replacement for C++14's +// `std::index_sequence_for()` +template +using index_sequence_for = make_index_sequence; + +//// END OF CODE FROM GOOGLE ABSEIL + +#endif + +// dispatch utility (taken from ranges-v3) +template struct priority_tag : priority_tag < N - 1 > {}; +template<> struct priority_tag<0> {}; + +// taken from ranges-v3 +template +struct static_const +{ + static JSON_INLINE_VARIABLE constexpr T value{}; +}; + +#ifndef JSON_HAS_CPP_17 + template + constexpr T static_const::value; +#endif + +template +constexpr std::array make_array(Args&& ... args) +{ + return std::array {{static_cast(std::forward(args))...}}; +} + +} // namespace detail +NLOHMANN_JSON_NAMESPACE_END + +// #include +// __ _____ _____ _____ +// __| | __| | | | JSON for Modern C++ +// | | |__ | | | | | | version 3.11.3 +// |_____|_____|_____|_|___| https://github.com/nlohmann/json +// +// SPDX-FileCopyrightText: 2013 - 2024 Niels Lohmann +// SPDX-License-Identifier: MIT + + + +#include // numeric_limits +#include // char_traits +#include // tuple +#include // false_type, is_constructible, is_integral, is_same, true_type +#include // declval + +// #include +// __ _____ _____ _____ +// __| | __| | | | JSON for Modern C++ +// | | |__ | | | | | | version 3.11.3 +// |_____|_____|_____|_|___| https://github.com/nlohmann/json +// +// SPDX-FileCopyrightText: 2013 - 2024 Niels Lohmann +// SPDX-License-Identifier: MIT + + + +#include // random_access_iterator_tag + +// #include + +// #include + +// #include + + +NLOHMANN_JSON_NAMESPACE_BEGIN +namespace detail +{ + +template +struct iterator_types {}; + +template +struct iterator_types < + It, + void_t> +{ + using difference_type = typename It::difference_type; + using value_type = typename It::value_type; + using pointer = typename It::pointer; + using reference = typename It::reference; + using iterator_category = typename It::iterator_category; +}; + +// This is required as some compilers implement std::iterator_traits in a way that +// doesn't work with SFINAE. See https://github.com/nlohmann/json/issues/1341. 
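Editorial note: a standalone sketch of the priority_tag dispatch idiom mirrored from the utility above (the container_size functions are invented for illustration and are not part of the header):

#include <cstddef>
#include <vector>

// priority_tag<1> converts to priority_tag<0> but not vice versa, so overload
// resolution prefers the candidate taking the higher tag whenever it is viable.
template<std::size_t N> struct priority_tag : priority_tag<N - 1> {};
template<> struct priority_tag<0> {};

// Preferred overload: only viable for types with a .size() member (SFINAE).
template<typename T>
auto container_size(const T& c, priority_tag<1>) -> decltype(c.size()) { return c.size(); }

// Fallback overload: always viable, picked when the one above drops out.
template<typename T>
std::size_t container_size(const T&, priority_tag<0>) { return 0; }

template<typename T>
std::size_t container_size(const T& c) { return container_size(c, priority_tag<1>{}); }

// container_size(std::vector<int>{1, 2, 3}) == 3; container_size(42) == 0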
+template +struct iterator_traits +{ +}; + +template +struct iterator_traits < T, enable_if_t < !std::is_pointer::value >> + : iterator_types +{ +}; + +template +struct iterator_traits::value>> +{ + using iterator_category = std::random_access_iterator_tag; + using value_type = T; + using difference_type = ptrdiff_t; + using pointer = T*; + using reference = T&; +}; + +} // namespace detail +NLOHMANN_JSON_NAMESPACE_END + +// #include + +// #include +// __ _____ _____ _____ +// __| | __| | | | JSON for Modern C++ +// | | |__ | | | | | | version 3.11.3 +// |_____|_____|_____|_|___| https://github.com/nlohmann/json +// +// SPDX-FileCopyrightText: 2013 - 2024 Niels Lohmann +// SPDX-License-Identifier: MIT + + + +// #include + + +NLOHMANN_JSON_NAMESPACE_BEGIN + +NLOHMANN_CAN_CALL_STD_FUNC_IMPL(begin); + +NLOHMANN_JSON_NAMESPACE_END + +// #include +// __ _____ _____ _____ +// __| | __| | | | JSON for Modern C++ +// | | |__ | | | | | | version 3.11.3 +// |_____|_____|_____|_|___| https://github.com/nlohmann/json +// +// SPDX-FileCopyrightText: 2013 - 2024 Niels Lohmann +// SPDX-License-Identifier: MIT + + + +// #include + + +NLOHMANN_JSON_NAMESPACE_BEGIN + +NLOHMANN_CAN_CALL_STD_FUNC_IMPL(end); + +NLOHMANN_JSON_NAMESPACE_END + +// #include + +// #include + +// #include +// __ _____ _____ _____ +// __| | __| | | | JSON for Modern C++ +// | | |__ | | | | | | version 3.11.3 +// |_____|_____|_____|_|___| https://github.com/nlohmann/json +// +// SPDX-FileCopyrightText: 2013 - 2024 Niels Lohmann +// SPDX-License-Identifier: MIT + +#ifndef INCLUDE_NLOHMANN_JSON_FWD_HPP_ + #define INCLUDE_NLOHMANN_JSON_FWD_HPP_ + + #include // int64_t, uint64_t + #include // map + #include // allocator + #include // string + #include // vector + + // #include + + + /*! + @brief namespace for Niels Lohmann + @see https://github.com/nlohmann + @since version 1.0.0 + */ + NLOHMANN_JSON_NAMESPACE_BEGIN + + /*! + @brief default JSONSerializer template argument + + This serializer ignores the template arguments and uses ADL + ([argument-dependent lookup](https://en.cppreference.com/w/cpp/language/adl)) + for serialization. + */ + template + struct adl_serializer; + + /// a class to store JSON values + /// @sa https://json.nlohmann.me/api/basic_json/ + template class ObjectType = + std::map, + template class ArrayType = std::vector, + class StringType = std::string, class BooleanType = bool, + class NumberIntegerType = std::int64_t, + class NumberUnsignedType = std::uint64_t, + class NumberFloatType = double, + template class AllocatorType = std::allocator, + template class JSONSerializer = + adl_serializer, + class BinaryType = std::vector, // cppcheck-suppress syntaxError + class CustomBaseClass = void> + class basic_json; + + /// @brief JSON Pointer defines a string syntax for identifying a specific value within a JSON document + /// @sa https://json.nlohmann.me/api/json_pointer/ + template + class json_pointer; + + /*! + @brief default specialization + @sa https://json.nlohmann.me/api/json/ + */ + using json = basic_json<>; + + /// @brief a minimal map-like container that preserves insertion order + /// @sa https://json.nlohmann.me/api/ordered_map/ + template + struct ordered_map; + + /// @brief specialization that maintains the insertion order of object keys + /// @sa https://json.nlohmann.me/api/ordered_json/ + using ordered_json = basic_json; + + NLOHMANN_JSON_NAMESPACE_END + +#endif // INCLUDE_NLOHMANN_JSON_FWD_HPP_ + + +NLOHMANN_JSON_NAMESPACE_BEGIN +/*! 
+@brief detail namespace with internal helper functions + +This namespace collects functions that should not be exposed, +implementations of some @ref basic_json methods, and meta-programming helpers. + +@since version 2.1.0 +*/ +namespace detail +{ + +///////////// +// helpers // +///////////// + +// Note to maintainers: +// +// Every trait in this file expects a non CV-qualified type. +// The only exceptions are in the 'aliases for detected' section +// (i.e. those of the form: decltype(T::member_function(std::declval()))) +// +// In this case, T has to be properly CV-qualified to constraint the function arguments +// (e.g. to_json(BasicJsonType&, const T&)) + +template struct is_basic_json : std::false_type {}; + +NLOHMANN_BASIC_JSON_TPL_DECLARATION +struct is_basic_json : std::true_type {}; + +// used by exceptions create() member functions +// true_type for pointer to possibly cv-qualified basic_json or std::nullptr_t +// false_type otherwise +template +struct is_basic_json_context : + std::integral_constant < bool, + is_basic_json::type>::type>::value + || std::is_same::value > +{}; + +////////////////////// +// json_ref helpers // +////////////////////// + +template +class json_ref; + +template +struct is_json_ref : std::false_type {}; + +template +struct is_json_ref> : std::true_type {}; + +////////////////////////// +// aliases for detected // +////////////////////////// + +template +using mapped_type_t = typename T::mapped_type; + +template +using key_type_t = typename T::key_type; + +template +using value_type_t = typename T::value_type; + +template +using difference_type_t = typename T::difference_type; + +template +using pointer_t = typename T::pointer; + +template +using reference_t = typename T::reference; + +template +using iterator_category_t = typename T::iterator_category; + +template +using to_json_function = decltype(T::to_json(std::declval()...)); + +template +using from_json_function = decltype(T::from_json(std::declval()...)); + +template +using get_template_function = decltype(std::declval().template get()); + +// trait checking if JSONSerializer::from_json(json const&, udt&) exists +template +struct has_from_json : std::false_type {}; + +// trait checking if j.get is valid +// use this trait instead of std::is_constructible or std::is_convertible, +// both rely on, or make use of implicit conversions, and thus fail when T +// has several constructors/operator= (see https://github.com/nlohmann/json/issues/958) +template +struct is_getable +{ + static constexpr bool value = is_detected::value; +}; + +template +struct has_from_json < BasicJsonType, T, enable_if_t < !is_basic_json::value >> +{ + using serializer = typename BasicJsonType::template json_serializer; + + static constexpr bool value = + is_detected_exact::value; +}; + +// This trait checks if JSONSerializer::from_json(json const&) exists +// this overload is used for non-default-constructible user-defined-types +template +struct has_non_default_from_json : std::false_type {}; + +template +struct has_non_default_from_json < BasicJsonType, T, enable_if_t < !is_basic_json::value >> +{ + using serializer = typename BasicJsonType::template json_serializer; + + static constexpr bool value = + is_detected_exact::value; +}; + +// This trait checks if BasicJsonType::json_serializer::to_json exists +// Do not evaluate the trait when T is a basic_json type, to avoid template instantiation infinite recursion. 
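+// [Editorial sketch, not part of the upstream header] How the from_json /
+// to_json detection traits in this section behave for a hypothetical
+// user-defined type with the usual ADL hooks (all names below are examples,
+// written for a translation unit that includes <nlohmann/json.hpp>):
+//
+//   struct person { std::string name; int age; };
+//   void to_json(nlohmann::json& j, const person& p)
+//   { j = nlohmann::json{{"name", p.name}, {"age", p.age}}; }
+//   void from_json(const nlohmann::json& j, person& p)
+//   { j.at("name").get_to(p.name); j.at("age").get_to(p.age); }
+//
+//   static_assert(nlohmann::detail::has_to_json<nlohmann::json, person>::value, "");
+//   static_assert(nlohmann::detail::has_from_json<nlohmann::json, person>::value, "");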
+template +struct has_to_json : std::false_type {}; + +template +struct has_to_json < BasicJsonType, T, enable_if_t < !is_basic_json::value >> +{ + using serializer = typename BasicJsonType::template json_serializer; + + static constexpr bool value = + is_detected_exact::value; +}; + +template +using detect_key_compare = typename T::key_compare; + +template +struct has_key_compare : std::integral_constant::value> {}; + +// obtains the actual object key comparator +template +struct actual_object_comparator +{ + using object_t = typename BasicJsonType::object_t; + using object_comparator_t = typename BasicJsonType::default_object_comparator_t; + using type = typename std::conditional < has_key_compare::value, + typename object_t::key_compare, object_comparator_t>::type; +}; + +template +using actual_object_comparator_t = typename actual_object_comparator::type; + +///////////////// +// char_traits // +///////////////// + +// Primary template of char_traits calls std char_traits +template +struct char_traits : std::char_traits +{}; + +// Explicitly define char traits for unsigned char since it is not standard +template<> +struct char_traits : std::char_traits +{ + using char_type = unsigned char; + using int_type = uint64_t; + + // Redefine to_int_type function + static int_type to_int_type(char_type c) noexcept + { + return static_cast(c); + } + + static char_type to_char_type(int_type i) noexcept + { + return static_cast(i); + } + + static constexpr int_type eof() noexcept + { + return static_cast(std::char_traits::eof()); + } +}; + +// Explicitly define char traits for signed char since it is not standard +template<> +struct char_traits : std::char_traits +{ + using char_type = signed char; + using int_type = uint64_t; + + // Redefine to_int_type function + static int_type to_int_type(char_type c) noexcept + { + return static_cast(c); + } + + static char_type to_char_type(int_type i) noexcept + { + return static_cast(i); + } + + static constexpr int_type eof() noexcept + { + return static_cast(std::char_traits::eof()); + } +}; + +/////////////////// +// is_ functions // +/////////////////// + +// https://en.cppreference.com/w/cpp/types/conjunction +template struct conjunction : std::true_type { }; +template struct conjunction : B { }; +template +struct conjunction +: std::conditional(B::value), conjunction, B>::type {}; + +// https://en.cppreference.com/w/cpp/types/negation +template struct negation : std::integral_constant < bool, !B::value > { }; + +// Reimplementation of is_constructible and is_default_constructible, due to them being broken for +// std::pair and std::tuple until LWG 2367 fix (see https://cplusplus.github.io/LWG/lwg-defects.html#2367). +// This causes compile errors in e.g. clang 3.5 or gcc 4.9. 
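+// [Editorial sketch, not part of the upstream header] The workaround defined
+// below decomposes pairs and tuples element-wise instead of delegating to the
+// affected standard trait, e.g. (illustrative only):
+//
+//   struct no_default { no_default() = delete; };
+//   static_assert(
+//       !nlohmann::detail::is_default_constructible<std::pair<int, no_default>>::value,
+//       "a pair is default constructible only if both elements are");
+//   static_assert(
+//       nlohmann::detail::is_constructible<std::tuple<int, double>>::value, "");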
+template +struct is_default_constructible : std::is_default_constructible {}; + +template +struct is_default_constructible> + : conjunction, is_default_constructible> {}; + +template +struct is_default_constructible> + : conjunction, is_default_constructible> {}; + +template +struct is_default_constructible> + : conjunction...> {}; + +template +struct is_default_constructible> + : conjunction...> {}; + +template +struct is_constructible : std::is_constructible {}; + +template +struct is_constructible> : is_default_constructible> {}; + +template +struct is_constructible> : is_default_constructible> {}; + +template +struct is_constructible> : is_default_constructible> {}; + +template +struct is_constructible> : is_default_constructible> {}; + +template +struct is_iterator_traits : std::false_type {}; + +template +struct is_iterator_traits> +{ + private: + using traits = iterator_traits; + + public: + static constexpr auto value = + is_detected::value && + is_detected::value && + is_detected::value && + is_detected::value && + is_detected::value; +}; + +template +struct is_range +{ + private: + using t_ref = typename std::add_lvalue_reference::type; + + using iterator = detected_t; + using sentinel = detected_t; + + // to be 100% correct, it should use https://en.cppreference.com/w/cpp/iterator/input_or_output_iterator + // and https://en.cppreference.com/w/cpp/iterator/sentinel_for + // but reimplementing these would be too much work, as a lot of other concepts are used underneath + static constexpr auto is_iterator_begin = + is_iterator_traits>::value; + + public: + static constexpr bool value = !std::is_same::value && !std::is_same::value && is_iterator_begin; +}; + +template +using iterator_t = enable_if_t::value, result_of_begin())>>; + +template +using range_value_t = value_type_t>>; + +// The following implementation of is_complete_type is taken from +// https://blogs.msdn.microsoft.com/vcblog/2015/12/02/partial-support-for-expression-sfinae-in-vs-2015-update-1/ +// and is written by Xiang Fan who agreed to using it in this library. + +template +struct is_complete_type : std::false_type {}; + +template +struct is_complete_type : std::true_type {}; + +template +struct is_compatible_object_type_impl : std::false_type {}; + +template +struct is_compatible_object_type_impl < + BasicJsonType, CompatibleObjectType, + enable_if_t < is_detected::value&& + is_detected::value >> +{ + using object_t = typename BasicJsonType::object_t; + + // macOS's is_constructible does not play well with nonesuch... 
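+    // [Editorial sketch, not part of the upstream header] The value computed
+    // just below asks whether the candidate's key/mapped types can construct
+    // the target object_t's key/mapped types, e.g. (illustrative only):
+    //
+    //   static_assert(nlohmann::detail::is_compatible_object_type<
+    //       nlohmann::json, std::map<std::string, int>>::value, "");
+    //   static_assert(!nlohmann::detail::is_compatible_object_type<
+    //       nlohmann::json, std::vector<int>>::value, "");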
+ static constexpr bool value = + is_constructible::value && + is_constructible::value; +}; + +template +struct is_compatible_object_type + : is_compatible_object_type_impl {}; + +template +struct is_constructible_object_type_impl : std::false_type {}; + +template +struct is_constructible_object_type_impl < + BasicJsonType, ConstructibleObjectType, + enable_if_t < is_detected::value&& + is_detected::value >> +{ + using object_t = typename BasicJsonType::object_t; + + static constexpr bool value = + (is_default_constructible::value && + (std::is_move_assignable::value || + std::is_copy_assignable::value) && + (is_constructible::value && + std::is_same < + typename object_t::mapped_type, + typename ConstructibleObjectType::mapped_type >::value)) || + (has_from_json::value || + has_non_default_from_json < + BasicJsonType, + typename ConstructibleObjectType::mapped_type >::value); +}; + +template +struct is_constructible_object_type + : is_constructible_object_type_impl {}; + +template +struct is_compatible_string_type +{ + static constexpr auto value = + is_constructible::value; +}; + +template +struct is_constructible_string_type +{ + // launder type through decltype() to fix compilation failure on ICPC +#ifdef __INTEL_COMPILER + using laundered_type = decltype(std::declval()); +#else + using laundered_type = ConstructibleStringType; +#endif + + static constexpr auto value = + conjunction < + is_constructible, + is_detected_exact>::value; +}; + +template +struct is_compatible_array_type_impl : std::false_type {}; + +template +struct is_compatible_array_type_impl < + BasicJsonType, CompatibleArrayType, + enable_if_t < + is_detected::value&& + is_iterator_traits>>::value&& +// special case for types like std::filesystem::path whose iterator's value_type are themselves +// c.f. https://github.com/nlohmann/json/pull/3073 + !std::is_same>::value >> +{ + static constexpr bool value = + is_constructible>::value; +}; + +template +struct is_compatible_array_type + : is_compatible_array_type_impl {}; + +template +struct is_constructible_array_type_impl : std::false_type {}; + +template +struct is_constructible_array_type_impl < + BasicJsonType, ConstructibleArrayType, + enable_if_t::value >> + : std::true_type {}; + +template +struct is_constructible_array_type_impl < + BasicJsonType, ConstructibleArrayType, + enable_if_t < !std::is_same::value&& + !is_compatible_string_type::value&& + is_default_constructible::value&& +(std::is_move_assignable::value || + std::is_copy_assignable::value)&& +is_detected::value&& +is_iterator_traits>>::value&& +is_detected::value&& +// special case for types like std::filesystem::path whose iterator's value_type are themselves +// c.f. https://github.com/nlohmann/json/pull/3073 +!std::is_same>::value&& +is_complete_type < +detected_t>::value >> +{ + using value_type = range_value_t; + + static constexpr bool value = + std::is_same::value || + has_from_json::value || + has_non_default_from_json < + BasicJsonType, + value_type >::value; +}; + +template +struct is_constructible_array_type + : is_constructible_array_type_impl {}; + +template +struct is_compatible_integer_type_impl : std::false_type {}; + +template +struct is_compatible_integer_type_impl < + RealIntegerType, CompatibleNumberIntegerType, + enable_if_t < std::is_integral::value&& + std::is_integral::value&& + !std::is_same::value >> +{ + // is there an assert somewhere on overflows? 
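+    // [Editorial sketch, not part of the upstream header] The check completed
+    // below accepts only genuine integer types of matching signedness, e.g.
+    // (illustrative only):
+    //
+    //   static_assert(nlohmann::detail::is_compatible_integer_type<
+    //       std::int64_t, short>::value, "");
+    //   static_assert(!nlohmann::detail::is_compatible_integer_type<
+    //       std::int64_t, unsigned int>::value, "signedness must match");
+    //   static_assert(!nlohmann::detail::is_compatible_integer_type<
+    //       std::int64_t, double>::value, "floating point is rejected");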
+ using RealLimits = std::numeric_limits; + using CompatibleLimits = std::numeric_limits; + + static constexpr auto value = + is_constructible::value && + CompatibleLimits::is_integer && + RealLimits::is_signed == CompatibleLimits::is_signed; +}; + +template +struct is_compatible_integer_type + : is_compatible_integer_type_impl {}; + +template +struct is_compatible_type_impl: std::false_type {}; + +template +struct is_compatible_type_impl < + BasicJsonType, CompatibleType, + enable_if_t::value >> +{ + static constexpr bool value = + has_to_json::value; +}; + +template +struct is_compatible_type + : is_compatible_type_impl {}; + +template +struct is_constructible_tuple : std::false_type {}; + +template +struct is_constructible_tuple> : conjunction...> {}; + +template +struct is_json_iterator_of : std::false_type {}; + +template +struct is_json_iterator_of : std::true_type {}; + +template +struct is_json_iterator_of : std::true_type +{}; + +// checks if a given type T is a template specialization of Primary +template
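+// [Editorial sketch, not part of the upstream header] The trait whose
+// declaration begins here reports whether a type is a specialization of a
+// given class template; upstream nlohmann/json names it is_specialization_of
+// (treat that name as an assumption here), e.g. (illustrative only):
+//
+//   static_assert(nlohmann::detail::is_specialization_of<
+//       std::pair, std::pair<int, double>>::value, "");
+//   static_assert(!nlohmann::detail::is_specialization_of<
+//       std::pair, std::tuple<int, double>>::value, "");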