diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..7af869e --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,13 @@ +{ + "python.testing.unittestArgs": [ + "-v", + "-s", + "C:/sources/LLMLingua/tests", + "-p", + "test*.py" + ], + "python.testing.pytestEnabled": false, + "python.testing.unittestEnabled": true, + "python.testing.unittestDiscoveryAdapter": "legacy", + "python.testing.autoTestDiscoverOnSaveEnabled": true +} \ No newline at end of file diff --git a/llmlingua/__init__.py b/llmlingua/__init__.py index d750210..a98f08f 100644 --- a/llmlingua/__init__.py +++ b/llmlingua/__init__.py @@ -3,6 +3,7 @@ # flake8: noqa from .prompt_compressor import PromptCompressor +from .prompt_compress_multicore import PromptCompressorV2 from .version import VERSION as __version__ __all__ = ["PromptCompressor"] diff --git a/llmlingua/monitor.py b/llmlingua/monitor.py new file mode 100644 index 0000000..0dfd2af --- /dev/null +++ b/llmlingua/monitor.py @@ -0,0 +1,16 @@ +import psutil +import time + +# Get the current process +process = psutil.Process() + +while True: + # Get the process's overall CPU usage as a percentage of total system CPUs + cpu_usage = process.cpu_percent(interval=1) + + # Get per-core CPU usage + per_core_usage = psutil.cpu_percent(interval=None, percpu=True) + + print(f"Process CPU usage: {cpu_usage}%") + print(f"Per-core CPU usage: {per_core_usage}") + time.sleep(1) diff --git a/llmlingua/prompt_compress_multicore.py b/llmlingua/prompt_compress_multicore.py new file mode 100644 index 0000000..a83f38b --- /dev/null +++ b/llmlingua/prompt_compress_multicore.py @@ -0,0 +1,2469 @@ + +# Copyright (c) 2023 Microsoft +# Licensed under The MIT License [see LICENSE for details] + +import bisect +import copy +import json +import re +import string +from collections import defaultdict +from typing import List, Union + +import nltk +import numpy as np +import tiktoken +import concurrent +import torch +import 
torch.nn.functional as F +from torch.utils.data import DataLoader +from transformers import ( + AutoConfig, + AutoModelForCausalLM, + AutoModelForTokenClassification, + AutoTokenizer, +) + +from .utils import ( + TokenClfDataset, + get_pure_token, + is_begin_of_new_word, + process_structured_json_data, + remove_consecutive_commas, + replace_added_token, + seed_everything, +) + + +class PromptCompressorV2: + """ + PromptCompressorV2 is designed for compressing prompts based on a given language model. + + This class initializes with the language model and its configuration, preparing it for prompt compression tasks. + The PromptCompressorV2 class is versatile and can be adapted for various models and specific requirements in prompt processing. + Users can specify different model names and configurations as needed for their particular use case.The architecture is + based on the paper "LLMLingua: Compressing Prompts for Accelerated Inference of Large Language Models". Jiang, Huiqiang, Qianhui Wu, + Chin-Yew Lin, Yuqing Yang, and Lili Qiu. arXiv preprint arXiv:2310.05736 (2023). + + Args: + model_name (str, optional): The name of the language model to be loaded. Default is "NousResearch/Llama-2-7b-hf". + device_map (str, optional): The device to load the model onto, e.g., "cuda" for GPU. Default is "cuda". + model_config (dict, optional): A dictionary containing the configuration parameters for the model. Default is an empty dictionary. + open_api_config (dict, optional): A dictionary containing configuration for openai APIs that may be used in conjunction with the model. Default is an empty dictionary. + use_llmlingua2 (bool, optional): Whether to use llmlingua-2 compressor based on the paper + "LLMLingua-2: Data Distillation for Efficient and Faithful Task-Agnostic Prompt Compression". + Zhuoshi Pan, Qianhui Wu, Huiqiang Jiang, Menglin Xia, Xufang Luo, Jue Zhang, Qingwei Lin, Victor Ruhle, Yuqing Yang, Chin-Yew Lin, H. Vicky Zhao, Lili Qiu, Dongmei Zhang. 
+ arXiv preprint arXiv:2403.12968 (2024), Default is False. + llmlingua2_config (dict, optional): A dictionary containing the configuration parameters for llmlingua-2. Default is + { + "max_batch_size": 50, + "max_force_token": 100, # max number of the tokens which will be forcibly preserved + } + Example: + >>> compress_method = PromptCompressorV2(model_name="microsoft/llmlingua-2-xlm-roberta-large-meetingbank", use_llmlingua2=True, ) + >>> context = ["This is the first context sentence.", "Here is another context sentence."] + >>> result = compress_method.compress_prompt(context, use_context_level_filter=True, target_token=5) + >>> print(result["compressed_prompt"]) + # This will print the compressed version of the context. + + Note: + The `PromptCompressorV2` class requires the Hugging Face Transformers library and an appropriate environment to load and run the models. + """ + + def __init__( + self, + model_name: str = "NousResearch/Llama-2-7b-hf", + device_map: str = "cuda", + model_config: dict = {}, + open_api_config: dict = {}, + use_llmlingua2: bool = False, + llmlingua2_config: dict = {}, + number_of_cores: int = 16 + ): + self.model_name = model_name + self.use_llmlingua2 = use_llmlingua2 + self.retrieval_model = None + self.retrieval_model_name = None + self.open_api_config = open_api_config + self.cache_bos_num = 10 + self.prefix_bos_num = 100 + self.oai_tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo") + self.number_of_cores = number_of_cores + self.load_model(model_name, device_map, model_config) + if use_llmlingua2: + self.init_llmlingua2(**llmlingua2_config) + + def init_llmlingua2( + self, + max_batch_size: int = 50, + max_force_token: int = 100, + ): + seed_everything(42) + self.max_batch_size = max_batch_size + self.max_seq_len = 512 + self.max_force_token = max_force_token + self.special_tokens = set( + [ + v + for k, v in self.tokenizer.special_tokens_map.items() + if k != "additional_special_tokens" + ] + ) + + self.added_tokens = 
[f"[NEW{i}]" for i in range(max_force_token)] + self.tokenizer.add_special_tokens( + {"additional_special_tokens": self.added_tokens} + ) + self.model.resize_token_embeddings(len(self.tokenizer)) + + def load_model( + self, model_name: str, device_map: str = "cuda", model_config: dict = {} + ): + trust_remote_code = model_config.get("trust_remote_code", True) + if "trust_remote_code" not in model_config: + model_config["trust_remote_code"] = trust_remote_code + config = AutoConfig.from_pretrained(model_name, **model_config) + tokenizer = AutoTokenizer.from_pretrained(model_name, **model_config) + if model_config.get("pad_to_left", True): + tokenizer.padding_side = "left" + tokenizer.pad_token_id = ( + config.pad_token_id if config.pad_token_id else tokenizer.eos_token_id + ) + MODEL_CLASS = ( + AutoModelForTokenClassification + if any("ForTokenClassification" in ar for ar in config.architectures) + else AutoModelForCausalLM + ) + self.device = ( + device_map + if any(key in device_map for key in ["cuda", "cpu", "mps"]) + else "cuda" + ) + if "cuda" in device_map or "cpu" in device_map: + model = MODEL_CLASS.from_pretrained( + model_name, + torch_dtype=model_config.pop( + "torch_dtype", "auto" if device_map == "cuda" else torch.float32 + ), + device_map=device_map, + config=config, + ignore_mismatched_sizes=True, + **model_config, + ) + else: + model = MODEL_CLASS.from_pretrained( + model_name, + device_map=device_map, + torch_dtype=model_config.pop("torch_dtype", "auto"), + pad_token_id=tokenizer.pad_token_id, + **model_config, + ) + self.tokenizer = tokenizer + self.model = model + self.context_idxs = [] + self.max_position_embeddings = config.max_position_embeddings + + def get_ppl( + self, + text: str, + granularity: str = "sentence", + input_ids=None, + attention_mask=None, + past_key_values=None, + return_kv=False, + end=None, + condition_mode: str = "none", + condition_pos_id: int = 0, + ): + if input_ids is None: + tokenized_text = self.tokenizer(text, 
return_tensors="pt") + input_ids = tokenized_text["input_ids"].to(self.device) + attention_mask = tokenized_text["attention_mask"].to(self.device) + if past_key_values is not None: + past_length = past_key_values[0][0].shape[2] + else: + past_length = 0 + if end is None: + end = input_ids.shape[1] + end = min(end, past_length + self.max_position_embeddings) + with torch.no_grad(): + response = self.model( + input_ids[:, past_length:end], + attention_mask=attention_mask[:, :end], + past_key_values=past_key_values, + use_cache=True, + ) + past_key_values = response.past_key_values + + shift_logits = response.logits[..., :-1, :].contiguous() + shift_labels = input_ids[..., past_length + 1 : end].contiguous() + # Flatten the tokens + active = (attention_mask[:, past_length:end] == 1)[..., :-1].view(-1) + active_logits = shift_logits.view(-1, shift_logits.size(-1))[active] + active_labels = shift_labels.view(-1)[active] + loss_fct = torch.nn.CrossEntropyLoss(reduction="none") + loss = loss_fct(active_logits, active_labels) + if condition_mode == "before": + loss = loss[:condition_pos_id] + elif condition_mode == "after": + loss = loss[condition_pos_id:] + res = loss.mean() if granularity == "sentence" else loss + return (res, past_key_values) if return_kv else res + + def __call__(self, *args, **kwargs): + return self.compress_prompt(*args, **kwargs) + + def compress_json( + self, + json_data: dict, + json_config: Union[str, dict], + instruction: str = "", + question: str = "", + rate: float = 0.5, + target_token: float = -1, + iterative_size: int = 200, + use_sentence_level_filter: bool = False, + use_keyvalue_level_filter: bool = False, + use_token_level_filter: bool = True, + keep_split: bool = False, + keep_first_sentence: int = 0, + keep_last_sentence: int = 0, + keep_sentence_number: int = 0, + high_priority_bonus: int = 100, + context_budget: str = "+100", + token_budget_ratio: float = 1.4, + condition_in_question: str = "none", + reorder_keyvalue: str = 
"original", + condition_compare: bool = False, + rank_method: str = "llmlingua", + ): + context, force_context_ids = process_structured_json_data( + json_data, json_config + ) + compressed_res = self.structured_compress_prompt( + context=context, + instruction=instruction, + question=question, + rate=rate, + target_token=target_token, + iterative_size=iterative_size, + force_context_ids=force_context_ids, + use_sentence_level_filter=use_sentence_level_filter, + use_context_level_filter=use_keyvalue_level_filter, + use_token_level_filter=use_token_level_filter, + keep_split=keep_split, + keep_first_sentence=keep_first_sentence, + keep_last_sentence=keep_last_sentence, + keep_sentence_number=keep_sentence_number, + high_priority_bonus=high_priority_bonus, + context_budget=context_budget, + token_budget_ratio=token_budget_ratio, + condition_in_question=condition_in_question, + reorder_context=reorder_keyvalue, + condition_compare=condition_compare, + add_instruction=False, + rank_method=rank_method, + concate_question=False, + strict_preserve_uncompressed=False, + ) + compressed_json_text = remove_consecutive_commas( + compressed_res["compressed_prompt"] + ) + compressed_res["compressed_prompt"] = json.loads(compressed_json_text) + return compressed_res + + def structured_compress_prompt( + self, + context: List[str], + instruction: str = "", + question: str = "", + rate: float = 0.5, + target_token: float = -1, + iterative_size: int = 200, + force_context_ids: List[int] = None, + force_context_number: int = None, + use_sentence_level_filter: bool = False, + use_context_level_filter: bool = True, + use_token_level_filter: bool = True, + keep_split: bool = False, + keep_first_sentence: int = 0, + keep_last_sentence: int = 0, + keep_sentence_number: int = 0, + high_priority_bonus: int = 100, + context_budget: str = "+100", + token_budget_ratio: float = 1.4, + condition_in_question: str = "none", + reorder_context: str = "original", + dynamic_context_compression_ratio: 
float = 0.0, + condition_compare: bool = False, + add_instruction: bool = False, + rank_method: str = "llmlingua", + concate_question: bool = True, + strict_preserve_uncompressed: bool = True, + ): + """ + Compresses the given prompt context based on a specified structure. + + Each element of context should be segmented using one or more non-nested '' tags. + Each '' tag can include optional parameters 'rate' and 'compress' (e.g., ''), + indicating the compression rate for that segment. Default values are 'rate=rate' and 'compress=True'. + When 'compress' is set to False, it overrides the 'rate' parameter, resulting in no compression for that segment. + + Args: + context (List[str]): List of context strings divided by '' tags with optional compression settings. + instruction (str, optional): Additional instruction text to be included in the prompt. Default is an empty string. + question (str, optional): A specific question that the prompt is addressing. Default is an empty string. + rate (float, optional): The compression rate is defined the same as in paper "Language Modeling Is Compression". + Delétang, Grégoire, Anian Ruoss, Paul-Ambroise Duquenne, Elliot Catt, Tim Genewein, Christopher Mattern, + Jordi Grau-Moya et al. "Language modeling is compression." arXiv preprint arXiv:2309.10668 (2023): + .. math::\text{Compression Rate} = \frac{\text{Compressed Size}}{\text{Raw Size}} + Default is 0.5. The actual compression rate is generally lower than the specified target, but there can be + fluctuations due to differences in tokenizers. If specified, it should be a float less than or equal + to 1.0, representing the target compression rate. ``rate``, is applicable only within the context-level filter + and the sentence-level filter. In the token-level filter, the rate for each segment overrides the global rate. + However, for segments where no specific rate is defined, the global rate serves as the default value. 
The final + compression rate of the entire text is a composite result of multiple compression rates applied across different sections. + target_token (float, optional): The global maximum number of tokens to be achieved. Default is -1, indicating no + specific target. The actual number of tokens after compression should generally be less than the specified target_token, + but there can be fluctuations due to differences in tokenizers. If specified, compression will be based on the target_token as + the sole criterion, overriding the ``rate``. ``target_token``, is applicable only within the context-level + filter and the sentence-level filter. In the token-level filter, the rate for each segment overrides the global target token. + However, for segments where no specific rate is defined, the global rate calculated from global target token serves + as the default value. The final target token of the entire text is a composite result of multiple compression rates + applied across different sections. + iterative_size (int, optional): The number of tokens to consider in each iteration of compression. Default is 200. + force_context_ids (List[int], optional): List of specific context IDs to always include in the compressed result. Default is None. + force_context_number (int, optional): The number of context sections to forcibly include. Default is None. + use_sentence_level_filter (bool, optional): Whether to apply sentence-level filtering in compression. Default is False. + use_context_level_filter (bool, optional): Whether to apply context-level filtering in compression. Default is True. + use_token_level_filter (bool, optional): Whether to apply token-level filtering in compression. Default is True. + keep_split (bool, optional): Whether to preserve the original separators without compression. Default is False. + keep_first_sentence (int, optional): Number of sentences to forcibly preserve from the start of the context. Default is 0. 
+ keep_last_sentence (int, optional): Number of sentences to forcibly preserve from the end of the context. Default is 0. + keep_sentence_number (int, optional): Total number of sentences to forcibly preserve in the compression. Default is 0. + high_priority_bonus (int, optional): Bonus score for high-priority sentences to influence their likelihood of being retained. Default is 100. + context_budget (str, optional): Token budget for the context-level filtering, expressed as a string to indicate flexibility. Default is "+100". + token_budget_ratio (float, optional): Ratio to adjust token budget during sentence-level filtering. Default is 1.4. + condition_in_question (str, optional): Specific condition to apply to question in the context. Default is "none". + reorder_context (str, optional): Strategy for reordering context in the compressed result. Default is "original". + dynamic_context_compression_ratio (float, optional): Ratio for dynamically adjusting context compression. Default is 0.0. + condition_compare (bool, optional): Whether to enable condition comparison during token-level compression. Default is False. + add_instruction (bool, optional): Whether to add the instruction to the prompt prefix. Default is False. + rank_method (str, optional): Method used for ranking elements during compression. Default is "llmlingua". + concate_question (bool, optional): Whether to concatenate the question to the compressed prompt. Default is True. + + Returns: + dict: A dictionary containing: + - "compressed_prompt" (str): The resulting compressed prompt. + - "origin_tokens" (int): The original number of tokens in the input. + - "compressed_tokens" (int): The number of tokens in the compressed output. + - "ratio" (str): The compression ratio achieved, calculated as the original token number divided by the token number after compression. + - "rate" (str): The compression rate achieved, in a human-readable format. + - "saving" (str): Estimated savings in GPT-4 token usage. 
+ """ + if not context: + context = [" "] + if isinstance(context, str): + context = [context] + context = [ + self.tokenizer.decode(self.tokenizer(c, add_special_tokens=False).input_ids) + for c in context + ] + context_tokens_length = [self.get_token_length(c) for c in context] + instruction_tokens_length, question_tokens_length = self.get_token_length( + instruction + ), self.get_token_length(question) + if target_token == -1: + target_token = ( + ( + instruction_tokens_length + + question_tokens_length + + sum(context_tokens_length) + ) + * rate + - instruction_tokens_length + - (question_tokens_length if concate_question else 0) + ) + else: + rate = target_token / sum(context_tokens_length) + ( + context, + context_segs, + context_segs_rate, + context_segs_compress, + ) = self.segment_structured_context(context, rate) + return self.compress_prompt( + context, + instruction, + question, + rate, + target_token, + iterative_size, + force_context_ids, + force_context_number, + use_sentence_level_filter, + use_context_level_filter, + use_token_level_filter, + keep_split, + keep_first_sentence, + keep_last_sentence, + keep_sentence_number, + high_priority_bonus, + context_budget, + token_budget_ratio, + condition_in_question, + reorder_context, + dynamic_context_compression_ratio, + condition_compare, + add_instruction, + rank_method, + concate_question, + context_segs=context_segs, + context_segs_rate=context_segs_rate, + context_segs_compress=context_segs_compress, + strict_preserve_uncompressed=strict_preserve_uncompressed, + ) + + def compress_prompt( + self, + context: List[str], + instruction: str = "", + question: str = "", + rate: float = 0.5, + target_token: float = -1, + iterative_size: int = 200, + force_context_ids: List[int] = None, + force_context_number: int = None, + use_sentence_level_filter: bool = False, + use_context_level_filter: bool = True, + use_token_level_filter: bool = True, + keep_split: bool = False, + keep_first_sentence: int = 0, + 
keep_last_sentence: int = 0, + keep_sentence_number: int = 0, + high_priority_bonus: int = 100, + context_budget: str = "+100", + token_budget_ratio: float = 1.4, + condition_in_question: str = "none", + reorder_context: str = "original", + dynamic_context_compression_ratio: float = 0.0, + condition_compare: bool = False, + add_instruction: bool = False, + rank_method: str = "llmlingua", + concate_question: bool = True, + context_segs: List[str] = None, + context_segs_rate: List[float] = None, + context_segs_compress: List[bool] = None, + target_context: int = -1, + context_level_rate: float = 1.0, + context_level_target_token: int = -1, + return_word_label: bool = False, + word_sep: str = "\t\t|\t\t", + label_sep: str = " ", + token_to_word: str = "mean", + force_tokens: List[str] = [], + force_reserve_digit: bool = False, + drop_consecutive: bool = False, + chunk_end_tokens: List[str] = [".", "\n"], + strict_preserve_uncompressed: bool = True, + ): + """ + Compresses the given context. + + Args: + context (List[str]): List of context strings that form the basis of the prompt. + instruction (str, optional): Additional instruction text to be included in the prompt. Default is an empty string. + question (str, optional): A specific question that the prompt is addressing. Default is an empty string. + rate (float, optional): The maximum compression rate target to be achieved. The compression rate is defined + the same as in paper "Language Modeling Is Compression". Delétang, Grégoire, Anian Ruoss, Paul-Ambroise Duquenne, + Elliot Catt, Tim Genewein, Christopher Mattern, Jordi Grau-Moya et al. "Language modeling is compression." + arXiv preprint arXiv:2309.10668 (2023): + .. math::\text{Compression Rate} = \frac{\text{Compressed Size}}{\text{Raw Size}} + Default is 0.5. The actual compression rate is generally lower than the specified target, but there can be + fluctuations due to differences in tokenizers. 
If specified, it should be a float less than or equal + to 1.0, representing the target compression rate. + target_token (float, optional): The maximum number of tokens to be achieved. Default is -1, indicating no specific target. + The actual number of tokens after compression should generally be less than the specified target_token, but there can + be fluctuations due to differences in tokenizers. If specified, compression will be based on the target_token as + the sole criterion, overriding the ``rate``. + iterative_size (int, optional): The number of tokens to consider in each iteration of compression. Default is 200. + force_context_ids (List[int], optional): List of specific context IDs to always include in the compressed result. Default is None. + force_context_number (int, optional): The number of context sections to forcibly include. Default is None. + use_sentence_level_filter (bool, optional): Whether to apply sentence-level filtering in compression. Default is False. + use_context_level_filter (bool, optional): Whether to apply context-level filtering in compression. Default is True. + use_token_level_filter (bool, optional): Whether to apply token-level filtering in compression. Default is True. + keep_split (bool, optional): Whether to preserve the original separators without compression. Default is False. + keep_first_sentence (int, optional): Number of sentences to forcibly preserve from the start of the context. Default is 0. + keep_last_sentence (int, optional): Number of sentences to forcibly preserve from the end of the context. Default is 0. + keep_sentence_number (int, optional): Total number of sentences to forcibly preserve in the compression. Default is 0. + high_priority_bonus (int, optional): Bonus score for high-priority sentences to influence their likelihood of being retained. Default is 100. + context_budget (str, optional): Token budget for the context-level filtering, expressed as a string to indicate flexibility. Default is "+100". 
+ token_budget_ratio (float, optional): Ratio to adjust token budget during sentence-level filtering. Default is 1.4. + condition_in_question (str, optional): Specific condition to apply to question in the context. Default is "none". + reorder_context (str, optional): Strategy for reordering context in the compressed result. Default is "original". + dynamic_context_compression_ratio (float, optional): Ratio for dynamically adjusting context compression. Default is 0.0. + condition_compare (bool, optional): Whether to enable condition comparison during token-level compression. Default is False. + add_instruction (bool, optional): Whether to add the instruction to the prompt prefix. Default is False. + rank_method (str, optional): Method used for ranking elements during compression. Default is "llmlingua". + concate_question (bool, optional): Whether to concatenate the question to the compressed prompt. Default is True. + + target_context (int, optional): The maximum number of contexts to be achieved. Default is -1, indicating no specific target. + context_level_rate (float, optional): The minimum compression rate target to be achieved in context level. Default is 1.0. + context_level_target_token (float, optional): The maximum number of tokens to be achieved in context level compression. + Default is -1, indicating no specific target. Only used in the coarse-to-fine compression scenario. + force_context_ids (List[int], optional): List of specific context IDs to always include in the compressed result. Default is None. + return_word_label (bool, optional): Whether to return word with corresponding label. Default is False. + word_sep (str, optional): The sep token used in fn_labeled_original_prompt to partition words. Default is "\t\t|\t\t". + label_sep (str, optional): The sep token used in fn_labeled_original_prompt to partition word and label. Default is " ". + token_to_word (str, optional): How to convert token probability to word probability. Default is "mean". 
+ force_tokens (List[str], optional): List of specific tokens to always include in the compressed result. Default is []. + force_reserve_digit (bool, optional): Whether to forcibly reserve tokens that contain digits (0,...,9). Default is False. + drop_consecutive (bool, optional): Whether to drop tokens which are in 'force_tokens' but appear consecutively in compressed prompt. + Default is False. + chunk_end_tokens (List[str], optional): The early stop tokens for segmenting chunk. Default is [".", "\n"], + Returns: + dict: A dictionary containing: + - "compressed_prompt" (str): The resulting compressed prompt. + - "compressed_prompt_list" (List[str]): List of the resulting compressed prompt. Only used in llmlingua2. + - "fn_labeled_original_prompt" (str): original words along with their labels + indicating whether to reserve in compressed prompt, in the format (word label_sep label) + Only used in llmlingua2 when return_word_label = True. + - "origin_tokens" (int): The original number of tokens in the input. + - "compressed_tokens" (int): The number of tokens in the compressed output. + - "ratio" (str): The compression ratio achieved, calculated as the original token number divided by the token number after compression. + - "rate" (str): The compression rate achieved, in a human-readable format. + - "saving" (str): Estimated savings in GPT-4 token usage. 
+ """ + if self.use_llmlingua2: + return self.compress_prompt_llmlingua2( + context, + rate=rate, + target_token=target_token, + use_context_level_filter=use_context_level_filter, + use_token_level_filter=use_token_level_filter, + target_context=target_context, + context_level_rate=context_level_rate, + context_level_target_token=context_level_target_token, + force_context_ids=force_context_ids, + return_word_label=return_word_label, + word_sep=word_sep, + label_sep=label_sep, + token_to_word=token_to_word, + force_tokens=force_tokens, + force_reserve_digit=force_reserve_digit, + drop_consecutive=drop_consecutive, + chunk_end_tokens=chunk_end_tokens, + ) + assert ( + rate <= 1.0 + ), "Error: 'rate' must not exceed 1.0. The value of 'rate' indicates compression rate and must be within the range [0, 1]." + + if not context: + context = [" "] + if isinstance(context, str): + context = [context] + assert not ( + rank_method == "longllmlingua" and not question + ), "In the LongLLMLingua, it is necessary to set a question." 
+ if condition_compare and "_condition" not in condition_in_question: + condition_in_question += "_condition" + if rank_method == "longllmlingua": + if condition_in_question == "none": + condition_in_question = "after" + elif rank_method == "llmlingua": + condition_in_question = ( + "none" + if "_condition" not in condition_in_question + else "none_condition" + ) + origin_tokens = len( + self.oai_tokenizer.encode( + "\n\n".join([instruction] + context + [question]).strip() + ) + ) + context_tokens_length = [self.get_token_length(c) for c in context] + instruction_tokens_length, question_tokens_length = self.get_token_length( + instruction + ), self.get_token_length(question) + if target_token == -1: + target_token = ( + ( + instruction_tokens_length + + question_tokens_length + + sum(context_tokens_length) + ) + * rate + - instruction_tokens_length + - (question_tokens_length if concate_question else 0) + ) + condition_flag = "_condition" in condition_in_question + condition_in_question = condition_in_question.replace("_condition", "") + + if len(context) > 1 and use_context_level_filter: + context, dynamic_ratio, context_used = self.control_context_budget( + context, + context_tokens_length, + target_token, + force_context_ids, + force_context_number, + question, + condition_in_question, + reorder_context=reorder_context, + dynamic_context_compression_ratio=dynamic_context_compression_ratio, + rank_method=rank_method, + context_budget=context_budget, + context_segs=context_segs, + context_segs_rate=context_segs_rate, + context_segs_compress=context_segs_compress, + strict_preserve_uncompressed=strict_preserve_uncompressed, + ) + if context_segs is not None: + context_segs = [context_segs[idx] for idx in context_used] + context_segs_rate = [context_segs_rate[idx] for idx in context_used] + context_segs_compress = [ + context_segs_compress[idx] for idx in context_used + ] + else: + dynamic_ratio = [0.0] * len(context) + + segments_info = [] + if 
use_sentence_level_filter: + context, segments_info = self.control_sentence_budget( + context, + target_token, + keep_first_sentence=keep_first_sentence, + keep_last_sentence=keep_last_sentence, + keep_sentence_number=keep_sentence_number, + high_priority_bonus=high_priority_bonus, + token_budget_ratio=token_budget_ratio, + question=question, + condition_in_question=condition_in_question, + rank_method=rank_method, + context_segs=context_segs, + context_segs_rate=context_segs_rate, + context_segs_compress=context_segs_compress, + ) + elif context_segs is not None: + for context_idx in range(len(context)): + segments_info.append( + [ + (len(seg_text), seg_rate, seg_compress) + for seg_text, seg_rate, seg_compress in zip( + context_segs[context_idx], + context_segs_rate[context_idx], + context_segs_compress[context_idx], + ) + ] + ) + segments_info = [ + self.concate_segment_info(segment_info) for segment_info in segments_info + ] + + if condition_flag: + prefix = question + "\n\n" + instruction if add_instruction else question + if ( + self.get_token_length(prefix + "\n\n") + iterative_size * 2 + > self.max_position_embeddings + ): + tokens = self.tokenizer(prefix, add_special_tokens=False).input_ids + prefix = self.tokenizer.decode( + tokens[: self.prefix_bos_num] + + tokens[ + len(tokens) + - self.max_position_embeddings + + 2 + + self.prefix_bos_num + + 2 * iterative_size : + ] + ) + start = self.get_prefix_length(prefix + "\n\n", context[0]) + context = [prefix] + context + else: + start = 0 + + if use_token_level_filter: + context = self.iterative_compress_prompt( + context, + target_token, + iterative_size=iterative_size, + keep_split=keep_split, + start=start, + dynamic_ratio=dynamic_ratio, + condition_compare=condition_compare, + segments_info=segments_info, + ) + compressed_prompt = ( + self.tokenizer.batch_decode(context[0])[0] + .replace(" ", "") + .replace("", "") + ) + else: + if condition_flag: + context = context[1:] + compressed_prompt = 
"\n\n".join(context) + + res = [] + if instruction: + res.append(instruction) + if compressed_prompt.strip(): + res.append(compressed_prompt) + if question and concate_question: + res.append(question) + + compressed_prompt = "\n\n".join(res) + + compressed_tokens = len(self.oai_tokenizer.encode(compressed_prompt)) + saving = (origin_tokens - compressed_tokens) * 0.06 / 1000 + ratio = 1 if compressed_tokens == 0 else origin_tokens / compressed_tokens + rate = 1 / ratio + return { + "compressed_prompt": compressed_prompt, + "origin_tokens": origin_tokens, + "compressed_tokens": compressed_tokens, + "ratio": f"{ratio:.1f}x", + "rate": f"{rate * 100:.1f}%", + "saving": f", Saving ${saving:.1f} in GPT-4.", + } + + def compress_prompt_llmlingua2( + self, + context: List[str], + rate: float = 0.5, + target_token: int = -1, + use_context_level_filter: bool = False, + use_token_level_filter: bool = True, + target_context: int = -1, + context_level_rate: float = 1.0, + context_level_target_token: int = -1, + force_context_ids: List[int] = [], + return_word_label: bool = False, + word_sep: str = "\t\t|\t\t", + label_sep: str = " ", + token_to_word: str = "mean", + force_tokens: List[str] = [], + force_reserve_digit: bool = False, + drop_consecutive: bool = False, + chunk_end_tokens: List[str] = [".", "\n"], + ): + """ + Compresses the given context, instruction and question. + + Args: + context (List[str]): List of context strings that form the basis of the prompt. + rate (float, optional): The minimum compression rate target to be achieved. Default is 0.5. The actual compression rate + generally exceeds the specified target, but there can be fluctuations due to differences in tokenizers. If specified, + it should be a float greater than or equal to 1.0, representing the target compression rate. + target_token (int, optional): The maximum number of tokens to be achieved. Default is -1, indicating no specific target. 
                The actual number of tokens after compression should generally be less than the specified target_token, but there can
                be fluctuations due to differences in tokenizers. If specified, compression will be based on the target_token as
                the sole criterion, overriding the rate.
            target_context (int, optional): The maximum number of contexts to be achieved. Default is -1, indicating no specific target.
                Only used in the coarse-to-fine compression.
            context_level_rate (float, optional): The minimum compression rate target to be achieved in context level. Default is 1.0.
                Only used in the coarse-to-fine compression.
            context_level_target_token (float, optional): The maximum number of tokens to be achieved in context level compression.
                Default is -1, indicating no specific target. Only used in the coarse-to-fine compression scenario.
            force_context_ids (List[int], optional): List of specific context IDs to always include in the compressed result. Default is None.
            return_word_label (bool, optional): Whether to return each word with its corresponding label. Default is False.
            word_sep (str, optional): The sep token used in fn_labeled_original_prompt to partition words. Default is "\t\t|\t\t".
            label_sep (str, optional): The sep token used in fn_labeled_original_prompt to partition word and label. Default is " ".
            token_to_word (str, optional): How to convert token probability to word probability. Default is "mean".
            force_tokens (List[str], optional): List of specific tokens to always include in the compressed result. Default is [].
            force_reserve_digit (bool, optional): Whether to forcibly reserve tokens that contain digits (0,...,9). Default is False.
            drop_consecutive (bool, optional): Whether to drop tokens which are in 'force_tokens' but appear consecutively in the compressed prompt.
                Default is False.
            chunk_end_tokens (List[str], optional): The early stop tokens for segmenting a chunk. Default is [".", "\n"].
        Returns:
            dict: A dictionary containing:
                - "compressed_prompt" (str): The resulting compressed prompt.
                - "compressed_prompt_list" (List[str]): List of the resulting compressed prompt.
                - "fn_labeled_original_prompt" (str): original words along with their labels
                    indicating whether to reserve in compressed prompt, in the format (word label_sep label)
                - "origin_tokens" (int): The original number of tokens in the input.
                - "compressed_tokens" (int): The number of tokens in the compressed output.
                - "ratio" (str): The compression ratio achieved, in a human-readable format.
                - "rate" (str): The compression rate achieved, in a human-readable format.
                - "saving" (str): Estimated savings in GPT-4 token usage.

        """
        assert len(force_tokens) <= self.max_force_token
        # Multi-token force tokens are mapped to reserved single added tokens so
        # the classifier can treat them atomically; mapped back after compression.
        token_map = {}
        for i, t in enumerate(force_tokens):
            if len(self.tokenizer.tokenize(t)) != 1:
                token_map[t] = self.added_tokens[i]
        chunk_end_tokens = copy.deepcopy(chunk_end_tokens)
        # NOTE(review): appends to the list while iterating it; safe here only
        # because the appended mapped tokens are never token_map keys.
        for c in chunk_end_tokens:
            if c in token_map:
                chunk_end_tokens.append(token_map[c])
        chunk_end_tokens = set(chunk_end_tokens)

        # Accept a bare string as a single-context list; deep-copy before the
        # in-place token_map substitutions below.
        if type(context) == str:
            context = [context]
        context = copy.deepcopy(context)

        if len(context) == 1 and use_context_level_filter:
            use_context_level_filter = False

        # Count original tokens (OpenAI tokenizer, for billing-comparable stats)
        # and split each context into model-window-sized chunks.
        n_original_token = 0
        context_chunked = []
        for i in range(len(context)):
            n_original_token += self.get_token_length(
                context[i], use_oai_tokenizer=True
            )
            for ori_token, new_token in token_map.items():
                context[i] = context[i].replace(ori_token, new_token)
            context_chunked.append(
                self.__chunk_context(context[i], chunk_end_tokens=chunk_end_tokens)
            )

        if use_context_level_filter:
            # want use_context_level_filter but do not specify any parameters in context level?
            # we will set context_level_rate = (rate + 1.0) / 2 if specify rate or target_token * 2 if specify target_token
            if (
                target_context <= 0
                and context_level_rate >= 1.0
                and context_level_target_token <= 0
            ):
                if target_token < 0 and rate < 1.0:
                    context_level_rate = (
                        (rate + 1.0) / 2 if use_token_level_filter else rate
                    )
                if target_token >= 0:
                    context_level_target_token = (
                        target_token * 2 if use_token_level_filter else target_token
                    )

            # Explicit context-count / context-token targets override the rate.
            if target_context >= 0:
                context_level_rate = min(target_context / len(context), 1.0)
            if context_level_target_token >= 0:
                context_level_rate = min(
                    context_level_target_token / n_original_token, 1.0
                )

            # Score each context by classifier probability, keep everything at or
            # above the percentile threshold implied by context_level_rate.
            context_probs, context_words = self.__get_context_prob(
                context_chunked,
                token_to_word=token_to_word,
                force_tokens=force_tokens,
                token_map=token_map,
                force_reserve_digit=force_reserve_digit,
            )

            threshold = np.percentile(
                context_probs, int(100 * (1 - context_level_rate))
            )

            reserved_context = []
            context_label = [False] * len(context_probs)
            for i, p in enumerate(context_probs):
                if p >= threshold or (
                    force_context_ids is not None and i in force_context_ids
                ):
                    reserved_context.append(context_chunked[i])
                    context_label[i] = True
            # Re-derive the token-level rate against what survived the coarse pass.
            n_reserved_token = 0
            for chunks in reserved_context:
                for c in chunks:
                    n_reserved_token += self.get_token_length(c, use_oai_tokenizer=True)
            if target_token >= 0:
                rate = min(target_token / n_reserved_token, 1.0)

            # Fine pass over the reserved contexts; reduce_rate=0 keeps all tokens
            # but still yields per-word labels.
            if use_token_level_filter:
                compressed_context, word_list, word_label_list = self.__compress(
                    reserved_context,
                    reduce_rate=max(0, 1 - rate),
                    token_to_word=token_to_word,
                    force_tokens=force_tokens,
                    token_map=token_map,
                    force_reserve_digit=force_reserve_digit,
                    drop_consecutive=drop_consecutive,
                )
            else:
                compressed_context, word_list, word_label_list = self.__compress(
                    reserved_context,
                    reduce_rate=0,
                    token_to_word=token_to_word,
                    force_tokens=force_tokens,
                    token_map=token_map,
                    force_reserve_digit=force_reserve_digit,
                    drop_consecutive=drop_consecutive,
                )

            n_compressed_token = 0
            for c in compressed_context:
                n_compressed_token += self.get_token_length(c, use_oai_tokenizer=True)
            saving = (n_original_token - n_compressed_token) * 0.06 / 1000
            ratio = (
                1 if n_compressed_token == 0 else n_original_token / n_compressed_token
            )
            res = {
                "compressed_prompt": "\n\n".join(compressed_context),
                "compressed_prompt_list": compressed_context,
                "origin_tokens": n_original_token,
                "compressed_tokens": n_compressed_token,
                "ratio": f"{ratio:.1f}x",
                "rate": f"{1 / ratio * 100:.1f}%",
                "saving": f", Saving ${saving:.1f} in GPT-4.",
            }
            if return_word_label:
                # Merge labels: kept contexts use the fine-pass labels (word_list
                # is indexed by kept-context order via j), dropped ones get 0s.
                words = []
                labels = []
                j = 0
                for i in range(len(context)):
                    if context_label[i]:
                        words.extend(word_list[j])
                        labels.extend(word_label_list[j])
                        j += 1
                    else:
                        words.extend(context_words[i])
                        labels.extend([0] * len(context_words[i]))
                word_label_lines = word_sep.join(
                    [f"{word}{label_sep}{label}" for word, label in zip(words, labels)]
                )
                res["fn_labeled_original_prompt"] = word_label_lines
            return res

        # No coarse pass: compress all chunks directly.
        if target_token > 0:
            rate = min(target_token / n_original_token, 1.0)

        if use_token_level_filter:
            compressed_context, word_list, word_label_list = self.__compress(
                context_chunked,
                reduce_rate=max(0, 1 - rate),
                token_to_word=token_to_word,
                force_tokens=force_tokens,
                token_map=token_map,
                force_reserve_digit=force_reserve_digit,
                drop_consecutive=drop_consecutive,
            )
        else:
            compressed_context, word_list, word_label_list = self.__compress(
                context_chunked,
                reduce_rate=0,
                token_to_word=token_to_word,
                force_tokens=force_tokens,
                token_map=token_map,
                force_reserve_digit=force_reserve_digit,
                drop_consecutive=drop_consecutive,
            )

        n_compressed_token = 0
        for c in compressed_context:
            n_compressed_token += self.get_token_length(c, use_oai_tokenizer=True)
        saving = (n_original_token - n_compressed_token) * 0.06 / 1000
        ratio = 1 if n_compressed_token == 0 else n_original_token / n_compressed_token
        res = {
            "compressed_prompt": "\n\n".join(compressed_context),
            "compressed_prompt_list": compressed_context,
            "origin_tokens": n_original_token,
            "compressed_tokens": n_compressed_token,
            "ratio": f"{ratio:.1f}x",
            "rate": f"{1 / ratio * 100:.1f}%",
            "saving": f", Saving ${saving:.1f} in GPT-4.",
        }
        if return_word_label:
            # Flatten per-chunk word/label lists into one labeled sequence.
            words = []
            labels = []
            for w_list, l_list in zip(word_list, word_label_list):
                words.extend(w_list)
                labels.extend(l_list)

            word_label_lines = word_sep.join(
                [f"{word}{label_sep}{label}" for word, label in zip(words, labels)]
            )
            res["fn_labeled_original_prompt"] = word_label_lines
        return res

    def get_token_length(
        self,
        text: str,
        add_special_tokens: bool = True,
        use_oai_tokenizer: bool = False,
    ):
        """Return the token count of `text` using the model tokenizer, or the
        OpenAI tokenizer when `use_oai_tokenizer` is True (in which case
        `add_special_tokens` is ignored)."""
        if use_oai_tokenizer:
            return len(self.oai_tokenizer.encode(text))
        else:
            return len(
                self.tokenizer(text, add_special_tokens=add_special_tokens).input_ids
            )

    def get_prefix_length(self, prefix: str, text: str):
        """Return how many tokens of `prefix + text` belong to `prefix` when the
        two are tokenized jointly (token boundaries can merge across the seam).

        Scans forward from a lower bound until the decoded prefix matches.
        NOTE(review): if no exact decode match is found, the loop falls through
        with the last index and relies on the assert below to catch mismatches.
        """
        possible_prefix_token = max(self.get_token_length(prefix, False) - 3, 1)
        full_input_ids = self.tokenizer(
            prefix + text[:100], add_special_tokens=False
        ).input_ids
        for i in range(possible_prefix_token, len(full_input_ids)):
            cur_prefix = self.tokenizer.decode(full_input_ids[:i])
            if cur_prefix == prefix:
                break
        assert self.tokenizer.decode(full_input_ids[i:]) == text[:100]
        return i

    def get_condition_ppl(
        self,
        text: str,
        question: str,
        condition_in_question: str = "none",
        granularity: str = "sentence",
    ):
        """Return the perplexity of `text`, optionally conditioned on `question`
        placed before ("before") or after ("after") the text; "none" is
        unconditional. NOTE(review): any other mode implicitly returns None.
        """
        if condition_in_question == "none":
            return self.get_ppl(text, granularity=granularity)
        elif condition_in_question == "before":
            return self.get_ppl(
                question + text,
                granularity=granularity,
                condition_mode="after",
                condition_pos_id=self.get_token_length(question) - 1,
            )
        elif condition_in_question == "after":
            return self.get_ppl(
                text + question,
                granularity=granularity,
                condition_mode="after",
                condition_pos_id=self.get_token_length(text) - 1,
            )

    def get_dynamic_compression_ratio(
        self,
        context: list,
        target_token: float,
        iterative_size: int,
        dynamic_ratio: list,
        start: int,
        seg_info: List[List[tuple]] = None,
    ):
        """Partition the concatenated contexts into `iterative_size` windows and
        assign each span a keep-ratio: the global budget ratio `tau` shifted by
        that context's `dynamic_ratio` delta, clamped to [0, 1].

        Returns a list (one entry per window) of (span_length, keep_ratio)
        tuples. `seg_info` is unused here (see the structured variant).
        """
        def get_ratio(base: float, delta: float):
            # Clamp base + delta into the valid [0, 1] keep-ratio range.
            return max(min(1, base + delta), 0)

        # +2 accounts for the "\n\n" joiner tokens between contexts; drop the
        # conditioning prefix (first entry) when `start` is set.
        context_length = [self.get_token_length(ii, False) + 2 for ii in context]
        if start:
            context_length = context_length[1:]
        tau = target_token / (sum(context_length) + 1)
        res, idx, last, last_target = [], 0, 1, []
        while idx < len(context_length):
            if last + context_length[idx] >= iterative_size:
                # Current context fills the window: close it and spill the
                # remainder into whole windows plus a trailing partial one.
                last_target.append(
                    (iterative_size - last, get_ratio(tau, dynamic_ratio[idx]))
                )
                res.append(last_target)
                last = last + context_length[idx] - iterative_size
                if last > iterative_size:
                    k = last // iterative_size
                    res.extend(
                        [[(iterative_size, get_ratio(tau, dynamic_ratio[idx]))]] * k
                    )
                    last -= k * iterative_size

                last_target = (
                    [(last, get_ratio(tau, dynamic_ratio[idx]))] if last else []
                )
            else:
                last += context_length[idx]
                last_target.append(
                    (context_length[idx], get_ratio(tau, dynamic_ratio[idx]))
                )
            idx += 1
        if last_target:
            res.append(last_target)
        return res

    def get_structured_dynamic_compression_ratio(
        self,
        context: list,
        iterative_size: int,
        dynamic_ratio: list,
        start: int,
        seg_info: List[List[tuple]] = None,
    ):
        """Like get_dynamic_compression_ratio, but rates come from per-segment
        metadata in `seg_info` (length, rate, compress-flag triples) rather than
        a single per-context delta."""
        if start:
            # First entry is the conditioning prefix; it carries no seg_info.
            pure_context = context[1:]
        else:
            pure_context = context
        global_dynamic_rate, global_dynamic_compress, segments = [], [], []
        for context_idx, text in enumerate(pure_context):
            text_seen = 0
            for seg_idx, (seg_len, seg_rate, seg_compress) in enumerate(
                seg_info[context_idx]
            ):
                seg_text = text[text_seen : text_seen + seg_len]
                # Fold the inter-context "\n\n" joiner into the last segment of
                # every context except the final one.
                if (
                    seg_idx == len(seg_info[context_idx]) - 1
                    and context_idx != len(pure_context) - 1
                ):
                    seg_text += "\n\n"
                segments.append(seg_text)
                if seg_compress:
+ global_dynamic_rate.append(seg_rate) + else: + global_dynamic_rate.append(1.0) + global_dynamic_compress.append(seg_compress) + text_seen += seg_len + origin_text = "\n\n".join(pure_context) + assert len("".join(segments)) == len(origin_text) + assert len(segments) == len(global_dynamic_rate) == len(global_dynamic_compress) + + text_input_ids = self.tokenizer( + "\n\n".join(context), add_special_tokens=False + ).input_ids[start:] + assert self.tokenizer.decode(text_input_ids) == origin_text + dynamic_compression_ratio = self.token_segment( + text_input_ids, + iterative_size, + segments, + global_dynamic_rate, + global_dynamic_compress, + ) + return dynamic_compression_ratio + + def token_segment( + self, + text_input_ids: List[int], + iterative_size: int, + segments: List[str], + global_dynamic_rate: List[float], + global_dynamic_compress: List[bool], + ): + decode_window = 3 + seg_idx, seg_seen, token_seen_num, last_rate = 0, 0, 0, -1 + dynamic_compression_rate, local_compresssion_rate = [], [] + for i in range(len(text_input_ids)): + if i < decode_window: + id_pre, id_cur = text_input_ids[:i], text_input_ids[: i + 1] + else: + id_pre, id_cur = ( + text_input_ids[i - decode_window + 1 : i], + text_input_ids[i - decode_window + 1 : i + 1], + ) + cur_word = self.tokenizer.decode(id_cur)[ + len(self.tokenizer.decode(id_pre)) : + ] + cur_word_len = len(cur_word) + if cur_word_len and cur_word_len >= len(segments[seg_idx]) - seg_seen: + possible_rate, possible_compress = [], [] + while ( + cur_word_len and cur_word_len >= len(segments[seg_idx]) - seg_seen + ): + possible_rate.append(global_dynamic_rate[seg_idx]) + possible_compress.append(global_dynamic_compress[seg_idx]) + cur_word_len -= len(segments[seg_idx]) - seg_seen + seg_idx += 1 + seg_seen = 0 + if cur_word_len: + possible_rate.append(global_dynamic_rate[seg_idx]) + possible_compress.append(global_dynamic_compress[seg_idx]) + new_rate = 1.0 if False in possible_compress else min(possible_rate) + else: + 
new_rate = global_dynamic_rate[seg_idx] + if new_rate != last_rate and i - token_seen_num: + local_compresssion_rate.append((i - token_seen_num, last_rate)) + token_seen_num = i + last_rate = new_rate + seg_seen += cur_word_len + if (i + 1) % iterative_size == 0: + if token_seen_num != i + 1: + local_compresssion_rate.append((i + 1 - token_seen_num, last_rate)) + token_seen_num = i + 1 + dynamic_compression_rate.append(local_compresssion_rate[:]) + local_compresssion_rate = [] + if token_seen_num != len(text_input_ids): + local_compresssion_rate.append( + (len(text_input_ids) - token_seen_num, last_rate) + ) + if local_compresssion_rate != []: + dynamic_compression_rate.append(local_compresssion_rate[:]) + return dynamic_compression_rate + + def control_context_budget( + self, + context: List[str], + context_tokens_length: List[int], + target_token: float, + force_context_ids: List[int] = None, + force_context_number: int = None, + question: str = "", + condition_in_question: str = "none", + reorder_context: str = "original", + dynamic_context_compression_ratio: float = 0.0, + rank_method: str = "longllmlingua", + context_budget: str = "+100", + context_segs: List[List[str]] = None, + context_segs_rate: List[List[float]] = None, + context_segs_compress: List[List[bool]] = None, + strict_preserve_uncompressed: bool = True, + ): + demostrations_sort = self.get_rank_results( + context, + question, + rank_method, + condition_in_question, + context_tokens_length, + ) + + if target_token < 0: + target_token = 100 + target_token = eval("target_token" + context_budget) + res = [] + used = force_context_ids if force_context_ids is not None else [] + if context_segs is not None and strict_preserve_uncompressed: + for idx, _ in enumerate(context): + if False in context_segs_compress[idx] and idx not in used: + used.append(idx) + + self.context_idxs.append([x for idx, (x, _) in enumerate(demostrations_sort)]) + for idx, _ in demostrations_sort: + if idx >= 
len(context_tokens_length): + continue + target_token -= context_tokens_length[idx] + if idx not in used: + used.append(idx) + if target_token < 0 or ( + force_context_number is not None and len(res) >= force_context_number + ): + break + original_used = used + if reorder_context == "original": + used = sorted(used) + elif reorder_context == "two_stage": + l, r = [_ for idx, _ in enumerate(used) if idx % 2 == 0], [ + _ for idx, _ in enumerate(used) if idx % 2 == 1 + ] + used = l + r[::-1] + + if dynamic_context_compression_ratio > 0: + N = len(used) + dynamic_ratio = [ + i * (abs(dynamic_context_compression_ratio) / (N - 1)) if N > 1 else 0 + for i in range(-(N - 1), N, 2) + ][::-1] + dynamic_ratio_map = {i: j for i, j in zip(original_used, dynamic_ratio)} + dynamic_ratio = [dynamic_ratio_map[i] for i in used] + else: + dynamic_ratio = [0.0] * len(used) + + res = [context[idx] for idx in used if idx < len(context)] + return res, dynamic_ratio, used + + def control_sentence_budget( + self, + context: List[str], + target_token: float, + keep_first_sentence: int = 0, + keep_last_sentence: int = 0, + keep_sentence_number: int = 0, + high_priority_bonus: int = 100, + token_budget_ratio: float = 1.4, + question: str = "", + condition_in_question: str = "none", + rank_method: str = "longllmlingua", + context_segs: List[List[str]] = None, + context_segs_rate: List[List[float]] = None, + context_segs_compress: List[List[bool]] = None, + ): + def keep_sentence(dem_idx: int, sent_keep: int): + idxs = sorted(dem_g[dem_idx], key=lambda x: sentence_ppl[x])[:sent_keep] + for idx in idxs: + sentence_ppl[idx] += high_priority_bonus + + def sync_sentence(sentences, text): + seen_text = 0 + sentence_num = len(sentences) + new_sentences = [] + for i, s in enumerate(sentences): + assert s == text[seen_text : seen_text + len(s)] + if i == sentence_num - 1: + new_sentences.append(text[seen_text:]) + break + next_sentence_start = text.find( + sentences[i + 1][:5], seen_text + len(s) + ) + 
new_sentences.append(text[seen_text:next_sentence_start]) + seen_text = next_sentence_start + assert "".join(new_sentences) == text + return new_sentences + + sentences = [nltk.sent_tokenize(c) for c in context] + sentences = [sync_sentence(s, c) for s, c in zip(sentences, context)] + dem_g, s2de, idx = defaultdict(set), defaultdict(int), 0 + for idx_d, s in enumerate(sentences): + for _ in s: + dem_g[idx_d].add(idx) + s2de[idx] = idx_d + idx += 1 + + if context_segs is not None: + sen2seg_ratio = {} + idx = 0 + for idx_d, sentences_each_context in enumerate(sentences): + segments_length = [len(s) for s in context_segs[idx_d]] + seg_idx, cur_seg_seen = 0, 0 + for sentence in sentences_each_context: + sentence_seg_ratio = [] + remain = len(sentence) + while remain: + if segments_length[seg_idx] - cur_seg_seen <= remain: + new_seg_len = segments_length[seg_idx] - cur_seg_seen + sentence_seg_ratio.append( + ( + new_seg_len, + context_segs_rate[idx_d][seg_idx], + context_segs_compress[idx_d][seg_idx], + ) + ) + seg_idx += 1 + cur_seg_seen = 0 + remain -= new_seg_len + else: + sentence_seg_ratio.append( + ( + remain, + context_segs_rate[idx_d][seg_idx], + context_segs_compress[idx_d][seg_idx], + ) + ) + cur_seg_seen += remain + remain = 0 + sen2seg_ratio[idx] = sentence_seg_ratio + idx += 1 + + context_sentences = [s for ii in sentences for s in ii] + sentence_tokens_length = [ + self.get_token_length(sentence) for sentence in context_sentences + ] + N = len(context_sentences) + flags = list(range(len(context_sentences))) + if len(sentence_tokens_length) == 1: + segments_info = [] + if context_segs is not None: + segments_info.append(sen2seg_ratio[0]) + return context, segments_info + if rank_method == "longllmlingua": + sentence_ppl = [ + self.get_condition_ppl(sentence, question, condition_in_question) + .cpu() + .numpy() + .item() + for sentence in context_sentences + ] + if keep_first_sentence: + sentence_ppl[:keep_first_sentence] = [ + ii + high_priority_bonus + 
for ii in sentence_ppl[:keep_first_sentence] + ] + if keep_last_sentence: + sentence_ppl[-keep_last_sentence:] = [ + ii + high_priority_bonus + for ii in sentence_ppl[-keep_last_sentence:] + ] + if keep_sentence_number: + for dem_idx in range(len(sentences)): + keep_sentence(dem_idx, keep_sentence_number) + sort_direct = -1 if condition_in_question == "none" else 1 + sent_sort = sorted( + enumerate(sentence_ppl), key=lambda x: sort_direct * x[1] + ) + else: + sent_sort = self.get_rank_results( + context_sentences, + question, + rank_method, + condition_in_question, + [0] * len(context_sentences), + ) + + sentence_flags = [False] * N + if target_token < 0: + target_token = 100 + target_token *= token_budget_ratio + res = [] + for idx, _ in sent_sort: + idx = flags[idx] + target_token -= sentence_tokens_length[idx] + sentence_flags[idx] = True + if target_token < 0: + break + + if context_segs is not None: + for idx in range(N): + preserved = [sen_seg_info[2] for sen_seg_info in sen2seg_ratio[idx]] + if False in preserved: + sentence_flags[idx] = True + + idx = 0 + res = [] + new_segments_info = [] + for s in sentences: + tmp = [jj for ii, jj in enumerate(s) if sentence_flags[idx + ii]] + res.append("".join(tmp)) + if context_segs is not None: + segment_ratio = [] + for ii in range(len(s)): + if sentence_flags[idx + ii]: + segment_ratio.extend(sen2seg_ratio[idx + ii]) + new_segments_info.append(segment_ratio) + idx += len(s) + return res, new_segments_info + + def get_compressed_input( + self, + loss, + input_ids, + attention_mask, + end=200, + iterative_size=200, + threshold=0.5, + keep_flag=None, + split_token_id: int = 13, + start: int = 0, + self_loss=None, + self_input_ids=None, + self_attention_mask=None, + ): + if self_loss is not None: + need_idx = torch.concat( + [ + loss[:start] > 0, + self_loss[: loss[start:].shape[0]] - loss[start:] > threshold, + loss[:1] > 0, + ] + ) + else: + need_idx = torch.concat([loss > threshold, loss[:1] > 0]) + need_idx[end:] = 
1 + need_idx[: end - iterative_size] = 1 + loss = loss[need_idx[:-1]] + if self_loss is not None: + if need_idx.shape[0] < self_loss.shape[0] + start + 1: + need_idx = torch.cat( + [ + need_idx, + torch.ones( + self_loss.shape[0] - need_idx.shape[0] + start + 1, + dtype=torch.bool, + ).to(need_idx.device), + ] + ) + self_loss = self_loss[need_idx[start:-1]] + + if need_idx.shape[0] < input_ids.shape[1]: + need_idx = torch.cat( + [ + need_idx, + torch.ones( + input_ids.shape[1] - need_idx.shape[0], dtype=torch.bool + ).to(need_idx.device), + ] + ) + elif need_idx.shape[0] > input_ids.shape[1]: + need_idx = need_idx[: input_ids.shape[1]] + + if keep_flag is not None: + need_idx[keep_flag == 1] = 1 + last = -1 + if keep_flag is not None: + for ii in range(max(0, end - iterative_size), end): + if need_idx[ii] != 1: + continue + now = input_ids[0][ii].detach().cpu().item() + if ( + now == split_token_id + and last == split_token_id + and keep_flag[ii].detach().cpu().item() == 0 + ): + need_idx[ii] = 0 + else: + last = now + compressed_input_ids = input_ids[attention_mask == 1][need_idx].unsqueeze(0) + compressed_attention_mask = attention_mask[attention_mask == 1][ + need_idx + ].unsqueeze(0) + + if self_loss is not None: + self_compressed_input_ids = self_input_ids[self_attention_mask == 1][ + need_idx[start:] + ].unsqueeze(0) + self_compressed_attention_mask = self_attention_mask[ + self_attention_mask == 1 + ][need_idx[start:]].unsqueeze(0) + else: + self_compressed_input_ids, self_compressed_attention_mask = None, None + if keep_flag is not None: + if len(keep_flag) > len(need_idx): + keep_flag = torch.cat( + [ + keep_flag[:start], + keep_flag[start : len(need_idx) + start][need_idx], + keep_flag[start + len(need_idx) :], + ] + ) + else: + keep_flag = keep_flag[need_idx] + end -= (need_idx[:end] == 0).sum() + return ( + compressed_input_ids, + compressed_attention_mask, + keep_flag, + end, + loss, + self_loss, + self_compressed_input_ids, + 
self_compressed_attention_mask, + ) + + def get_estimate_threshold_base_distribution( + self, ppl, ratio: float, condition_flag: bool = False + ): + if ratio == 1.0: + return float("-inf") + ppl = ppl[ppl != 10000] + target_token = max(0, min(len(ppl) - 1, int(len(ppl) * ratio) - 1)) + return ( + ppl.sort(descending=not condition_flag) + .values[target_token] + .detach() + .cpu() + .item() + ) + + def iterative_compress_prompt( + self, + context: List[str], + target_token: float, + iterative_size: int = 200, + keep_split: bool = False, + split_token_id: int = 13, + start: int = 0, + dynamic_ratio: list = None, + condition_compare: bool = False, + segments_info: List[List[tuple]] = None, + ): + if segments_info is None or segments_info == []: + iterative_ratios = self.get_dynamic_compression_ratio( + context, target_token, iterative_size, dynamic_ratio, start + ) + else: + iterative_ratios = self.get_structured_dynamic_compression_ratio( + context, iterative_size, dynamic_ratio, start, segments_info + ) + context = "\n\n".join(context) + tokenized_text = self.tokenizer( + context, return_tensors="pt", add_special_tokens=False + ) + input_ids = tokenized_text["input_ids"].to(self.device) + attention_mask = tokenized_text["attention_mask"].to(self.device) + + N = (attention_mask == 1).sum() + compressed_input_ids, compressed_attention_mask = input_ids, attention_mask + if condition_compare: + self_input_ids, self_attention_mask = ( + input_ids[:, start:], + attention_mask[:, start:], + ) + self_compressed_input_ids, self_compressed_attention_mask = ( + self_input_ids, + self_attention_mask, + ) + + end = min(iterative_size + start, compressed_input_ids.shape[1]) + threshold, keep_flag = None, None + if keep_split: + input_ids_numpy = input_ids.cpu().detach().numpy()[0] + N = len(input_ids_numpy) + keep_flag = [ + int( + ( + ii > 0 + and input_ids_numpy[ii] == split_token_id + and input_ids_numpy[ii - 1] == split_token_id + ) + or ( + ii < N - 1 + and 
input_ids_numpy[ii] == split_token_id + and input_ids_numpy[ii + 1] == split_token_id + ) + ) + for ii in range(N) + ] + keep_flag = torch.tensor(keep_flag).to(self.device) + past_key_values, past_loss, ready_end = None, None, 0 + self_past_key_values, self_past_loss, self_ready_end = None, None, 0 + pop_compressed_input_ids, pop_self_compressed_input_ids = None, None + idx = 0 + while end <= compressed_input_ids.shape[1]: + if end > self.max_position_embeddings and past_key_values is not None: + # KV-Cache Compression + e, s = end - self.max_position_embeddings, min( + self.cache_bos_num + start, self.max_position_embeddings + ) + if pop_compressed_input_ids is None: + pop_compressed_input_ids = compressed_input_ids[:, :e] + else: + pop_compressed_input_ids = torch.cat( + [pop_compressed_input_ids, compressed_input_ids[:, :e]], dim=-1 + ) + compressed_input_ids = compressed_input_ids[:, e:] + compressed_attention_mask = compressed_attention_mask[:, e:] + past_key_values = [ + [ + torch.cat([k[..., :s, :], k[..., s + e :, :]], dim=-2), + torch.cat([v[..., :s, :], v[..., s + e :, :]], dim=-2), + ] + for k, v in past_key_values + ] + if keep_flag is not None: + keep_flag = keep_flag[e:] + end, ready_end = end - e, ready_end - e + if condition_compare: + s = min(s, self_past_key_values[0][0].shape[2] - e) + self_ready_end -= e + if pop_self_compressed_input_ids is None: + pop_self_compressed_input_ids = self_compressed_input_ids[:, :e] + else: + pop_self_compressed_input_ids = torch.cat( + [ + pop_self_compressed_input_ids, + self_compressed_input_ids[:, :e], + ], + dim=-1, + ) + self_compressed_input_ids = self_compressed_input_ids[:, e:] + self_compressed_attention_mask = self_compressed_attention_mask[ + :, e: + ] + self_past_key_values = [ + [ + torch.cat([k[..., :s, :], k[..., s + e :, :]], dim=-2), + torch.cat([v[..., :s, :], v[..., s + e :, :]], dim=-2), + ] + for k, v in self_past_key_values + ] + + loss, past_key_values = self.get_ppl( + "", + "token", + 
compressed_input_ids, + compressed_attention_mask, + past_key_values=past_key_values, + return_kv=True, + end=end if idx else None, + ) + if loss.shape[0] == 0: + break + if past_loss is not None: + if end - 1 > len(past_loss): + past_loss = torch.cat( + [past_loss, torch.zeros_like(loss)[: end - 1 - len(past_loss)]] + ) + past_loss[ready_end : end - 1] = loss + loss = past_loss + else: + past_loss = loss + if idx: + past_key_values = [ + [k[:, :, : end - iterative_size], v[:, :, : end - iterative_size]] + for k, v in past_key_values + ] + else: + past_key_values = None + + if condition_compare: + self_loss, self_past_key_values = self.get_ppl( + "", + "token", + self_compressed_input_ids, + self_compressed_attention_mask, + past_key_values=self_past_key_values, + return_kv=True, + end=end - start if idx else None, + ) + if self_past_loss is not None: + if end - start - 1 > len(self_past_loss): + self_past_loss = torch.cat( + [ + self_past_loss, + torch.zeros_like(self_loss)[ + : end - 1 - start - len(self_past_loss) + ], + ] + ) + self_past_loss[self_ready_end : end - start - 1] = self_loss + self_loss = self_past_loss + else: + self_past_loss = self_loss + if idx: + self_past_key_values = [ + [ + k[:, :, : end - iterative_size - start], + v[:, :, : end - iterative_size - start], + ] + for k, v in self_past_key_values + ] + else: + self_past_key_values = None + + self_ready_end = ( + end - start - iterative_size if not (start and idx == 0) else 0 + ) + ready_end = end - iterative_size if not (start and idx == 0) else 0 + + for delta_end, ratio in iterative_ratios[idx]: + loss = past_loss + if condition_compare: + self_loss = self_past_loss + threshold = self.get_estimate_threshold_base_distribution( + self_loss[: loss[start:].shape[0]] - loss[start:], ratio, False + ) + else: + threshold = self.get_estimate_threshold_base_distribution( + loss, ratio, False + ) + + ( + compressed_input_ids, + compressed_attention_mask, + keep_flag, + end, + past_loss, + 
self_past_loss, + self_compressed_input_ids, + self_compressed_attention_mask, + ) = self.get_compressed_input( + loss, + compressed_input_ids, + compressed_attention_mask, + end - iterative_size + delta_end, + iterative_size=delta_end, + threshold=threshold, + keep_flag=keep_flag, + split_token_id=split_token_id, + start=start, + self_loss=self_loss if condition_compare else None, + self_input_ids=( + self_compressed_input_ids if condition_compare else None + ), + self_attention_mask=( + self_compressed_attention_mask if condition_compare else None + ), + ) + end += iterative_size + idx += 1 + if pop_compressed_input_ids is not None: + compressed_input_ids = torch.cat( + [pop_compressed_input_ids, compressed_input_ids], dim=-1 + ) + return compressed_input_ids[:, start:], compressed_attention_mask[:, start:] + + def recover( + self, + original_prompt: str, + compressed_prompt: str, + response: str, + ): + def match_from_compressed(response_word): + response_input_ids = self.tokenizer( + response_word, add_special_tokens=False + )["input_ids"] + response_set, response_c = set(response_input_ids), defaultdict(list) + for idx in range(M): + if original_input_ids[idx] in response_set: + response_c[original_input_ids[idx]].append(idx) + res, res_min, res_c = None, float("inf"), 1 + n = len(response_input_ids) + for l in response_c[response_input_ids[0]]: + x, y, c = 0, l, 1 + for x in range(1, n): + idx = bisect.bisect_right(response_c[response_input_ids[x]], y) + if ( + idx >= len(response_c[response_input_ids[x]]) + or response_c[response_input_ids[x]][idx] - y > 10 + ): + continue + c += 1 + y = response_c[response_input_ids[x]][idx] + if c > res_c: + res_c = c + res_min = y - l + 1 + res = (l, y + 1) + elif c == res_c and y - l + 1 < res_min: + res_min = y - l + 1 + res = (l, y + 1) + + if res is None: + return response_word + # while l > 0 and not self.tokenizer.convert_ids_to_tokens(original_input_ids[l]).startswith("_"): + # l -= 1 + # while r < M - 1 and not 
self.tokenizer.convert_ids_to_tokens(original_input_ids[l]).startswith("_"): + # l -= 1 + return self.tokenizer.decode(original_input_ids[res[0] : res[1]]) + + response_words = response.split(" ") + + original_input_ids = self.tokenizer(original_prompt, add_special_tokens=False)[ + "input_ids" + ] + N, M = len(response_words), len(original_input_ids) + recovered_response_words = [] + l = 0 + while l < N: + if response_words[l] not in compressed_prompt: + recovered_response_words.append(response_words[l]) + l += 1 + continue + r = l + while ( + r + 1 < N and " ".join(response_words[l : r + 2]) in compressed_prompt + ): + r += 1 + + match_words = match_from_compressed(" ".join(response_words[l : r + 1])) + recovered_response_words.append(match_words) + l = r + 1 + return " ".join(recovered_response_words) + + def get_rank_results( + self, + context: list, + question: str, + rank_method: str, + condition_in_question: str, + context_tokens_length: list, + ): + def get_distance_bm25(corpus, query): + from rank_bm25 import BM25Okapi + + tokenized_corpus = [doc.split(" ") for doc in corpus] + bm25 = BM25Okapi(tokenized_corpus) + tokenized_query = query.split(" ") + doc_scores = bm25.get_scores(tokenized_query) + idx = [(ii, 0) for ii in (-doc_scores).argsort()] + return idx + + def get_distance_gzip(corpus, query): + def get_score(x, y): + cx, cy = len(gzip.compress(x.encode())), len(gzip.compress(y.encode())) + cxy = len(gzip.compress(f"{x} {y}".encode())) + return (cxy - min(cx, cy)) / max(cx, cy) + + import gzip + + doc_scores = [get_score(doc, query) for doc in corpus] + idx = [(ii, 0) for ii in np.argsort(doc_scores)] + return idx + + def get_distance_sentbert(corpus, query): + from sentence_transformers import SentenceTransformer, util + + if self.retrieval_model is None or self.retrieval_model_name != rank_method: + self.retrieval_model = SentenceTransformer("multi-qa-mpnet-base-dot-v1") + self.retrieval_model_name = rank_method + doc_embeds = 
self.retrieval_model.encode(corpus) + query = self.retrieval_model.encode(query) + doc_scores = -util.dot_score(doc_embeds, query).cpu().numpy().reshape(-1) + idx = [(ii, 0) for ii in np.argsort(doc_scores)] + return idx + + def get_distance_openai(corpus, query): + import openai + from sentence_transformers import util + + openai.api_key = self.open_api_config.get("api_key", "") + openai.api_base = self.open_api_config.get( + "api_base", "https://api.openai.com/v1" + ) + openai.api_type = self.open_api_config.get("api_type", "open_ai") + openai.api_version = self.open_api_config.get("api_version", "2023-05-15") + engine = self.open_api_config.get("engine", "text-embedding-ada-002") + + def get_embed(text): + return openai.Embedding.create( + input=[text.replace("\n", " ")], engine=engine + )["data"][0]["embedding"] + + doc_embeds = [get_embed(i) for i in corpus] + query = get_embed(query) + doc_scores = -util.dot_score(doc_embeds, query).cpu().numpy().reshape(-1) + idx = [(ii, 0) for ii in np.argsort(doc_scores)] + return idx + + def get_distance_sentbert_bge(corpus, query): + from sentence_transformers import SentenceTransformer, util + + if self.retrieval_model is None or self.retrieval_model_name != rank_method: + self.retrieval_model = SentenceTransformer("BAAI/bge-large-en-v1.5") + self.retrieval_model_name = rank_method + doc_embeds = self.retrieval_model.encode( + [i for i in corpus], normalize_embeddings=True + ) + query = self.retrieval_model.encode(query, normalize_embeddings=True) + doc_scores = -util.dot_score(doc_embeds, query).cpu().numpy().reshape(-1) + idx = [(ii, 0) for ii in np.argsort(doc_scores)] + return idx + + def get_distance_bge_ranker(corpus, query): + from transformers import AutoModelForSequenceClassification, AutoTokenizer + + pairs = [[i, query] for i in corpus] + if self.retrieval_model is None or self.retrieval_model_name != rank_method: + tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-reranker-large") + model = ( + 
AutoModelForSequenceClassification.from_pretrained( + "BAAI/bge-reranker-large" + ) + .eval() + .to(self.device) + ) + self.retrieval_model = [tokenizer, model] + self.retrieval_model_name = rank_method + with torch.no_grad(): + inputs = self.retrieval_model[0]( + pairs, + padding=True, + truncation=True, + return_tensors="pt", + max_length=512, + ).to(self.device) + scores = ( + self.retrieval_model[1](**inputs, return_dict=True) + .logits.view( + -1, + ) + .float() + ) + idx = [(ii, 0) for ii in np.argsort(-scores.cpu())] + return idx + + def get_distance_bge_llmembedder(corpus, query): + from transformers import AutoModel, AutoTokenizer + + if self.retrieval_model is None or self.retrieval_model_name != rank_method: + tokenizer = AutoTokenizer.from_pretrained("BAAI/llm-embedder") + model = ( + AutoModel.from_pretrained("BAAI/llm-embedder") + .eval() + .to(self.device) + ) + self.retrieval_model = [tokenizer, model] + self.retrieval_model_name = rank_method + + instruction_qa_query = ( + "Represent this query for retrieving relevant documents: " + ) + instruction_qa_key = "Represent this document for retrieval: " + queries = [instruction_qa_query + query for _ in corpus] + keys = [instruction_qa_key + key for key in corpus] + with torch.no_grad(): + query_inputs = self.retrieval_model[0]( + queries, + padding=True, + truncation=True, + return_tensors="pt", + max_length=512, + ).to(self.device) + key_inputs = self.retrieval_model[0]( + keys, + padding=True, + truncation=True, + return_tensors="pt", + max_length=512, + ).to(self.device) + query_outputs = self.retrieval_model[1](**query_inputs) + key_outputs = self.retrieval_model[1](**key_inputs) + # CLS pooling + query_embeddings = query_outputs.last_hidden_state[:, 0] + key_embeddings = key_outputs.last_hidden_state[:, 0] + # Normalize + query_embeddings = torch.nn.functional.normalize( + query_embeddings, p=2, dim=1 + ) + key_embeddings = torch.nn.functional.normalize( + key_embeddings, p=2, dim=1 + ) + 
similarity = query_embeddings @ key_embeddings.T + idx = [(ii, 0) for ii in np.argsort(-similarity[0].cpu())] + return idx + + def get_distance_jinza(corpus, query): + from numpy.linalg import norm + from transformers import AutoModel + + def cos_sim(a, b): + return (a @ b.T) / (norm(a) * norm(b)) + + if self.retrieval_model is None or self.retrieval_model_name != rank_method: + model = ( + AutoModel.from_pretrained( + "jinaai/jina-embeddings-v2-base-en", trust_remote_code=True + ) + .eval() + .to(self.device) + ) + self.retrieval_model = model + self.retrieval_model_name = rank_method + + doc_embeds = self.retrieval_model.encode(corpus) + query = self.retrieval_model.encode(query) + doc_scores = cos_sim(doc_embeds, query) + idx = [(ii, 0) for ii in np.argsort(-doc_scores)] + return idx + + def get_distance_voyageai(corpus, query): + import voyageai + from sentence_transformers import util + + voyageai.api_key = self.open_api_config.get("voyageai_api_key", "") + + def get_embed(text): + return voyageai.get_embedding(text, model="voyage-01") + + doc_embeds = [get_embed(i) for i in corpus] + query = get_embed(query) + doc_scores = -util.dot_score(doc_embeds, query).cpu().numpy().reshape(-1) + idx = [(ii, 0) for ii in np.argsort(doc_scores)] + return idx + + def get_distance_cohere(corpus, query): + import cohere + + api_key = self.open_api_config.get("cohere_api_key", "") + co = cohere.Client(api_key) + results = co.rerank( + model="rerank-english-v2.0", query=query, documents=corpus, top_n=20 + ) + c_map = {jj: ii for ii, jj in enumerate(corpus)} + doc_rank = [c_map[ii.document["text"]] for ii in results] + idx = [(ii, 0) for ii in doc_rank] + return idx + + def get_distance_longllmlingua(corpus, query): + context_ppl = [ + self.get_condition_ppl( + d, + query + + " We can get the answer to this question in the given documents.", + condition_in_question, + ) + - dl * 2 / 250 * 0 + for d, dl in zip(corpus, context_tokens_length) + ] + sort_direct = -1 if 
condition_in_question == "none" else 1 + ys = sorted(enumerate(context_ppl), key=lambda x: sort_direct * x[1]) + return ys + + method = None + if rank_method == "bm25": + method = get_distance_bm25 + elif rank_method == "gzip": + method = get_distance_gzip + elif rank_method == "sentbert": + method = get_distance_sentbert + elif rank_method == "openai": + method = get_distance_openai + elif rank_method in ["longllmlingua", "llmlingua"]: + method = get_distance_longllmlingua + elif rank_method == "bge": + method = get_distance_sentbert_bge + elif rank_method == "bge_reranker": + method = get_distance_bge_ranker + elif rank_method == "bge_llmembedder": + method = get_distance_bge_llmembedder + elif rank_method == "jinza": + method = get_distance_jinza + elif rank_method == "voyageai": + method = get_distance_voyageai + elif rank_method == "cohere": + method = get_distance_cohere + return method(context, question) + + def segment_structured_context( + self, + context: List[str], + global_rate: float, + ): + new_context, context_segs, context_segs_rate, context_segs_compress = ( + [], + [], + [], + [], + ) + for text in context: + if not text.startswith(""): + text = text + "" + + # Regular expression to match content, allowing rate and compress in any order + pattern = r"([^<]+)" + matches = re.findall(pattern, text) + + # Extracting segment contents + segments = [match[4] for match in matches] + + # Extracting rate and compress, considering their possible positions + segs_rate = [ + float(match[0]) if match[0] else (float(match[2]) if match[2] else None) + for match in matches + ] + segs_compress = [ + ( + match[1] == "True" + if match[1] + else (match[3] == "True" if match[3] else None) + ) + for match in matches + ] + + segs_compress = [ + compress if compress is not None else True for compress in segs_compress + ] + segs_rate = [ + rate if rate else (global_rate if compress else 1.0) + for rate, compress in zip(segs_rate, segs_compress) + ] + assert ( + 
len(segments) == len(segs_rate) == len(segs_compress) + ), "The number of segments, rates, and compress flags should be the same." + assert all( + seg_rate <= 1.0 for seg_rate in segs_rate + ), "Error: 'rate' must not exceed 1.0. The value of 'rate' indicates compression rate and must be within the range [0, 1]." + + new_context.append("".join(segments)) + context_segs.append(segments) + context_segs_rate.append(segs_rate) + context_segs_compress.append(segs_compress) + + return new_context, context_segs, context_segs_rate, context_segs_compress + + def concate_segment_info( + self, + segment_info: List[List[tuple]], + ): + new_segment_info = [] + for i, (seg_len, seg_ratio, seg_compress) in enumerate(segment_info): + if ( + new_segment_info + and new_segment_info[-1][1] == seg_ratio + and new_segment_info[-1][2] == seg_compress + ): + new_segment_info[-1] = ( + new_segment_info[-1][0] + seg_len, + seg_ratio, + seg_compress, + ) + else: + new_segment_info.append((seg_len, seg_ratio, seg_compress)) + return new_segment_info + + def __get_context_prob( + self, + context_list: list, + token_to_word="mean", + force_tokens: List[str] = [], + token_map: dict = {}, + force_reserve_digit: bool = False, + ): + chunk_list = [] + for chunks in context_list: + for c in chunks: + chunk_list.append(c) + + dataset = TokenClfDataset( + chunk_list, tokenizer=self.tokenizer, max_len=self.max_seq_len + ) + dataloader = DataLoader( + dataset, batch_size=self.max_batch_size, shuffle=False, drop_last=False + ) + + chunk_probs = [] + chunk_words = [] + + with torch.no_grad(): + batch_results = [None] * len(dataloader) + with concurrent.futures.ProcessPoolExecutor(max_workers=self.number_of_cores) as executor: + futures = {executor.submit(self.process_batch, i, batch, token_to_word, force_tokens, token_map, force_reserve_digit): i for i, batch in enumerate(dataloader)} + for future in concurrent.futures.as_completed(futures): + index, results = future.result() + batch_results[index] = 
results + + for batch_result in batch_results: + for batch_result_chunk in batch_result: + chunk_words.append(batch_result_chunk[0]) + chunk_probs.append(batch_result_chunk[1]) + + prev_idx = 0 + context_probs = [] + context_words = [] + for chunk_list in context_list: + n_chunk = len(chunk_list) + context_probs.append([]) + context_words.append([]) + for i in range(n_chunk): + context_probs[-1].extend(chunk_probs[prev_idx + i]) + context_words[-1].extend(chunk_words[prev_idx + i]) + prev_idx = prev_idx + n_chunk + context_probs = [sum(probs) / len(probs) for probs in context_probs] + return context_probs, context_words + + def process_batch(self, index, batch, token_to_word, force_tokens, token_map, force_reserve_digit): + ids = batch["ids"].to(self.device, dtype=torch.long) + mask = batch["mask"].to(self.device, dtype=torch.long) == 1 + + outputs = self.model(input_ids=ids, attention_mask=mask) + loss, logits = outputs.loss, outputs.logits + probs = F.softmax(logits, dim=-1) + + batch_results = [] + for j in range(ids.shape[0]): + _probs = probs[j, :, 1] + _ids = ids[j] + _mask = mask[j] + + active_probs = torch.masked_select(_probs, _mask) + active_ids = torch.masked_select(_ids, _mask) + + tokens = self.tokenizer.convert_ids_to_tokens( + active_ids.squeeze().tolist() + ) + token_probs = [prob for prob in active_probs.detach().cpu().numpy()] + + ( + words, + valid_token_probs, + valid_token_probs_no_force, + ) = self.__merge_token_to_word( + tokens, + token_probs, + force_tokens=force_tokens, + token_map=token_map, + force_reserve_digit=force_reserve_digit, + ) + word_probs_no_force = self.__token_prob_to_word_prob( + valid_token_probs_no_force, convert_mode=token_to_word + ) + + if "xlm-roberta-large" in self.model_name: + for i in range(len(words)): + words[i] = words[i].lstrip("▁") + batch_results.append((words, word_probs_no_force)) + + return index, batch_results + + def __chunk_context(self, origin_text, chunk_end_tokens): + # leave 2 token for CLS and SEP 
+ max_len = self.max_seq_len - 2 + origin_list = [] + origin_tokens = self.tokenizer.tokenize(origin_text) + n = len(origin_tokens) + st = 0 + while st < n: + if st + max_len > n - 1: + chunk = self.tokenizer.convert_tokens_to_string(origin_tokens[st:n]) + origin_list.append(chunk) + break + else: + ed = st + max_len + for j in range(0, ed - st): + if origin_tokens[ed - j] in chunk_end_tokens: + ed = ed - j + break + chunk = self.tokenizer.convert_tokens_to_string( + origin_tokens[st : ed + 1] + ) + origin_list.append(chunk) + st = ed + 1 + return origin_list + + def __merge_token_to_word( + self, tokens, token_probs, force_tokens, token_map, force_reserve_digit + ): + words = [] + word_probs = [] + word_probs_no_force = [] + + for token, prob in zip(tokens, token_probs): + if token in self.special_tokens: + continue + # add a new word + elif is_begin_of_new_word(token, self.model_name, force_tokens, token_map): + pure_token = get_pure_token(token, self.model_name) + prob_no_force = prob + if pure_token in force_tokens or pure_token in set(token_map.values()): + prob = 1.0 + token = replace_added_token(token, token_map) + words.append(token) + word_probs.append( + [ + 1.0 + if force_reserve_digit and bool(re.search(r"\d", token)) + else prob + ] + ) + word_probs_no_force.append([prob_no_force]) + # concatenate with previous token + else: + pure_token = get_pure_token(token, self.model_name) + words[-1] += pure_token + word_probs[-1].append( + 1.0 + if force_reserve_digit and bool(re.search(r"\d", token)) + else prob + ) + word_probs_no_force[-1].append(prob_no_force) + + return words, word_probs, word_probs_no_force + + def __token_prob_to_word_prob(self, token_probs, convert_mode="mean"): + if convert_mode == "mean": + word_probs = [sum(p) / len(p) for p in token_probs] + elif convert_mode == "first": + word_probs = [p[0] for p in token_probs] + else: + raise NotImplementedError() + + return word_probs + + def __compress( + self, + context_list: list, + 
reduce_rate: float = 0.5, + token_to_word: str = "mean", + force_tokens: List[str] = [], + token_map: dict = {}, + force_reserve_digit: bool = False, + drop_consecutive: bool = False, + ): + def split_string_to_words(input_string): + pattern = r'\b\w+\b|[<>=/!@#$%^&*()?":{}|\\`~;_+-]' + result = re.findall(pattern, input_string) + return result + + if reduce_rate <= 0: + words, word_labels = [], [] + for i in range(len(context_list)): + chunk_list = context_list[i] + chunk_words = [] + chunk_word_labels = [] + for j in range(len(chunk_list)): + # replace to original token + for ori_token, new_token in token_map.items(): + chunk_list[j] = chunk_list[j].replace(new_token, ori_token) + ws = split_string_to_words(chunk_list[j]) + chunk_words.extend(ws) + chunk_word_labels.extend([1 for _ in range(len(ws))]) + context_list[i] = "".join(chunk_list) + words.append(chunk_words) + word_labels.append(chunk_word_labels) + return context_list, words, word_labels + + chunk_list = [] + for chunks in context_list: + for c in chunks: + chunk_list.append(c) + + dataset = TokenClfDataset( + chunk_list, tokenizer=self.tokenizer, max_len=self.max_seq_len + ) + dataloader = DataLoader( + dataset, batch_size=self.max_batch_size, shuffle=False, drop_last=False + ) + + compressed_chunk_list = [] + word_list = [] + word_label_list = [] + with torch.no_grad(): + for batch in dataloader: + ids = batch["ids"].to(self.device, dtype=torch.long) + mask = batch["mask"].to(self.device, dtype=torch.long) == 1 + + outputs = self.model(input_ids=ids, attention_mask=mask) + loss, logits = outputs.loss, outputs.logits + probs = F.softmax(logits, dim=-1) + + for j in range(ids.shape[0]): + chunk_probs = probs[j, :, 1] + chunk_ids = ids[j] + chunk_mask = mask[j] + + active_probs = torch.masked_select(chunk_probs, chunk_mask) + active_ids = torch.masked_select(chunk_ids, chunk_mask) + + tokens = self.tokenizer.convert_ids_to_tokens( + active_ids.squeeze().tolist() + ) + token_probs = [prob for prob in 
active_probs.cpu().numpy()] + + words, valid_token_probs, _ = self.__merge_token_to_word( + tokens=tokens, + token_probs=token_probs, + force_tokens=force_tokens, + token_map=token_map, + force_reserve_digit=force_reserve_digit, + ) + word_probs = self.__token_prob_to_word_prob( + valid_token_probs, convert_mode=token_to_word + ) + + if drop_consecutive: + threshold = np.percentile(word_probs, int(100 * reduce_rate)) + is_token_between = False + prev = None + for i, (word, word_prob) in enumerate(zip(words, word_probs)): + if word in force_tokens: + if is_token_between: + is_token_between = False + elif not is_token_between and word == prev: + word_probs[i] = 0.0 + prev = word + else: + is_token_between |= word_prob > threshold + + new_token_probs = [] + for word, word_prob in zip(words, word_probs): + num_token = len(self.oai_tokenizer.encode(word)) + new_token_probs.extend([word_prob for _ in range(num_token)]) + threshold = np.percentile( + new_token_probs, int(100 * reduce_rate + 1) + ) + + keep_words = [] + word_labels = [] + assert len(words) == len(word_probs) + for word, word_prob in zip(words, word_probs): + if word_prob > threshold or ( + threshold == 1.0 and word_prob == threshold + ): + if ( + drop_consecutive + and word in force_tokens + and len(keep_words) > 0 + and keep_words[-1] == word + ): + word_labels.append(0) + else: + keep_words.append(word) + word_labels.append(1) + else: + word_labels.append(0) + keep_str = self.tokenizer.convert_tokens_to_string(keep_words) + if "xlm-roberta-large" in self.model_name: + for i in range(len(words)): + words[i] = words[i].lstrip("▁") + + compressed_chunk_list.append(keep_str) + word_list.append(words[:]) + word_label_list.append(word_labels[:]) + + compressed_context_list = [] + original_word_list = [] + original_word_label_list = [] + prev_idx = 0 + for chunk_list in context_list: + n_chunk = len(chunk_list) + compressed_context_list.append( + "".join(compressed_chunk_list[prev_idx : prev_idx + n_chunk]) + ) 
+ original_word_list.append([]) + original_word_label_list.append([]) + for i in range(n_chunk): + original_word_list[-1].extend(word_list[prev_idx + i]) + original_word_label_list[-1].extend(word_label_list[prev_idx + i]) + prev_idx = prev_idx + n_chunk + + return compressed_context_list, original_word_list, original_word_label_list diff --git a/script/monitor_cores.ps1 b/script/monitor_cores.ps1 new file mode 100644 index 0000000..97912d2 --- /dev/null +++ b/script/monitor_cores.ps1 @@ -0,0 +1,27 @@ +while ($true) { + + # Get the load percentage for each CPU core + $cpuLoad = Get-Counter '\Processor(*)\% Processor Time' + + # Display the load percentage for each core + $cpuLoad.CounterSamples | ForEach-Object { + $core = $_.InstanceName + $load = $_.CookedValue + Write-Output "Core ${core}: ${load}% load" + # Get the process information + $processes = Get-Process | Sort-Object CPU -Descending | Select-Object -First 1 + + # Display the top process information + $processes | ForEach-Object { + $id = $_.Id + $name = $_.Name + $cpu = $_.CPU + Write-Output "Top Process ID: $id, Name: $name, CPU Time: $cpu" + } + + } + + + Start-Sleep -Milliseconds 500 + Clear-Host +} \ No newline at end of file diff --git a/script/process_on_cores.ps1 b/script/process_on_cores.ps1 new file mode 100644 index 0000000..c118311 --- /dev/null +++ b/script/process_on_cores.ps1 @@ -0,0 +1,20 @@ +# Get the load percentage for each CPU core +$cpuLoad = Get-Counter '\Processor(*)\% Processor Time' + +# Display the load percentage for each core +$cpuLoad.CounterSamples | ForEach-Object { + $core = $_.InstanceName + $load = $_.CookedValue + Write-Output "Core ${core}: ${load}% load" +} + +# Get the process information +$processes = Get-Process | Select-Object Id, Name, CPU + +# Display the process information +$processes | ForEach-Object { + $id = $_.Id + $name = $_.Name + $cpu = $_.CPU + Write-Output "Process ID: $id, Name: $name, CPU Time: $cpu" +} \ No newline at end of file diff --git 
a/tests/test_llmlingua2_multicore.py b/tests/test_llmlingua2_multicore.py new file mode 100644 index 0000000..de3a96e --- /dev/null +++ b/tests/test_llmlingua2_multicore.py @@ -0,0 +1,192 @@ +# Copyright (c) 2023 Microsoft +# Licensed under The MIT License [see LICENSE for details] + +import unittest + +from llmlingua import PromptCompressorV2 + + +class LLMLingua2Tester(unittest.TestCase): + """ + End2end Test for LLMLingua-2 + """ + + PROMPT = "John: So, um, I've been thinking about the project, you know, and I believe we need to, uh, make some changes. I mean, we want the project to succeed, right? So, like, I think we should consider maybe revising the timeline.\n\nSarah: I totally agree, John. I mean, we have to be realistic, you know. The timeline is, like, too tight. You know what I mean? We should definitely extend it." + COMPRESSED_SINGLE_CONTEXT_PROMPT = "John: thinking project believe need make changes. want project succeed? consider revising timeline.\n\n Sarah agree. be realistic. timeline too tight.? extend." + COMPRESSED_MULTIPLE_CONTEXT_PROMPT = "John: So, I've been thinking about project believe we need to make changes. we want project to succeed, right? think we should consider maybe revising timeline." + + GSM8K_PROMPT = "Question: Angelo and Melanie want to plan how many hours over the next week they should study together for their test next week. They have 2 chapters of their textbook to study and 4 worksheets to memorize. They figure out that they should dedicate 3 hours to each chapter of their textbook and 1.5 hours for each worksheet. 
If they plan to study no more than 4 hours each day, how many days should they plan to study total over the next week if they take a 10-minute break every hour, include 3 10-minute snack breaks each day, and 30 minutes for lunch each day?\nLet's think step by step\nAngelo and Melanie think they should dedicate 3 hours to each of the 2 chapters, 3 hours x 2 chapters = 6 hours total.\nFor the worksheets they plan to dedicate 1.5 hours for each worksheet, 1.5 hours x 4 worksheets = 6 hours total.\nAngelo and Melanie need to start with planning 12 hours to study, at 4 hours a day, 12 / 4 = 3 days.\nHowever, they need to include time for breaks and lunch. Every hour they want to include a 10-minute break, so 12 total hours x 10 minutes = 120 extra minutes for breaks.\nThey also want to include 3 10-minute snack breaks, 3 x 10 minutes = 30 minutes.\nAnd they want to include 30 minutes for lunch each day, so 120 minutes for breaks + 30 minutes for snack breaks + 30 minutes for lunch = 180 minutes, or 180 / 60 minutes per hour = 3 extra hours.\nSo Angelo and Melanie want to plan 12 hours to study + 3 hours of breaks = 15 hours total.\nThey want to study no more than 4 hours each day, 15 hours / 4 hours each day = 3.75\nThey will need to plan to study 4 days to allow for all the time they need.\nThe answer is 4\n\nQuestion: You can buy 4 apples or 1 watermelon for the same price. You bought 36 fruits evenly split between oranges, apples and watermelons, and the price of 1 orange is $0.50. 
How much does 1 apple cost if your total bill was $66?\nLet's think step by step\nIf 36 fruits were evenly split between 3 types of fruits, then I bought 36/3 = 12 units of each fruit\nIf 1 orange costs $0.50 then 12 oranges will cost $0.50 * 12 = $6\nIf my total bill was $66 and I spent $6 on oranges then I spent $66 - $6 = $60 on the other 2 fruit types.\nAssuming the price of watermelon is W, and knowing that you can buy 4 apples for the same price and that the price of one apple is A, then 1W=4A\nIf we know we bought 12 watermelons and 12 apples for $60, then we know that $60 = 12W + 12A\nKnowing that 1W=4A, then we can convert the above to $60 = 12(4A) + 12A\n$60 = 48A + 12A\n$60 = 60A\nThen we know the price of one apple (A) is $60/60= $1\nThe answer is 1" + GSM8K_150TOKENS_COMPRESSED_SINGLE_CONTEXT_PROMPT = "Question: Angelo Melanie plan test 2 chapters 4 worksheets 3 hours each chapter 1.5 hours each worksheet study 4 hours day how days 10-minute break every 3 10-minute snack breaks 30 minutes lunch\n\n dedicate 3 hours 2 chapters 3 2 = 6 hours total\n worksheets 1.5 hours each worksheet 1.5 4 = 6 hours total\n 12 hours study 4 hours a day 12 / 4 = 3 days\n breaks lunch 10-minute break 12 hours 10 = 120 minutes\n 3 10-minute snack breaks 3 10 = 30 minutes\n 30 minutes lunch 120 + 30 + 30 = 180 minutes 180 / 60 = 3 extra hours\n 12 hours study + 3 hours breaks = 15 hours total\n 4 hours each day 15 / 4 = 3.75\n 4 days\nThe answer is 4" + GSM8K_150TOKENS_COMPRESSED_MULTIPLE_CONTEXT_PROMPT = "4 apples 1 watermelon 36 fruits oranges watermelons 1 orange $0.50 1 apple bill $66\n\n 36 fruits 3 36/3 = 12 units\n 1 orange $0.50 12 oranges $0.50 * 12 = $6\n total bill $66 spent $6 oranges $66 - $6 = $60 other 2\n watermelon W 4 apples one apple A 1W=4A\n 12 watermelons 12 apples $60 $60 = 12W + 12A\n $60 = 12(4A + 12A\n = 48A + 12A\n = 60A\n one apple $60/60= $1\nThe answer is 1" + + MEETINGBANK_PROMPT = "Item 28 Report from Development. 
Services Recommendation to declare ordinance amending the Land Use District Map from institutional to IRP 13 read and adopted as read District eight. Councilman Austin. So moved. Wonderful. And I want to ask Councilman Andrews so any member of the public that wishes to address item 28 saying none, members, cast your vote. Oh, I'm sorry, sir. I did not see you. Can we? I know this sounds picky and stupid. But this is an illogical motion because you haven't yet created ARP 13. By the way, unlike some other speakers, I will furnish you my name. I'm Joe Weinstein. I did speak last week. I do not like to come down here again to talk on the same subjects. But. There is a minor little matter. As to whether a. The proposed zoning is a good idea. And B, whether. The project, which it is intended. To permit. In fact. Meets the specifications of the zoning. I have not check that out, but someone else did raise that question and there may be some question as to whether all of the conditions of that zoning have, in fact, been met by the details of this project. This particular zoning, perhaps in the abstract, need not be a bad idea, but the way you see it realized in the project. Is not a very good idea. You could have the same density and more without destroying the usability, the usable green space that this design does. Because really, although it looks impressive from a top down view, it looks like you see plenty of green space between the buildings, that that space is pretty well wasted and useless because the buildings are high enough to pretty well shade and dominate the green space that's in that project. So I'm not saying that the density that you're going for is a bad thing. But doing it in this way doesn't work, and any zoning that just permits this without further control is not a good idea. Thank you. Okay. Thank you, sir. Members, please cast your vote. Councilman Andrew's motion carries. Next time, please. 
Report from Development Services recommendation to declare ordinance amending the Land Use District Map from institutional to park red and adopted as Red District eight." + MEETINGBANK_150TOKENS_COMPRESSED_SINGLE_CONTEXT_PROMPT = "Item 28 Report Development. Services Recommendation declare ordinance amending Land Use District Map institutional IRP 13 adopted District eight. Councilman Austin. ask Councilman Andrews public address item 28 cast vote. see?. illogical motion created ARP 13. Joe Weinstein. last week. same subjects. minor matter. proposed zoning good idea. project intended. permit Meets specifications zoning. question conditions zoning met details project. zoning not bad project. not good. same density more without destroying usability green space. green space between buildings wasted useless buildings high shade dominate green space. not density bad. doesn't work zoning permits without control not good idea. Thank you. cast vote. Councilman Andrew's motion carries. Next time.Development Services ordinance Land District Map park District." 
+ + LONGBENCH_PROMPT_LIST = [ + "新闻内容:\n(服务·健康)专家提醒:寒冷气候易诱发心脑血管疾病\n新华社海口2月9日专电(张苏民、李建国)海口市疾病预防控制中心专家介绍,持续的寒冷气候是心脑血管疾病的杀手,尤其患有高血压或高血脂疾病的老人更应做好防范,防止脑中风发生。\n  在寒冷的气候环境当中要注意保暖,增添衣服,饮食以清淡为主,多食用蔬菜,忌暴食荤类。尤其过年时,切忌熬夜,平时要加强身体锻炼,劳逸结合。除此之外,冬季还是呼吸道传染病暴发和流行的季节,应该注意预防流感、麻疹、流脑、水痘等呼吸道传染病的发生。\n  专家还指出,由于寒冷气候影响,人们习惯门窗紧闭,空气不对流,一旦有传染源传入,极容易造成疾病的暴发。春节期间,一些商场或公共娱乐场所人群密集,有关单位应加强通风。(完)\n类别:医药、卫生", + "\n\n新闻内容:\n李明波在恩施调研时强调 大力推进基层党内民主建设\n本报讯 (记者吴畏、通讯员曾言、周恩祖)11日至13日,省委常委、秘书长李明波到恩施州调研基层党建工作时强调,要以增强党的创新活力、巩固党的团结统一为目标,以改革创新精神大力抓好基层党内民主建设。\n  李明波视察了非公有制企业党建、党代表常任制、基层党务公开、以党内和谐推进社区和谐等党建工作现场,与基层党务工作者座谈。李明波强调,在新形势下,要把握好民主进程与经济社会发展、尊重党员主体地位与提高党员民主素质、履行党员民主权利与保证党的统一意志、发挥党员民主监督作用与加强党纪教育管理等的关系,进一步深入探索,在丰富形式、拓宽渠道、完善机制等方面取得更大成绩。\n类别:政治", + "\n\n新闻内容:\n第38届世界贸易中心年会及经贸洽谈会\n第38届世界贸易中心年会将于2007年10月21至24日在美国路易斯\n安那州首府新奥尔良召开。该会由美国纽约世界贸易中心总部和美国贸\n易服务管理总局、新奥尔良世贸中心共同举办,届时将有来自60多个国\n家和地区的经贸代表团约600余人与会。天津贸促会与天津世贸中心协\n会将共同组织天津经贸代表团赴美国参加“世贸中心2007年年会及经贸\n洽谈会”。\n  联系人:王岭 刘鹏\n  电话:022-2520231725202123\n  传真:022-25201975\n  地址:天津经济技术开发区宏达街19号A区2楼\n类别:商业、外贸、海关", + "\n\n新闻内容:\n(全运会)第十一届全运会开闭幕时间确定\n新华社济南6月5日体育专电(记者赵仁伟)第十一届全国运动会组委会5日在济南宣布,十一运会将于今年10月16日在济南奥体中心开幕,闭幕时间为10月28日。\n  十一运会组委会常务副秘书长、山东省体育局局长张洪涛介绍,十一运会的比赛项目共设33个大项、43个分项、362个小项,其中包括28个夏季奥运会项目、4个冬季项目以及武术项目。与2005年十运会相比,大项增加了1个,即自由式滑雪;小项增加了5个,分别是自由式滑雪男子个人、女子个人,女子水球项目,足球男子16岁以下组和女子18岁以下组。\n  在十一运会全部362个小项中,马拉松男、女2个小项的比赛在北京举办,速度滑冰4个小项、自由式滑雪2个小项的比赛分别在沈阳和长春举办,其余354个小项的比赛在山东省17个赛区举行。其中,济南赛区共举办小项212个,青岛48个,日照40个,滨州28个,枣庄8个,菏泽7个,威海5个,烟台、德州各3个;淄博、东营、潍坊、济宁、泰安、莱芜、临沂、聊城8个赛区只举办小组赛和第四名以后的比赛,不产生金牌。\n  张洪涛介绍,十一运会冰雪项目已于1月至4月举行,占全部小项的4.4%。因部分夏季项目的世界锦标赛或国际重要赛事的时间与十一运会比赛时间冲突或相距较近,国家体育总局确定把这些项目的比赛安排在开幕式前举行,共有15个项目、80个小项,占全部小项的22.1%。(完)\n类别:体育", + "\n\n新闻内容:\n(教育)河北整顿公办初中、小学招收择校生\n(教育)河北整顿公办初中、小学招收择校生\n  新华社石家庄3月12日电(冯月静)记者从河北省教育纪检监察审计工作会议上了解到,从今年起,河北省不再审批新的改制学校。对已审批的改制学校进行一次全面整顿和规范,重点解决公办初中、小学以改制为名或以民办为名举办“校中校”“校中班”高收费问题。\n  据了解,河北省规定达不到要求的,要限期整改;年内仍达不到标准要求的,一律停止招生。公办学校一律不准搞“一校两制”,更不准以改制为名高收费。\n  
同时,今年秋季新学年开始,设区市市区的公办省级示范性普通高中(含在县镇办学的市直属省级示范性高中)择校生比例最高限额由原定的40%一律下调为30%。严禁学校擅自扩大择校生招生比例、降低录取分数线、提高收费标准或在限定金额外加收任何其他费用。(完)\n类别:教育", + "\n\n新闻内容:\n(服务·关注“过劳死”) “过劳死”青睐什么人?\n人?\n 新华社郑州3月16日专电(记者李丽静) 有关专家\n研究表明:受教育程度高、中青年、女性是“过劳死”这\n一疾病的危险人群。这是因为这些人事业上强力拼搏,生\n活负荷过重,自身经常处于紧张状态之中,过度疲劳难以\n避免。\n 随着社会竞争的日趋激烈,该病也越来越多地困扰着\n我国的都市人。据一项在上海、无锡、深圳等地对\n1197位中年人健康状况调查显示:其中66%的人有\n失眠、多梦、不易入睡等现象;62%的人经常腰酸背痛;\n58%的人一干活就累;57%的人爬楼时感到吃力或记\n忆力明显减退;48%的人皮肤干燥、瘙痒、面色晦暗、\n脾气暴躁、焦虑。据国家有关部门的一项调查结果表明,\n慢性疲劳综合征在城市新兴行业人群中的发病率为10%\n至20%,在科技、新闻、广告、公务人员、演艺、出租\n车司机等行业中发病率则更高。\n 有关专家通过统计认为,“过劳死”特别“青睐”三\n种人:\n 第一种是有钱但不知保养的人。这部分人“富裕”的\n背后,往往有一条铺满辛酸的路。由于对贫穷的恐惧,使\n他们对财富永远不满足。为了追逐更多的财富,即使赴汤\n蹈火也在所不辞,而对他们最初惟一的资本———身体,\n则很不在乎。 \n 第二种是有事业心,特别是称得上“工作狂”的人。\n主要以从事科研、教学、新型高科技,如网络等职业者居\n多。\n 第三种是有家族遗传背景者。如果父母亲、爷爷奶奶\n等直系亲属中有心绞痛、心肌梗死、脑中风的患者,就要\n特别小心了,千万别让自己累着,否则很有可能在年轻时\n就诱发疾病。\n 而在对“过劳死”人群深入研究中发现,猝死直接死\n因的前5位是冠状动脉疾病、主动脉瘤、心瓣膜病、心肌\n病和脑出血。一些无症状冠心病,特别是无症状心肌梗死\n是首要的危险因素,一般的体检和心电图不易发现隐性冠\n心病。一旦发作,措手不及。此外,高血压也是一个潜在\n的危险因素。在遇到某些诱因时,便会引发高血压、脑中\n风等。(完)\n类别:医药、卫生", + "\n\n新闻内容:\n五项措施应对技术性贸易壁垒\n调查结果显示,2006年我国有31\n .4%的出口企业受到国外技术性贸易措施不同程度的影响,比2005年增长6.3个百分点;全年出口贸易直接损失359.20亿美元,占同期出口额的3.71%,企业新增成本191.55亿美元。\n 会议通报的情况显示,对中国企业出口影响较大的技术性贸易措施类型集中在认证要求、技术标准要求、有毒有害物质限量要求、包装及材料的要求和环保要求(包括节能及产品回收),食品中农兽药残留要求、重金属等有害物质限量要求、细菌等卫生指标要求、食品标签要求和食品接触材料的要求等方面。受国外技术性贸易措施影响较大的行业排在前五位的是机电、农食产品、化矿、塑料皮革和纺织鞋帽。\n 会议提出了加强应对的5点意见。一是要强化进出口质量监管措施,在“严”字上下功夫,重点从源头上抓好农兽药残留、有毒化学物质残留、微生物等问题,同时要完善监管机制,提高检测能力,要检得出,检得快,检得准。二是要加快实施技术标准战略,在“高”字上下功夫,不断提高采标率,加快标准的制修订步伐。三是要加大信息共享力度,在“准”字上下功夫,各部门要密切配合,建立沟通机制,做到信息资源的充分利用。四是要果断迅速应对突发事件,在“快”字上下功夫。五是要加强技术性贸易措施的积极应对,在“实”字上下功夫,协调配合、相互支持。\n类别:商业、外贸、海关", + "\n\n新闻内容:\n(新华时评·奥运会倒计时一百天)让我们共同守护奥林匹克精神\n新华社北京4月30日电 题:让我们共同守护奥林匹克精神\n 新华社记者张旭\n 在北京奥运会倒计时一百天之际,奥运圣火结束在其他国家的传递进入中国香港。在这两个重要时间节点重合之时,让我们以奥林匹克精神为依归,回味今年以来围绕北京奥运的风风雨雨,并以百倍的努力在接下来的日子里守护这一美好理想。\n 
奥林匹克运动会是古希腊人的体育盛会,许多比赛项目源于古希腊文化。顾拜旦说:“古希腊人之所以组织竞赛活动,不仅仅只是为了锻炼体格和显示一种廉价的壮观场面,更是为了教育人”。更高更快更强并不是现代奥林匹克运动的全部价值诉求。现代奥林匹克运动经过了一百年的历史变迁,向世界传达的精神与主题始终如一,那就是在共同创造、共同分享、平等友爱的旗帜下,展现人类最美好的情感。奥林匹克是迄今为止人类社会不同种族、地域乃至不同意识形态间最大的交集。\n  2001年7月13日,时任国际奥委会主席的萨马兰奇宣布北京取得2008年奥运会主办权,现代奥林匹克运动从奥林匹亚来到万里长城。7年后的春天,当奥运圣火开始在中国境外传递时,妖魔化中国的舆论攻势和扰乱奥运火炬传递的暴力举动让海内外目光聚焦中国。我们可以肯定地说,这些人在为一己之私对奥林匹克精神进行亵渎。\n 北京奥运圣火一路走来,虽然遇到了噪音和干扰,但更多面对的还是像火一样热情的世界人民和对奥林匹克精神充分尊重的各国人士。他们因为懂得尊重奥林匹克精神,因此也能够享受奥林匹克带来的快乐。\n 2008年4月30日,“北京欢迎你”的歌声回荡在有着近600年历史的紫禁城太庙上空。8月8日,中国人民将第一次以东道主的身份在北京承办举世瞩目的奥林匹克运动会。北京奥运会对中国来说不仅仅是一次体育盛会,更是一次与世界各国开展文化交流的机会。如同当年奥林匹亚为神圣的无战争区域一样,体育竞技的目标是为了全世界的和平与发展。北京奥运会也完全可以成为世界各种文明一个共同的精神家园,通过沟通交流,达到良性互动。\n 奥运会的脚步声离我们越来越近的时候,奥林匹克运动正在为13亿中国人民所熟悉,奥林匹克精神也继续在世界范围内承载起人类追求幸福生活的梦想。中国人民真诚地邀请各国运动员、教练员和朋友们参与2008年北京奥运会。中国人民同时真诚地邀请全世界热爱奥林匹克精神和奥林匹克运动的人们一起,共同守护这一人类美好理想,让它在北京奥运会上开放出更加美丽的花朵。(完)\n类别:体育", + "\n\n新闻内容:\n海口“接管”省 特殊教育 学校\n创建于1989年的海南省特殊教育学校原属省教育厅直属正处级事业单位,为海南省惟一一所全日寄宿的公立特殊教育学校。\n 我市“接管”省特殊教育学校之后,将继续面向全省招收视障、听障两类适龄儿童,优化教育布局调整,促进特殊教育又好又快发展。\n类别:教育", + "\n\n新闻内容:\n9月7日特稿(加1)(美国-大学流感)\n美一大学两千学生恐染流感\n    马震\n  美国华盛顿州立大学大约2000名学生报告甲型H1N1流感症状。校方和医护人员说,这可能是最严重的一起大学生感染新型流感事件。\n  (小标题)人数众多\n  这所大学位于华盛顿州普尔曼,主校区大约有1.9万名学生。据美国《纽约时报》网络版6日报道,华盛顿州注册护士萨莉·雷德曼证实了大约2000名华盛顿州立大学学生报告流感症状一事。\n  雷德曼在华盛顿州立大学学生医疗部门工作。她说,流感暴发情况出现在8月21日,那时学校还没开学。但如今为学生提供医疗服务的部门总是门庭若市。有一天,大约有200名学生就诊或给医疗机构打电话报告喉咙疼、发烧、咳嗽等症状。\n  华盛顿州立大学所在惠特曼县的卫生部门官员说,州实验室上周的检测结果显示,这所大学的疫情确实是因甲型H1N1流感病毒引起。\n  学校现已开学。法新社本月6日报道,学校上周开了关于流感疫情的博客,博客上最新的信息说:“秋季学期的前10天,我们估计已与大约2000名有流感症状的人联络。”\n  校方管理人员说,一些学生可能到社区医院就诊,一些学生可能居家自我治疗。校方无法掌握这些人的人数,已要求当地卫生部门提供相关数据,以便校方更好了解疫情情况。\n  (小标题)无一死亡\n  华盛顿州立大学已根据国家疾病控制和预防中心的防流感指南向学生提供咨询服务,以避免疫情进一步加重。学校还向学生发放了一些防流感的药品和护具等。\n  为防止甲型流感传播,美国的一些大学已建立起隔离机制,但华盛顿州立大学没有类似机制。雷德曼说,在华盛顿州立大学上报的大部分流感疫情案例中,疑似染病的学生被要求待在居所内休息并吃退烧药。如果这些人在不吃退烧药24小时后体温仍旧正常,就可以正常来上课。\n  美国已有593例与甲型流感有关的死亡病例,但华盛顿州立大学尚未发现一起死亡病例。到目前为止,学生的流感症状相对温和,只有两个不是学生的患者入院治疗。\n  校方在声明中说:“我校患者中的绝大部分症状温和,通常3到5天就能见强。”\n  (小标题)担心传播\n  
华盛顿州立大学大规模流感疫情出现前,美国大学健康协会于8月28日对165所大学实施了流感疫情调查。调查结果显示,全国超过2000名学生报告说有甲型流感症状。\n  惠特曼县公共卫生部门负责人蒂莫西·穆迪认为本月晚些时候开学的其他大学可能会遭遇类似华盛顿州立大学的情况,而地方医疗机构会担心疫情可能向校外蔓延。\n  国家疾病控制和预防中心主任托马斯·弗里登6日接受美国有线电视新闻网采访时说,学校医务人员本学年报告的流感数字不同寻常。疾病控制和预防中心此前未遭遇过8月和9月数字增长这么快的情况。\n  国家疾病控制和预防中心现在特别重视流感疫情。弗里登说:“如果它的致命性增加,可能会造成特别严重的情形,可能会给上学和上班的人带来特别多麻烦。”(完)(新华社供本报特稿)\n  关键词:华盛顿州立大学(Washington State University)\n类别:医药、卫生", + "\n\n新闻内容:\n在国防教育的落实上下功夫\n在国防教育的落实上下功夫 赵荣\n 加强全民国防教育是增强国防观念和忧患意识、促进国防和军队建设的基础性工程。鉴此,在今后的实践中,要坚持以科学发展观为指导,科学谋划、创新形式、狠抓落实,使全民国防教育深入人心,扎实有效地开展下去。\n 抓好责任落实。《国防教育法》第三章第十八条规定:各地区各部门的领导人员应当依法履行组织、领导本地区、本部门开展国防教育的职责。因而,要使全民国防教育扎实有效地开展下去,各级领导和职能部门要依法负起抓好全民国防教育的责任,对本地区、本单位、本行业的国防教育,从计划安排到组织实施都要认真负责地抓好落实。\n 抓好人员落实。国防教育是面向全民的教育,它的开展必须面向全社会,而不能只针对个别地区、个别单位和个别人员。因而,各地要对一切有接受能力的公民实施国防教育,以提高全民的政治、思想和道德素质,使全体公民积极争当热爱祖国、热爱国防的好公民。\n 抓好效果落实。国防教育的开展,效果的落实极为重要。为此,教育中应着重抓好国防理论、国防精神、国防知识、国防历史、国防技能、国防法制的教育,以强化爱国精神、增长国防知识、强化国防观念。通过教育,使全体公民进一步了解我国安全面临的新形势、世界军事变革的新发展、我国国防和军队建设面临的新挑战、以及在对国防建设中应承担的义务和责任等,不断提高他们支持和关心国防建设的积极性和自觉性。\n (来源:中国国防报 发布时间: 2007-11-22 08:19)\n类别:军事", + "\n\n新闻内容:\n中国又一学者当选瑞典皇家工程科学院外籍院士\n新华社北京8月20日电 北京航空航天大学中国循环经济研究中心主任、北京循环经济促进会会长吴季松教授,日前被瑞典皇家工程科学院全体大会选为该院外籍院士。\n  作为改革开放后首批出国访问学者之一,吴季松曾在欧洲原子能联营法国原子能委员会研究受控热核聚变,还曾任中国常驻联合国教科文组织代表团参赞衔副代表、联合国教科文组织科技部门高技术与环境顾问。1985至1986年,主持联合国教科文组织“多学科综合研究应用于经济发展”专题研究,并由联合国教科文组织发表项目研究报告创意知识经济。\n 他在中国科技和产业领域作出了多项贡献,主要包括:创意“知识经济”并将科技园区的实践介绍到中国、提出修复生态系统理论并主持制定水资源规划、创立新循环经济学等。\n  瑞典皇家工程科学院创建于1919年,是世界上第一个工程院,现有机械工程、电机工程等学部。该院参与相关诺贝尔奖项的提名和评审工作。目前共有院士(含外籍院士)近1100人,来自中国的外籍院士包括宋健、徐冠华等。(完)\n类别:科学技术", + ] + LONGBENCH_1000TOKENS_COMPRESSED_MULTIPLE_CONTEXT_PROMPT = "\n 新闻内容 第38届世界贸易中心年会及经贸洽谈会\n 安那州首府新奥尔良召开。\n 易服务管理总局、新奥尔良世贸中心共同举办\n 家和地区的经贸代表团约600余人与会。 天津贸促会与天津世贸中心协\n 会将共同组织天津经贸代表团赴美国参加“世贸中心2007年年会及经贸\n 洽谈会”。\n 联系人:王岭 刘鹏\n 电话:022-2520231725202123\n 传真:022-25201975\n 地址:天津经济 技术开发区宏达街19号A区2楼\n类别:商业、外贸、海关\n\n\n 新闻内容\n 海口“接管”省 特殊教育 学校\n 创建于1989年的海南省特殊教育 学校原属省教育 厅直属正处级事业单位,为海南省惟一一所全日寄宿的公立特殊教育 学校。\n教育 学校之后,将继续面向全省招收视障、听障两类适龄儿童教育 布局调整教育。\n类别:教育\n\n\n 中国又一学者当选瑞典皇家工程科学院外籍院士\n 
新华社北京8月20日电 北京航空航天大学中国循环经济 研究中心主任、北京循环经济 促进会会长吴季松教授,日前被瑞典皇家工程科学院全体大会选为该院外籍院士。\n 作为改革开放后首批出国访问学者之一,吴季松曾在欧洲原子能联营法国原子能委员会研究受控热核聚变,还曾任中国常驻联合国教科文组织代表团参赞衔副代表、联合国教科文组织科技部门高技术与环境顾问。 1985至1986年,主持联合国教科文组织“多学科综合研究应用于经济 发展”专题研究经济。\n:创意“知识经济 ”并将科技园区的实践介绍到中国、提出修复生态系统理论并主持制定水资源规划、创立新循环经济 学等。\n 瑞典皇家工程科学院创建于1919年,是世界上第一个工程院,现有机械工程、电机工程等学部。 目前共有院士(含外籍院士)近1100人,来自中国的外籍院士包括宋健、徐冠华等。\n类别:科学技术" + + def __init__(self, *args, **kwargs): + super(LLMLingua2Tester, self).__init__(*args, **kwargs) + self.llmlingua = PromptCompressorV2( + model_name="microsoft/llmlingua-2-xlm-roberta-large-meetingbank", + device_map="cpu", + use_llmlingua2=True, + ) + + def test_general_compress_prompt(self): + compressed_prompt = self.llmlingua.compress_prompt( + self.PROMPT, + rate=0.33, + force_tokens=["\n", ".", "!", "?"], + drop_consecutive=False, + force_reserve_digit=False, + ) + self.assertEqual( + compressed_prompt["compressed_prompt"], + self.COMPRESSED_SINGLE_CONTEXT_PROMPT, + ) + self.assertEqual(compressed_prompt["origin_tokens"], 98) + self.assertEqual(compressed_prompt["compressed_tokens"], 30) + self.assertEqual(compressed_prompt["ratio"], "3.3x") + self.assertEqual(compressed_prompt["rate"], "30.6%") + + compressed_prompt = self.llmlingua.compress_prompt( + self.PROMPT.split("\n\n"), + target_token=40, + use_context_level_filter=True, + force_tokens=["\n", ".", "!", "?"], + drop_consecutive=False, + force_reserve_digit=False, + ) + self.assertEqual( + compressed_prompt["compressed_prompt"], + self.COMPRESSED_MULTIPLE_CONTEXT_PROMPT, + ) + self.assertEqual(compressed_prompt["origin_tokens"], 98) + self.assertEqual(compressed_prompt["compressed_tokens"], 34) + self.assertEqual(compressed_prompt["ratio"], "2.9x") + self.assertEqual(compressed_prompt["rate"], "34.7%") + + # Single Context + compressed_prompt = self.llmlingua.compress_prompt( + self.GSM8K_PROMPT.split("\n\n")[0], + target_token=170, + force_tokens=[ + "+", + "-", + "*", + "×", + "/", + "÷", + "=", + "The answer is", + "\n", + 
"Question:", + ], + drop_consecutive=False, + force_reserve_digit=True, + ) + self.assertEqual( + compressed_prompt["compressed_prompt"], + self.GSM8K_150TOKENS_COMPRESSED_SINGLE_CONTEXT_PROMPT, + ) + self.assertEqual(compressed_prompt["origin_tokens"], 422) + self.assertEqual(compressed_prompt["compressed_tokens"], 203) + self.assertEqual(compressed_prompt["ratio"], "2.1x") + self.assertEqual(compressed_prompt["rate"], "48.1%") + + # Single Context + compressed_prompt = self.llmlingua.compress_prompt( + self.MEETINGBANK_PROMPT.split("\n\n")[0], + target_token=150, + force_tokens=["\n", ".", "?", "!"], + drop_consecutive=True, + force_reserve_digit=False, + ) + self.assertEqual( + compressed_prompt["compressed_prompt"], + self.MEETINGBANK_150TOKENS_COMPRESSED_SINGLE_CONTEXT_PROMPT, + ) + self.assertEqual(compressed_prompt["origin_tokens"], 464) + self.assertEqual(compressed_prompt["compressed_tokens"], 154) + self.assertEqual(compressed_prompt["ratio"], "3.0x") + self.assertEqual(compressed_prompt["rate"], "33.2%") + + # Multiple Context + compressed_prompt = self.llmlingua.compress_prompt( + self.GSM8K_PROMPT.split("\n\n"), + target_token=150, + use_context_level_filter=True, + force_tokens=["+", "-", "*", "×", "/", "÷", "=", "The answer is", "\n"], + drop_consecutive=False, + force_reserve_digit=True, + ) + self.assertEqual( + compressed_prompt["compressed_prompt"], + self.GSM8K_150TOKENS_COMPRESSED_MULTIPLE_CONTEXT_PROMPT, + ) + self.assertEqual(compressed_prompt["origin_tokens"], 726) + self.assertEqual(compressed_prompt["compressed_tokens"], 161) + self.assertEqual(compressed_prompt["ratio"], "4.5x") + self.assertEqual(compressed_prompt["rate"], "22.2%") + + # Multiple Context + compressed_prompt = self.llmlingua.compress_prompt( + self.LONGBENCH_PROMPT_LIST, + target_token=1000, + use_context_level_filter=True, + force_tokens=[ + "\n", + "。", + ":", + "?", + "类别:", + "农业、农村", + "军事", + "文学、艺术", + "体育", + "传媒业", + "电子信息产业", + "文化、休闲娱乐", + "社会、劳动", + "经济", + 
"服务业、旅游业", + "环境、气象", + "能源、水务、水利", + "财政、金融", + "教育", + "科学技术", + "对外关系、国际关系", + "矿业、工业", + "政治", + "交通运输、邮政、物流", + "灾难、事故", + "基本建设、建筑业、房地产", + "医药、卫生", + "法律、司法", + "商业、外贸、海关", + ], + drop_consecutive=True, + force_reserve_digit=False, + ) + self.assertEqual( + compressed_prompt["compressed_prompt"], + self.LONGBENCH_1000TOKENS_COMPRESSED_MULTIPLE_CONTEXT_PROMPT, + ) + self.assertEqual(compressed_prompt["origin_tokens"], 8389) + self.assertEqual(compressed_prompt["compressed_tokens"], 870) + self.assertEqual(compressed_prompt["ratio"], "9.6x") + self.assertEqual(compressed_prompt["rate"], "10.4%")