From 32b94c03f65241367c6b022adf0d5ba767efe1a2 Mon Sep 17 00:00:00 2001 From: Shivam Shandilya Date: Tue, 1 Jul 2025 19:35:16 +0000 Subject: [PATCH 1/2] added support for TACO-RL --- experiments/taco-rl/README.md | 369 +++++++++++++ .../taco-rl/configs/train_reinforce.yaml | 53 ++ experiments/taco-rl/metrics.py | 27 + experiments/taco-rl/train_reinforce.py | 276 ++++++++++ experiments/taco-rl/utils.py | 361 +++++++++++++ llmlingua/__init__.py | 3 +- llmlingua/taco-rl/README.md | 112 ++++ llmlingua/taco-rl/__init__.py | 7 + .../taco-rl/prompt_compressor_reinforce.py | 509 ++++++++++++++++++ 9 files changed, 1716 insertions(+), 1 deletion(-) create mode 100644 experiments/taco-rl/README.md create mode 100644 experiments/taco-rl/configs/train_reinforce.yaml create mode 100644 experiments/taco-rl/metrics.py create mode 100644 experiments/taco-rl/train_reinforce.py create mode 100644 experiments/taco-rl/utils.py create mode 100644 llmlingua/taco-rl/README.md create mode 100644 llmlingua/taco-rl/__init__.py create mode 100644 llmlingua/taco-rl/prompt_compressor_reinforce.py diff --git a/experiments/taco-rl/README.md b/experiments/taco-rl/README.md new file mode 100644 index 0000000..ca171f6 --- /dev/null +++ b/experiments/taco-rl/README.md @@ -0,0 +1,369 @@ +# TACO-RL Experiments: Training and Implementation Guide + +This directory contains the experimental implementation of TACO-RL (Task-Aware Prompt Compression Optimization with Reinforcement Learning) for fine-tuning LLMLingua models on new tasks. + +## Directory Structure + +``` +experiments/taco-rl/ +├── README.md # This file +├── train_reinforce.py # Main training script +├── utils.py # Utility functions and LLM querying +├── metrics.py # Evaluation metrics +├── configs/ # Configuration files +│ └── train_reinforce.yaml # Training configuration +└── logs/ # Training logs (created during training) +``` + +## Quick Start + +### 1. 
Prepare Your Data + +Create a JSON file with your training data in the following format: + +```json +{ + "0": { + "original_prompt": "Your original prompt text here...", + "gpt_output": "Expected GPT output for the prompt..." + }, + "1": { + "original_prompt": "Another prompt...", + "gpt_output": "Another expected output..." + } +} +``` + +**Data Format Requirements:** +- `original_prompt`: The input text that needs to be compressed +- `gpt_output`: The expected output from the teacher model (e.g., GPT-3.5) for the original prompt + +**Task-Specific Examples:** + +**Meeting Summarization:** +```json +{ + "0": { + "original_prompt": "Meeting transcript: John discussed Q1 results showing 15% growth in revenue. Sarah mentioned challenges with supply chain. Mike proposed new marketing strategy...", + "gpt_output": "Summary: Q1 results were positive with 15% revenue growth. Supply chain challenges were identified. New marketing strategy was proposed." + } +} +``` + +**Question Answering:** +```json +{ + "0": { + "original_prompt": "Context: The Eiffel Tower was built in 1889 by Gustave Eiffel for the World's Fair. It stands 324 meters tall and was originally intended to be temporary. Question: When was the Eiffel Tower built?", + "gpt_output": "The Eiffel Tower was built in 1889." + } +} +``` + +**Code Summarization:** +```json +{ + "0": { + "original_prompt": "def quicksort(arr): if len(arr) <= 1: return arr; pivot = arr[len(arr) // 2]; left = [x for x in arr if x < pivot]; middle = [x for x in arr if x == pivot]; right = [x for x in arr if x > pivot]; return quicksort(left) + middle + quicksort(right)", + "gpt_output": "This function implements the quicksort algorithm using a divide-and-conquer approach with pivot selection and recursive sorting." + } +} +``` + +### 2. 
Configure API Settings (Optional) + +If you're using Azure OpenAI services, you can configure your API settings in `utils.py`: + +```python +# In utils.py, update the DEFAULT_API_CONFIG with your settings: +DEFAULT_API_CONFIG = { + "scope": "https://cognitiveservices.azure.com/.default", + "client_id": "YOUR_CLIENT_ID_HERE", # Replace with your client ID + "api_details": { + "primary": { + "api_base": "YOUR_PRIMARY_API_BASE_HERE", # Replace with your primary API base + "api_version": "2024-02-01", + }, + "secondary": { + "api_base": "YOUR_SECONDARY_API_BASE_HERE", # Replace with your secondary API base + "api_version": "2024-02-01", + } + } +} +``` + +Alternatively, you can initialize with custom configuration in your training script: + +```python +from utils import initialize_api_config + +custom_config = { + "client_id": "your-client-id", + "api_details": { + "primary": {"api_base": "your-primary-endpoint"}, + "secondary": {"api_base": "your-secondary-endpoint"} + } +} +initialize_api_config(custom_config) +``` + +### 3. 
Configure Training Parameters + +Edit `configs/train_reinforce.yaml` to set your desired hyperparameters: + +```yaml +data: + train_file_path: "path/to/your/train_data.json" + +model: + model_load_path: "microsoft/llmlingua-2-xlm-roberta-large-meetingbank" + trained_model_output_dir_hf: "models/reinforce_trained_model" + +task: + prompt: "Summarize the provided text (which may be compressed).\n{transcript}\nSummary:" + gpt_model: "gpt-35-turbo" + max_tokens: 1500 + +hyperparams: + epochs: 4 + train_batch_size: 8 + policy_lr: 1e-5 + compression_rate: 0.5 + compression_relaxation_tokens: 30 + max_seq_len: 512 + entropy_coeff: 0.01 + + # Compression settings + target_token: -1 + use_context_level_filter: false + use_token_level_filter: true + target_context: -1 + context_level_compression_rate: 1.0 + context_level_target_token: -1 + force_tokens: "" + drop_consecutive: true + force_reserve_digit: false + +logging: + log_dir: "logs_new" + log_interval: 5 + save_interval: 100 + +device: + use_cuda: true + device_map: "auto" +``` + +### 4. Run Training + +```bash +cd LLMLingua/experiments/taco-rl +accelerate launch train_reinforce.py +``` + +## Configuration Parameters + +### Data Configuration +- `train_file_path`: Path to your training data JSON file + +### Model Configuration +- `model_load_path`: Pre-trained LLMLingua model to fine-tune +- `trained_model_output_dir_hf`: Directory to save the fine-tuned model + +### Task Configuration +- `prompt`: Template prompt for the teacher model (GPT-3.5). 
Use `{transcript}` as placeholder for the compressed text +- `gpt_model`: Teacher model name for reward computation +- `max_tokens`: Maximum tokens for teacher model responses + +**Task-Specific Prompt Examples:** + +**Meeting Summarization:** +```yaml +task: + prompt: "Summarize the provided meeting transcript (which may be compressed).\n{transcript}\nSummary:" +``` + +**Question Answering:** +```yaml +task: + prompt: "Answer the following question based on the provided context (which may be compressed).\n{transcript}\nAnswer:" +``` + +**Code Summarization:** +```yaml +task: + prompt: "Summarize the following code (which may be compressed).\n{transcript}\nSummary:" +``` + +### Training Hyperparameters +- `epochs`: Number of training epochs +- `train_batch_size`: Batch size for training +- `policy_lr`: Learning rate for policy optimization +- `compression_rate`: Target compression ratio (0.0-1.0) +- `compression_relaxation_tokens`: Tolerance for compression deviation +- `max_seq_len`: Maximum sequence length for tokenization +- `entropy_coeff`: Entropy regularization coefficient for exploration + +### Compression Settings +- `target_token`: Target number of tokens (-1 for rate-based compression) +- `use_context_level_filter`: Whether to use context-level filtering +- `use_token_level_filter`: Whether to use token-level filtering +- `target_context`: Target number of contexts (-1 for rate-based) +- `context_level_compression_rate`: Context-level compression rate +- `context_level_target_token`: Context-level target tokens +- `force_tokens`: Tokens to always preserve +- `drop_consecutive`: Whether to drop consecutive tokens +- `force_reserve_digit`: Whether to preserve digits + +### Logging Configuration +- `log_dir`: Directory for log files +- `log_interval`: How often to log metrics +- `save_interval`: How often to save model checkpoints + +### Device Configuration +- `use_cuda`: Whether to use CUDA if available +- `device_map`: Device mapping strategy + +## 
Training Process + +### 1. Data Loading +The training script uses the `GenericRLDataset` class to load training data: + +```python +class GenericRLDataset(Dataset): + def __init__(self, data_path): + self.data = json.load(open(data_path)) + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx): + tokenized_dataset = tokenize_text(self.data[str(idx)]["original_prompt"]) + return { + "input_ids": tokenized_dataset["input_ids"], + "attention_mask": tokenized_dataset["attention_mask"], + "org_prompt": self.data[str(idx)]["original_prompt"], + "gpt_output": self.data[str(idx)]["gpt_output"] + } +``` + +### 2. Model Initialization +Creates a `PromptCompressorReinforce` instance with the specified pre-trained model: + +```python +compressor = PromptCompressorReinforce( + model_name=model_name, + model_config={}, + use_llmlingua2=True, + device_map=device_map, +) +``` + +### 3. REINFORCE Training Loop +For each batch: +1. Compresses prompts using the RL-enhanced compressor +2. Calculates rewards based on compression quality using the teacher model +3. Updates the policy network using REINFORCE algorithm +4. Logs training metrics and progress + +## Reward Function + +The reward function evaluates compression quality by: + +1. **Compression Quality**: BLEU score between summaries of original and compressed text +2. **Compression Ratio**: Penalty for not meeting target compression rate +3. **Compression Coefficient**: Penalty for deviating from target compression +4. **Entropy Regularization**: Encourages exploration during training + +### Reward Computation Details +The reward is positive when the compression is within the tolerance threshold. It is set to a negative value during over-compression and when it crosses the tolerance threshold. 
+ +```python +def calculate_rewards(gpt_output, model_compressed_text, compression_ratio): + if compression_ratio["compressed"] > 0.1*compression_ratio["original"]: + org_input_summary = gpt_output + model_compressed_summary = get_gpt_output(model_compressed_text) + comp_ratio = compression_ratio["original"] / compression_ratio["compressed"] + else: + comp_ratio = compression_ratio["original"] / compression_ratio["compressed"] + return {"comp_ratio": comp_ratio, "reward": -0.4} + + if org_input_summary is not None and model_compressed_summary is not None: + eval_scores = evaluate_sim2([model_compressed_summary], [org_input_summary]) + bleu = eval_scores["bleu"] + comp_coeff = compression_ratio["compressed"] - compression_rate*compression_ratio["original"] + + if abs(comp_coeff) > input_dict["compression_relaxation_tokens"]: + reward = -0.1 + else: + reward = bleu + + return {"comp_ratio": comp_ratio, "reward": reward} + else: + return {"comp_ratio": comp_ratio, "reward": 0.4} +``` +### Note: This function will need to be modified as per the task. For example, for the QA task, the *reward* should be F1 Score. + +## Output + +The training script produces: + +### 1. Fine-tuned Model +- Saved to the specified output directory +- Can be loaded and used with the main LLMLingua package + +### 2. Training Logs +- `reinforce_logs_updated.csv`: Policy loss and learning rate logs +- `rewards_logs_updated.csv`: Reward and compression ratio logs +- Console output with real-time training progress + +### 3. Log Format +``` +Train/Val, Epoch, Step, Policy Loss, LR +Train, 0, 0, 0.123, 1e-05 +Train, 0, 5, 0.098, 1e-05 +... +``` + +## Evaluation + +After training your TACO-RL model, you can evaluate its performance using the evaluation scripts and utilities available in the [LLMLingua2 experiments directory](../llmlingua2/evaluation/). 
+ + +## Integration with Main LLMLingua + +After training, you can use your fine-tuned model with the main LLMLingua package: + +```python +from llmlingua import PromptCompressor + +# Load your fine-tuned model +compressor = PromptCompressor( + model_name="path/to/your/fine_tuned_model", + use_llmlingua2=True +) + +# Use it for compression +compressed_prompt = compressor.compress_prompt_llmlingua2( + ["Your prompt here..."], + rate=0.5 +) +``` + + +## Notes + +- The training uses the REINFORCE algorithm for policy gradient optimization +- The compressor maintains the same interface as the original LLMLingua but provides additional RL information +- Make sure to adjust batch sizes and learning rates based on your hardware capabilities +- The teacher model (GPT-3.5) is used to evaluate compression quality and provide reward signals +- All hyperparameters are configurable through the YAML file for easy experimentation +- Different tasks may require different evaluation metrics for optimal performance + +## Debugging Tips + +1. **Check Logs**: Monitor the CSV log files for training progress +2. **Validate Data**: Ensure your training data format is correct +3. **Test Configuration**: Start with a small dataset to test your setup +4. **Monitor Rewards**: Watch for consistent reward improvements over time +5. 
**Check Compression**: Verify that compression ratios are within expected ranges \ No newline at end of file diff --git a/experiments/taco-rl/configs/train_reinforce.yaml b/experiments/taco-rl/configs/train_reinforce.yaml new file mode 100644 index 0000000..9b2d80a --- /dev/null +++ b/experiments/taco-rl/configs/train_reinforce.yaml @@ -0,0 +1,53 @@ +# Configuration file for REINFORCE training + +data: + train_file_path: + +model: + model_load_path: "microsoft/llmlingua-2-xlm-roberta-large-meetingbank" + trained_model_output_dir_hf: "models/reinforce_trained_model" + +task: + prompt: "Summarize the provided meeting transcript (which may be compressed).\n{transcript}\nSummary:" + gpt_model: "gpt-35-turbo" + max_tokens: 1500 + +hyperparams: + # Training parameters + epochs: 4 + train_batch_size: 8 + + # Learning rates + policy_lr: 1e-5 + + # Compression parameters + compression_rate: 0.5 + compression_relaxation_tokens: 30 + + # REINFORCE specific parameters + entropy_coeff: 0.01 # Entropy regularization coefficient + + # Model parameters + max_seq_len: 512 + + # Compression settings + target_token: -1 + use_context_level_filter: false + use_token_level_filter: true + target_context: -1 + context_level_compression_rate: 1.0 + context_level_target_token: -1 + force_tokens: "" + drop_consecutive: true + force_reserve_digit: false + +# Logging settings +logging: + log_dir: "logs_new" + log_interval: 5 + save_interval: 100 + +# Device settings +device: + use_cuda: true + device_map: "cpu" \ No newline at end of file diff --git a/experiments/taco-rl/metrics.py b/experiments/taco-rl/metrics.py new file mode 100644 index 0000000..5e53eed --- /dev/null +++ b/experiments/taco-rl/metrics.py @@ -0,0 +1,27 @@ +import evaluate + +def evaluate_sim2(pred_list, gt_list, truncate_pred=True, truncate_gt=False): + if truncate_pred: + pred_list_truncated = [] + for pred in pred_list: + pred = pred.lstrip("\n").split("\n")[0].strip() + pred_list_truncated.append(pred) + pred_list = 
pred_list_truncated + if truncate_gt: + gt_list_truncated = [] + for gt in gt_list: + gt = gt.lstrip("\n").split("\n")[0].strip() + gt_list_truncated.append(gt) + gt_list = gt_list_truncated + + + rouge = evaluate.load("rouge") + bleu = evaluate.load("bleu") + metrics = {} + rouge_scores = rouge.compute(predictions=pred_list, references=gt_list) + bleu_scores = bleu.compute(predictions=pred_list, references=gt_list) + metrics["rouge1"] = rouge_scores["rouge1"] + metrics["rougeL"] = rouge_scores["rougeL"] + metrics["bleu"] = bleu_scores["bleu"] + + return metrics \ No newline at end of file diff --git a/experiments/taco-rl/train_reinforce.py b/experiments/taco-rl/train_reinforce.py new file mode 100644 index 0000000..9b195f2 --- /dev/null +++ b/experiments/taco-rl/train_reinforce.py @@ -0,0 +1,276 @@ +import torch +import torch.optim as optim +import numpy as np +from torch.utils.data import DataLoader, Dataset +from accelerate import Accelerator +from transformers import BertModel, BertTokenizer, AutoTokenizer, AutoModelForTokenClassification +import torch.nn.functional as F +import random +import json +from tqdm.auto import tqdm +import torch.nn as nn +from torch.distributions import Categorical +from llmlingua.taco_rl import PromptCompressorReinforce +from metrics import evaluate_sim2 +from utils import query_llm +from accelerate import Accelerator +from accelerate import DistributedDataParallelKwargs as DDPK +from accelerate.utils import InitProcessGroupKwargs +from datetime import timedelta +from csv_logger import CsvLogger +import logging +import regex as re +import hydra +import gc +import os +import warnings +import math + + +warnings.filterwarnings("ignore") +os.environ["TOKENIZERS_PARALLELISM"] = "false" + +# turn off logging +logging.disable(logging.CRITICAL) + +# kwargs = DDPK(find_unused_parameters=True) +kwargs = InitProcessGroupKwargs(timeout=timedelta(seconds=3600)) +accelerator = Accelerator(kwargs_handlers=[kwargs]) + +input_dict={} 
+@hydra.main(config_path="configs", config_name="train_reinforce.yaml", version_base=None) +def run(cfg): + input_dict["train_data_path"] = cfg.data.train_file_path + input_dict["model_load_path_hf"] = cfg.model.model_load_path + input_dict["model_save_path_hf"] = cfg.model.trained_model_output_dir_hf + input_dict["num_epochs"] = cfg.hyperparams.epochs + input_dict["compression_rate"] = cfg.hyperparams.compression_rate + input_dict["policy_lr"] = cfg.hyperparams.policy_lr + input_dict["train_batch_size"] = cfg.hyperparams.train_batch_size + input_dict["compression_relaxation_tokens"] = cfg.hyperparams.compression_relaxation_tokens + input_dict["prompt"] = cfg.task.prompt + input_dict["gpt_model"] = cfg.task.gpt_model + input_dict["max_tokens"] = cfg.task.max_tokens + input_dict["target_token"] = cfg.hyperparams.target_token + input_dict["use_context_level_filter"] = cfg.hyperparams.use_context_level_filter + input_dict["use_token_level_filter"] = cfg.hyperparams.use_token_level_filter + input_dict["target_context"] = cfg.hyperparams.target_context + input_dict["context_level_compression_rate"] = cfg.hyperparams.context_level_compression_rate + input_dict["context_level_target_token"] = cfg.hyperparams.context_level_target_token + input_dict["force_tokens"] = cfg.hyperparams.force_tokens + input_dict["drop_consecutive"] = cfg.hyperparams.drop_consecutive + input_dict["force_reserve_digit"] = cfg.hyperparams.force_reserve_digit + input_dict["max_seq_len"] = cfg.hyperparams.max_seq_len + + # Logging settings + input_dict["log_dir"] = cfg.logging.log_dir + input_dict["log_interval"] = cfg.logging.log_interval + input_dict["save_interval"] = cfg.logging.save_interval + + # Device settings + input_dict["use_cuda"] = cfg.device.use_cuda + input_dict["device_map"] = cfg.device.device_map + + # REINFORCE parameters + input_dict["entropy_coeff"] = cfg.hyperparams.entropy_coeff + +run() + +# Create log directory if it doesn't exist +os.makedirs(input_dict["log_dir"], 
exist_ok=True) + +if accelerator.is_local_main_process: + delimiter = "," + level_names = ["Train", "Val"] + reward_level_names = ["TrainR", "ValR"] + csv_logger = CsvLogger(filename=f"{input_dict['log_dir']}/reinforce_logs_updated.csv",fmt = f'%(levelname)s{delimiter}%(message)s', add_level_names= level_names,header=["Train/Val", "Epoch", "Step", "Policy Loss", "LR"]) + rewards_logger = CsvLogger(filename=f"{input_dict['log_dir']}/rewards_logs_updated.csv",fmt = f'%(levelname)s{delimiter}%(message)s', add_level_names= reward_level_names,header=["Train/Val", "Epoch", "Step", "Rewards", "Comp Ratio"]) + +def get_model_output(input_prompt_list): + for i in range(len(input_prompt_list)): + input_prompt_list[i] = re.sub(r'\n', ' ', input_prompt_list[i]) + + compressed_output_dict = compressor.compress_prompt_llmlingua2( + input_prompt_list, + rate=compression_rate, + target_token=target_token, + use_context_level_filter=use_context_level_filter, + use_token_level_filter=use_token_level_filter, + target_context=target_context, + context_level_rate=context_level_compression_rate, + context_level_target_token=context_level_target_token, + force_tokens=force_tokens, + drop_consecutive=drop_consecutive, + force_reserve_digit=force_reserve_digit, + ) + + return compressed_output_dict['compressed_prompt'], compressed_output_dict['compressed_prompt_list'], compressed_output_dict['compression_ratios'], compressed_output_dict["actions"], compressed_output_dict["old_log_probs"], compressed_output_dict["old_logits"], compressed_output_dict["compressed_prompt_list_2"], compressed_output_dict["entropy"] + + +class REINFORCE: + def __init__(self, policy_network, policy_optimizer, policy_scheduler): + self.policy_network = policy_network + self.policy_optimizer = policy_optimizer + self.policy_scheduler = policy_scheduler + + def update(self, rewards_and_comp_ratios, old_log_probs, entropies, epoch_counter, steps): + rewards = [reward["reward"] for reward in rewards_and_comp_ratios] + 
comp_ratios = [reward["comp_ratio"] for reward in rewards_and_comp_ratios] + + rewards = torch.FloatTensor(rewards).to(accelerator.device) + rewards = rewards.unsqueeze(1).detach() + + comp_ratios = torch.FloatTensor(comp_ratios).to(accelerator.device) + comp_ratios = comp_ratios.unsqueeze(1).detach() + + entropies = torch.stack(entropies) + old_log_probs = torch.stack(old_log_probs) + + # Enhanced policy loss with entropy regularization + policy_loss = -torch.mean(old_log_probs * (rewards)) - input_dict["entropy_coeff"]*torch.mean(entropies) + + self.policy_optimizer.zero_grad() + + if steps % input_dict["log_interval"] == 0: + print(f"Avg. Entropy: {torch.mean(entropies)}") + + if accelerator.is_local_main_process: + csv_logger.Train([epoch_counter, steps, policy_loss.item(), self.policy_scheduler.get_last_lr()[0]]) + + accelerator.backward(policy_loss) + self.policy_optimizer.step() + self.policy_scheduler.step() + +def get_gpt_output(input_text): + query = prompt.format(transcript=input_text) + gpt_output = query_llm( + prompt=query, + model_name=input_dict["gpt_model"], + max_tokens=input_dict["max_tokens"], + ) + + return gpt_output + +def calculate_rewards(gpt_output, model_compressed_text, compression_ratio): + if compression_ratio["compressed"] > 0.1*compression_ratio["original"]: + org_input_summary = gpt_output + model_compressed_summary = get_gpt_output(model_compressed_text) + comp_ratio = compression_ratio["original"] / compression_ratio["compressed"] + else: + print(compression_ratio) + comp_ratio = compression_ratio["original"] / compression_ratio["compressed"] + return {"comp_ratio": comp_ratio, "reward": -0.4} + + if org_input_summary is not None and model_compressed_summary is not None: + eval_scores = evaluate_sim2([model_compressed_summary], [org_input_summary]) + bleu = eval_scores["bleu"] + comp_coeff = compression_ratio["compressed"] - compression_rate*compression_ratio["original"] + + if abs(comp_coeff) > 
input_dict["compression_relaxation_tokens"]: + reward = -0.1 + else: + reward = bleu + + return {"comp_ratio": comp_ratio, "reward": reward} + else: + return {"comp_ratio": comp_ratio, "reward": 0.4} + +def train(dataloader, num_epochs): + epoch_counter = 0 + steps = 0 + for epoch in range(num_epochs): + for batch_idx, batch in enumerate(tqdm(dataloader, total=len(dataloader)*(num_epochs-epoch))): + input_prompt_list = batch["org_prompt"] + gpt_output_list = batch["gpt_output"] + + compressed_prompt, compressed_prompt_list, compression_ratio_list, actions, old_log_probs, _, compressed_prompt_list_2, entropies = get_model_output(input_prompt_list) + + rewards_and_comp_ratios = [calculate_rewards(gpt_output_list[i], compressed_prompt_list_2[i], compression_ratio_list[i]) for i in range(len(gpt_output_list))] + rewards = [reward["reward"] for reward in rewards_and_comp_ratios] + comp_ratios = [reward["comp_ratio"] for reward in rewards_and_comp_ratios] + + # update the policy network + reinforce_agent.update(rewards_and_comp_ratios, old_log_probs, entropies, epoch_counter, steps) + + # Enhanced logging with more metrics + if batch_idx % input_dict["log_interval"] == 0: + print(f"Epoch: {epoch}, Batch: {batch_idx}, Avg. Batch Rewards: {sum(rewards)/len(rewards)}, Avg. 
Comp Ratio: {sum(comp_ratios)/len(comp_ratios)}") + if accelerator.is_local_main_process: + rewards_logger.TrainR([epoch_counter, batch_idx//input_dict["log_interval"], sum(rewards)/len(rewards), sum(comp_ratios)/len(comp_ratios)]) + + steps += 1 + + epoch_counter += 1 + +def tokenize_text(text): + tokenize_text = tokenizer(text, max_length=input_dict["max_seq_len"], padding="max_length", return_tensors="pt", truncation=True) + return {k: v[0] for k, v in tokenize_text.items()} + +class GenericRLDataset(Dataset): + def __init__(self, data_path): + self.data = json.load(open(data_path)) + # self.data = dict(list(self.data.items())[:100]) + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx): + tokenized_dataset = tokenize_text(self.data[str(idx)]["original_prompt"]) + return {"input_ids": tokenized_dataset["input_ids"], "attention_mask": tokenized_dataset["attention_mask"], "org_prompt": self.data[str(idx)]["original_prompt"], "gpt_output": self.data[str(idx)]["gpt_output"]} + + +# Example usage +device = torch.device("cuda" if torch.cuda.is_available() and input_dict["use_cuda"] else "cpu") + +input_data_path = input_dict["train_data_path"] +model_name = input_dict["model_load_path_hf"] + +compression_rate = input_dict["compression_rate"] +target_token = input_dict["target_token"] +use_context_level_filter = input_dict["use_context_level_filter"] +use_token_level_filter = input_dict["use_token_level_filter"] +target_context = input_dict["target_context"] +context_level_compression_rate = input_dict["context_level_compression_rate"] +context_level_target_token = input_dict["context_level_target_token"] +force_tokens = input_dict["force_tokens"] +drop_consecutive = input_dict["drop_consecutive"] +force_reserve_digit = input_dict["force_reserve_digit"] + +# Use the new PromptCompressorReinforce class +compressor = PromptCompressorReinforce( + model_name=model_name, + model_config={}, + use_llmlingua2=True, + device_map=input_dict["device_map"], 
+) + +tokenizer = compressor.tokenizer + +prompt = input_dict["prompt"] + +policy_optimizer = optim.AdamW(compressor.model.parameters(), lr=input_dict["policy_lr"]) + +dataset = GenericRLDataset(input_data_path) +dataloader = DataLoader(dataset, batch_size=input_dict["train_batch_size"], shuffle=True) + +num_epochs = input_dict["num_epochs"] + +policy_scheduler = optim.lr_scheduler.CosineAnnealingLR(policy_optimizer, T_max=num_epochs*len(dataloader)) + +compressor.model, policy_optimizer, policy_scheduler, dataloader = accelerator.prepare( + compressor.model, policy_optimizer, policy_scheduler, dataloader +) + +policy_net = compressor.model + +reinforce_agent = REINFORCE(policy_net, policy_optimizer, policy_scheduler) + +train(dataloader, num_epochs=num_epochs) + +# save the model +unwrapped_model = accelerator.unwrap_model(policy_net) +unwrapped_model.save_pretrained( + input_dict["model_save_path_hf"], + is_main_process=accelerator.is_main_process, + save_function=accelerator.save, +) \ No newline at end of file diff --git a/experiments/taco-rl/utils.py b/experiments/taco-rl/utils.py new file mode 100644 index 0000000..353e979 --- /dev/null +++ b/experiments/taco-rl/utils.py @@ -0,0 +1,361 @@ +import json +import os +import random +import re +import string +import time + +import numpy as np +import torch +import yaml +from torch.utils.data import Dataset +from azure.identity import ManagedIdentityCredential, get_bearer_token_provider +from openai import AzureOpenAI + + +# Default API configuration - can be overridden by user +DEFAULT_API_CONFIG = { + "scope": "https://cognitiveservices.azure.com/.default", + "client_id": "YOUR_CLIENT_ID_HERE", # Replace with your client ID + "api_details": { + "primary": { + "api_base": "YOUR_PRIMARY_API_BASE_HERE", # Replace with your primary API base + "api_version": "2024-02-01", + }, + "secondary": { + "api_base": "YOUR_SECONDARY_API_BASE_HERE", # Replace with your secondary API base + "api_version": "2024-02-01", + } + } +} 
+ +# Global variables for API configuration +api_config = DEFAULT_API_CONFIG.copy() +token_provider = None +client = None +api_base = None + +def initialize_api_config(custom_config=None): + """Initialize API configuration with custom settings or defaults""" + global api_config, token_provider, client, api_base + + if custom_config: + api_config.update(custom_config) + + # Initialize token provider + token_provider = get_bearer_token_provider( + ManagedIdentityCredential(client_id=api_config["client_id"]), + api_config["scope"] + ) + + # Initialize client with default endpoint + client = AzureOpenAI( + api_version=api_config["api_details"]["primary"]["api_version"], + azure_endpoint=api_config["api_details"]["primary"]["api_base"], + azure_ad_token_provider=token_provider + ) + + api_base = api_config["api_details"]["primary"]["api_base"] + +# Initialize with default config +initialize_api_config() + +def query_llm( + prompt, + model_name, + max_tokens, + **kwargs, +): + SLEEP_TIME_FAILED = 1 + global api_base, client, api_config + + request = { + "temperature": kwargs["temperature"] if "temperature" in kwargs else 0.0, + "top_p": kwargs["top_p"] if "top_p" in kwargs else 1.0, + "seed": kwargs["seed"] if "seed" in kwargs else 42, + "max_tokens": max_tokens, + "n": 1, + "stream": False, + } + request["messages"] = [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": prompt}, + ] + + answer = None + response = None + while answer is None: + try: + response = client.chat.completions.create(model=model_name, **request) + answer = response.choices[0].message.content + + except Exception as e: + print(f"error: {e}, response: {response}") + answer = None + if "content management" in str(e): + print(f"error: {e}, response: {response}") + print("returning None") + break + elif "content" in str(e): + print(f"error: {e}, response: {response}") + print("returning None") + break + elif "filtered" in str(response): + print(f"error: 
{e}, response: {response}") + print("returning None") + break + elif "repetitive patterns" in str(e): + print(f"error: {e}, response: {response}") + print("returning None") + break + elif "exceeded token rate limit" in str(e) or "exceeded call rate limit" in str(e) or "rate limit" in str(e): + # change api details + # print(f"error: {e}, response: {response}") + print("changing api details") + if api_base == api_config["api_details"]["primary"]["api_base"]: + api_base = api_config["api_details"]["secondary"]["api_base"] + client = AzureOpenAI( + api_version=api_config["api_details"]["secondary"]["api_version"], + azure_endpoint=api_config["api_details"]["secondary"]["api_base"], + azure_ad_token_provider=token_provider + ) + else: + api_base = api_config["api_details"]["primary"]["api_base"] + client = AzureOpenAI( + api_version=api_config["api_details"]["primary"]["api_version"], + azure_endpoint=api_config["api_details"]["primary"]["api_base"], + azure_ad_token_provider=token_provider + ) + time.sleep(SLEEP_TIME_FAILED) + # sleep(SLEEP_TIME_SUCCESS) + return answer + + +class TokenClfDataset(Dataset): + def __init__( + self, + texts, + max_len=512, + tokenizer=None, + model_name="bert-base-multilingual-cased", + ): + self.len = len(texts) + self.texts = texts + self.tokenizer = tokenizer + self.max_len = max_len + self.model_name = model_name + if "bert-base-multilingual-cased" in model_name: + self.cls_token = "[CLS]" + self.sep_token = "[SEP]" + self.unk_token = "[UNK]" + self.pad_token = "[PAD]" + self.mask_token = "[MASK]" + elif "xlm-roberta-large" in model_name: + self.bos_token = "" + self.eos_token = "" + self.sep_token = "" + self.cls_token = "" + self.unk_token = "" + self.pad_token = "" + self.mask_token = "" + else: + raise NotImplementedError() + + def __getitem__(self, index): + text = self.texts[index] + tokenized_text = self.tokenizer.tokenize(text) + + tokenized_text = ( + [self.cls_token] + tokenized_text + [self.sep_token] + ) # add special 
tokens

        if len(tokenized_text) > self.max_len:
            # Truncate to the model's maximum sequence length.
            tokenized_text = tokenized_text[: self.max_len]
        else:
            # Right-pad with pad tokens up to max_len.
            tokenized_text = tokenized_text + [
                self.pad_token for _ in range(self.max_len - len(tokenized_text))
            ]

        # Attention mask: 1 for real tokens, 0 for padding.
        attn_mask = [1 if tok != self.pad_token else 0 for tok in tokenized_text]

        ids = self.tokenizer.convert_tokens_to_ids(tokenized_text)

        return {
            "ids": torch.tensor(ids, dtype=torch.long),
            "mask": torch.tensor(attn_mask, dtype=torch.long),
        }

    def __len__(self):
        return self.len


def seed_everything(seed: int):
    """Seed every RNG source (python, hash, numpy, torch) for reproducibility."""
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Force deterministic cuDNN kernels; disables autotuning.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


def is_begin_of_new_word(token, model_name, force_tokens, token_map):
    """Return True if `token` starts a new word for the given tokenizer family.

    Force tokens and mapped added tokens are always treated as word starts.
    """
    if "bert-base-multilingual-cased" in model_name \
        or "tinybert" in model_name.lower() \
        or "mobilebert" in model_name.lower():
        # NOTE(review): lstrip("##") strips a *character set*, not the "##"
        # prefix — a token made only of '#' would be emptied; presumably no
        # such tokens occur with these vocabularies. Confirm if in doubt.
        if token.lstrip("##") in force_tokens or token.lstrip("##") in set(
            token_map.values()
        ):
            return True
        # WordPiece marks continuation pieces with a leading "##".
        return not token.startswith("##")
    elif "xlm-roberta-large" in model_name:
        if (
            token in string.punctuation
            or token in force_tokens
            or token in set(token_map.values())
        ):
            return True
        # SentencePiece marks word-initial pieces with a leading "▁".
        return token.startswith("▁")
    else:
        raise NotImplementedError()


def replace_added_token(token, token_map):
    """Map temporary added tokens in `token` back to their original strings."""
    for ori_token, new_token in token_map.items():
        token = token.replace(new_token, ori_token)
    return token


def get_pure_token(token, model_name):
    """Strip the tokenizer-specific subword prefix ("##" or "▁") from `token`."""
    if "bert-base-multilingual-cased" in model_name \
        or "tinybert" in model_name.lower() \
        or "mobilebert" in model_name.lower():
        return token.lstrip("##")
    elif "xlm-roberta-large" in model_name:
        return token.lstrip("▁")
    else:
        raise NotImplementedError()


def process_structured_json_data(json_data, json_config):
    """Flatten structured JSON into per-key context strings for compression.

    `json_config` may be a dict or a path to a yaml file holding per-key
    rate/compress/value_type settings; keys must match `json_data` exactly.
    """
    if isinstance(json_config, str):
        with open(json_config, "r") as file:
            json_config = 
yaml.safe_load(file) + elif not isinstance(json_config, dict): + raise ValueError( + "Invalid json config file. It should be a dictionary or a path to a yaml file." + ) + assert set(json_data.keys()) == set( + json_config.keys() + ), "Keys in json data and json config file do not match." + context = ["{"] + forced_context_ids = [0] + for i, (k, v) in enumerate(json_data.items()): + if not json_config[k]["pair_remove"]: + forced_context_ids.append(i + 1) + rate, compress, value_type = ( + json_config[k]["rate"], + json_config[k]["compress"], + json_config[k]["value_type"], + ) + if not compress: + rate = 1 + context.append(precess_jsonKVpair(k, v, value_type, rate)) + context[-1] = context[-1][:-14] + "" + context.append("}") + forced_context_ids.append(len(json_data) + 1) + + return context, forced_context_ids + + +def precess_jsonKVpair(k, v, value_type, rate): + if rate == 1: + return ( + "" + + f"{json.dumps({k:v})[1:-1]}, " + + "" + ) + if value_type == "str" or value_type == "string": + v = str(v) + new_v = ( + f"" + + v + + "" + ) + return ( + "" + + f"{json.dumps({k:new_v})[1:-1]}, " + + "" + ) + elif value_type in ["int", "float", "integer", "number"]: + if value_type in ["int", "integer"]: + v = int(v) + if value_type in ["float", "number"]: + v = float(v) + return ( + "" + + f'"{k}": {v}, ' + ) + elif value_type == "bool" or value_type == "boolean": + if v in ["True", "true", "TRUE", True]: + v = "true" + elif v in ["False", "false", "FALSE", False]: + v = "false" + else: + raise ValueError(f"Invalid boolean value: {v}") + new_v = ( + f"" + + v + + "" + ) + return ( + "" + + f"{json.dumps({k:new_v})[1:-1]}, " + + "" + ) + elif value_type == "list" or value_type == "List": + return ( + "" + + f'"{k}": {process_sequence_data(rate, "[", "]", v)}' + ) + elif value_type == "dict" or value_type == "dictionary": + return ( + "" + + f'"{k}": {process_sequence_data(rate, "[", "]", v, is_dict=True)}' + ) + elif value_type == "set": + raise ValueError(f"Invalid 
value type: {value_type}") + # return '' + f'"{k}": {process_sequence_data(rate, "{", "}", v)}' + elif value_type == "tuple": + return ( + "" + + f'"{k}": {process_sequence_data(rate, "(", ")", v)}' + ) + else: + raise ValueError(f"Invalid value type: {value_type}") + + +def process_sequence_data(rate, start, end, sequence, is_dict=False): + res = f'{start}"' + n = len(sequence) + if not is_dict: + for i, item in enumerate(sequence): + item = str(item) + res += f"{item}" + if i != n - 1: + res += '", "' + else: + for i, (k, v) in enumerate(sequence.items()): + item = f"{k}: {v}" + item.replace('"', "'") + res += f"{item}" + if i != n - 1: + res += '", "' + res += f'"{end}, ' + return res + + +def remove_consecutive_commas(text): + text = re.sub(r",\s*", ",", text) + text = re.sub(r",+", ",", text) + return text \ No newline at end of file diff --git a/llmlingua/__init__.py b/llmlingua/__init__.py index d750210..ead3b84 100644 --- a/llmlingua/__init__.py +++ b/llmlingua/__init__.py @@ -3,6 +3,7 @@ # flake8: noqa from .prompt_compressor import PromptCompressor +from .taco_rl import PromptCompressorReinforce from .version import VERSION as __version__ -__all__ = ["PromptCompressor"] +__all__ = ["PromptCompressor", "PromptCompressorReinforce"] diff --git a/llmlingua/taco-rl/README.md b/llmlingua/taco-rl/README.md new file mode 100644 index 0000000..e52916b --- /dev/null +++ b/llmlingua/taco-rl/README.md @@ -0,0 +1,112 @@ +# TACO-RL: Task-Aware Prompt Compression Optimization with Reinforcement Learning + +This submodule provides reinforcement learning capabilities for fine-tuning LLMLingua models on new tasks using reward signals from language models like GPT-3.5. + +## Overview + +TACO-RL (Task-Aware Prompt Compression Optimization with Reinforcement Learning) extends the LLMLingua framework by adding reinforcement learning capabilities that allow you to fine-tune pre-trained LLMLingua models on new tasks. 
The approach addresses the limitations of existing prompt compression techniques by leveraging task-specific reward signals to guide the learning process. + +### Key Innovation + +Unlike traditional prompt compression methods that either rely on sub-optimal metrics like information entropy or treat compression as a task-agnostic token classification problem, TACO-RL: + +1. **Leverages Bidirectional Context**: Uses existing Transformer encoder-based token classification models for low latency +2. **Task-Specific Optimization**: Guides learning with task-specific reward signals using lightweight REINFORCE algorithm +3. **Performance Improvement**: Achieves 8%-189% improvement across diverse tasks while maintaining compression rates and latency requirements + +### Research Foundation + +Based on the paper "TACO-RL: Task Aware Prompt Compression Optimization with Reinforcement Learning" ([arXiv:2409.13035](https://arxiv.org/pdf/2409.13035)), this implementation addresses two key research questions: + +- **Q1**: How to design a prompt compression model that effectively leverages bidirectional context while providing low inference latency? +- **Q2**: How to efficiently train a model with proper guidance from task-specific reward signals while minimizing computational cost? + +The solution builds on LLMLingua-2's task-agnostic encoder-based transformer model and enhances it with task-specific reward signals using on-policy REINFORCE algorithm. + +## How TACO-RL Works + +### Architecture Overview + +1. **Pre-trained Base Model**: Start with a pre-trained LLMLingua model (e.g., `microsoft/llmlingua-2-xlm-roberta-large-meetingbank`) +2. **Task-Specific Data**: Prepare task-specific data with original prompts and expected outputs +3. **Reward Signal Generation**: Use a teacher model (e.g., GPT-3.5) to evaluate compression quality by comparing outputs from original vs. compressed prompts +4. 
**REINFORCE Training**: Fine-tune the model using policy gradients based on task-specific reward signals +5. **Task-Optimized Model**: Get a model optimized for your specific task while maintaining low inference latency + +### Training Process + +During the model alignment process: +1. Generate task output from both original and compressed prompts +2. Compute task-specific reward signal using divergence between outputs +3. Update the base encoder model using on-policy REINFORCE algorithm +4. The reward signals guide the model to preserve task-relevant information during compression + +## Task-Specific Metrics and Evaluation + +TACO-RL has been extensively evaluated on three diverse and challenging tasks with different evaluation metrics: + +### 1. Text Summarization (MeetingBank Dataset) + +**Metrics Considered:** +- **BLEU**: Measures n-gram overlap between generated and reference summaries +- **ROUGE-1**: Unigram overlap for content coverage +- **ROUGE-2**: Bigram overlap for fluency assessment +- **ROUGE-L**: Longest common subsequence for structure preservation +- **BERTScore F1**: Semantic similarity using BERT embeddings + +### 2. Question Answering (SQuAD Dataset) + +**Metrics Considered:** +- **Exact Match (EM)**: Percentage of answers that exactly match the ground truth +- **F1 Score**: Harmonic mean of precision and recall for answer span prediction +- **BLEU**: For answer generation quality +- **ROUGE**: For answer completeness and relevance + +### 3. Code Summarization (CodeSearchNet Dataset) + +**Metrics Considered:** +- **BLEU**: For code summary quality +- **ROUGE-1/2/L**: For summary completeness and relevance +- **BERTScore F1**: For semantic similarity of code descriptions +- **CodeBLEU**: Specialized metric for code-to-text generation + +## Key Features + +The `PromptCompressorReinforce` class extends the base `PromptCompressor` with: + +1. **Action Tracking**: Tracks actions taken during compression for RL training +2. 
**Log Probability Storage**: Stores log probabilities for policy gradient computation +3. **Entropy Calculation**: Computes entropy for exploration regularization +4. **RL Information Methods**: Provides methods to access and clear RL-specific data +5. **Configurable Training**: All hyperparameters configurable via YAML files +6. **Multi-Metric Support**: Configurable evaluation metrics for different tasks + + +## Integration with Main LLMLingua + +After training, you can use your fine-tuned model with the main LLMLingua package: + +```python +from llmlingua import PromptCompressor + +# Load your fine-tuned model +compressor = PromptCompressor( + model_name="path/to/your/fine_tuned_model", + use_llmlingua2=True +) + +# Use it for compression +compressed_prompt = compressor.compress_prompt_llmlingua2( + ["Your prompt here..."], + rate=0.5 +) +``` + +## Notes + +- The training uses the REINFORCE algorithm for policy gradient optimization +- The compressor maintains the same interface as the original LLMLingua but provides additional RL information +- Make sure to adjust batch sizes and learning rates based on your hardware capabilities +- The teacher model (GPT-3.5) is used to evaluate compression quality and provide reward signals +- All hyperparameters are configurable through the YAML file for easy experimentation +- Different tasks may require different evaluation metrics for optimal performance diff --git a/llmlingua/taco-rl/__init__.py b/llmlingua/taco-rl/__init__.py new file mode 100644 index 0000000..6a41be2 --- /dev/null +++ b/llmlingua/taco-rl/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) 2023 Microsoft +# Licensed under The MIT License [see LICENSE for details] + +# flake8: noqa +from .prompt_compressor_reinforce import PromptCompressorReinforce + +__all__ = ["PromptCompressorReinforce"] diff --git a/llmlingua/taco-rl/prompt_compressor_reinforce.py b/llmlingua/taco-rl/prompt_compressor_reinforce.py new file mode 100644 index 0000000..0ccd00f --- /dev/null 
+++ b/llmlingua/taco-rl/prompt_compressor_reinforce.py @@ -0,0 +1,509 @@ +# Copyright (c) 2023-2025 Microsoft +# Licensed under The MIT License [see LICENSE for details] + +import re +import copy +import numpy as np +import torch +import torch.nn.functional as F +from torch.distributions import Categorical +from torch.utils.data import DataLoader +from typing import List +from llmlingua.prompt_compressor import PromptCompressor +from utils import TokenClfDataset, get_pure_token, is_begin_of_new_word, replace_added_token + +class PromptCompressorReinforce(PromptCompressor): + """ + PromptCompressorReinforce extends PromptCompressor with reinforcement learning capabilities. + + This class overrides the compression methods to track actions, log probabilities, and entropy + for reinforcement learning training. It maintains the same interface as PromptCompressor + but provides additional RL-specific outputs. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.actions = [] + self.old_log_probs = [] + self.old_logits = None + self.entropy = [] + self.compressed_prompt_list_2 = [] + self.compression_ratios = [] + + def compress_prompt_llmlingua2( + self, + context: List[str], + rate: float = 0.5, + target_token: int = -1, + use_context_level_filter: bool = False, + use_token_level_filter: bool = True, + target_context: int = -1, + context_level_rate: float = 1.0, + context_level_target_token: int = -1, + force_context_ids: List[int] = [], + return_word_label: bool = False, + word_sep: str = "\t\t|\t\t", + label_sep: str = " ", + token_to_word: str = "mean", + force_tokens: List[str] = [], + force_reserve_digit: bool = False, + drop_consecutive: bool = False, + chunk_end_tokens: List[str] = [".", "\n"], + ): + """ + Override the llmlingua2 compression method to track RL-specific information. 
+ """ + assert len(force_tokens) <= self.max_force_token + token_map = {} + for i, t in enumerate(force_tokens): + if len(self.tokenizer.tokenize(t)) != 1: + token_map[t] = self.added_tokens[i] + chunk_end_tokens = copy.deepcopy(chunk_end_tokens) + for c in chunk_end_tokens: + if c in token_map: + chunk_end_tokens.append(token_map[c]) + chunk_end_tokens = set(chunk_end_tokens) + + if type(context) == str: + context = [context] + context = copy.deepcopy(context) + + if len(context) == 1 and use_context_level_filter: + use_context_level_filter = False + + n_original_token = 0 + context_chunked = [] + for i in range(len(context)): + n_original_token += self.get_token_length( + context[i], use_oai_tokenizer=True + ) + for ori_token, new_token in token_map.items(): + context[i] = context[i].replace(ori_token, new_token) + context_chunked.append( + self.__chunk_context(context[i], chunk_end_tokens=chunk_end_tokens) + ) + + org_context = None + if use_context_level_filter: + # want use_context_level_filter but do not specify any parameters in context level? 
+ # we will set context_level_rate = (rate + 1.0) / 2 if specify rate or target_token * 2 if specify target_token + if ( + target_context <= 0 + and context_level_rate >= 1.0 + and context_level_target_token <= 0 + ): + if target_token < 0 and rate < 1.0: + context_level_rate = ( + (rate + 1.0) / 2 if use_token_level_filter else rate + ) + if target_token >= 0: + context_level_target_token = ( + target_token * 2 if use_token_level_filter else target_token + ) + + if target_context >= 0: + context_level_rate = min(target_context / len(context), 1.0) + if context_level_target_token >= 0: + context_level_rate = min( + context_level_target_token / n_original_token, 1.0 + ) + + context_probs, context_words = self.__get_context_prob( + context_chunked, + token_to_word=token_to_word, + force_tokens=force_tokens, + token_map=token_map, + force_reserve_digit=force_reserve_digit, + ) + + threshold = np.percentile( + context_probs, int(100 * (1 - context_level_rate)) + ) + + reserved_context = [] + context_label = [False] * len(context_probs) + for i, p in enumerate(context_probs): + if p >= threshold or ( + force_context_ids is not None and i in force_context_ids + ): + reserved_context.append(context_chunked[i]) + context_label[i] = True + n_reserved_token = 0 + for chunks in reserved_context: + for c in chunks: + n_reserved_token += self.get_token_length(c, use_oai_tokenizer=True) + if target_token >= 0: + rate = min(target_token / n_reserved_token, 1.0) + + org_context = copy.deepcopy(reserved_context) + + if use_token_level_filter: + compressed_context, word_list, word_label_list, actions, old_log_probs, old_logits, compressed_prompt_list_2, entropy = self.__compress( + reserved_context, + reduce_rate=max(0, 1 - rate), + token_to_word=token_to_word, + force_tokens=force_tokens, + token_map=token_map, + force_reserve_digit=force_reserve_digit, + drop_consecutive=drop_consecutive, + ) + else: + compressed_context, word_list, word_label_list, actions, old_log_probs, 
old_logits, compressed_prompt_list_2, entropy = self.__compress( + reserved_context, + reduce_rate=0, + token_to_word=token_to_word, + force_tokens=force_tokens, + token_map=token_map, + force_reserve_digit=force_reserve_digit, + drop_consecutive=drop_consecutive, + ) + + n_compressed_token = 0 + compression_ratios=[] + for c,o in zip(compressed_prompt_list_2,org_context): + org_token_length = self.get_token_length(o, use_oai_tokenizer=True) + compressed_token_length = self.get_token_length(c, use_oai_tokenizer=True) + fin_dict = {"original":org_token_length, "compressed":compressed_token_length, "original_context":o, "compressed_context":c} + compression_ratios.append(fin_dict) + for c in compressed_context: + n_compressed_token += self.get_token_length(c, use_oai_tokenizer=True) + saving = (n_original_token - n_compressed_token) * 0.06 / 1000 + ratio = ( + 1 if n_compressed_token == 0 else n_original_token / n_compressed_token + ) + res = { + "compressed_prompt": "\n\n".join(compressed_context), + "compressed_prompt_list": compressed_context, + "compression_ratios":compression_ratios, + "compressed_prompt_list_2":compressed_prompt_list_2, + "entropy": entropy, + "origin_tokens": n_original_token, + "compressed_tokens": n_compressed_token, + "ratio": f"{ratio:.1f}x", + "rate": f"{1 / ratio * 100:.1f}%", + "saving": f", Saving ${saving:.1f} in GPT-4.", + "actions": actions, + "old_log_probs": old_log_probs, + "old_logits": old_logits, + } + if return_word_label: + words = [] + labels = [] + j = 0 + for i in range(len(context)): + if context_label[i]: + words.extend(word_list[j]) + labels.extend(word_label_list[j]) + j += 1 + else: + words.extend(context_words[i]) + labels.extend([0] * len(context_words[i])) + word_label_lines = word_sep.join( + [f"{word}{label_sep}{label}" for word, label in zip(words, labels)] + ) + res["fn_labeled_original_prompt"] = word_label_lines + return res + + if target_token > 0: + rate = min(target_token / n_original_token, 1.0) + + 
org_context = copy.deepcopy(context_chunked) + + if use_token_level_filter: + compressed_context, word_list, word_label_list, actions, old_log_probs, old_logits, compressed_prompt_list_2, entropy = self.__compress( + context_chunked, + reduce_rate=max(0, 1 - rate), + token_to_word=token_to_word, + force_tokens=force_tokens, + token_map=token_map, + force_reserve_digit=force_reserve_digit, + drop_consecutive=drop_consecutive, + ) + else: + compressed_context, word_list, word_label_list, actions, old_log_probs, old_logits, compressed_prompt_list_2, entropy = self.__compress( + context_chunked, + reduce_rate=0, + token_to_word=token_to_word, + force_tokens=force_tokens, + token_map=token_map, + force_reserve_digit=force_reserve_digit, + drop_consecutive=drop_consecutive, + ) + + n_compressed_token = 0 + compression_ratios=[] + for c,o in zip(compressed_prompt_list_2, org_context): + org_token_length = self.get_token_length(o[0], use_oai_tokenizer=True) + compressed_token_length = self.get_token_length(c, use_oai_tokenizer=True) + fin_dict = {"original":org_token_length, "compressed":compressed_token_length, "original_context":o[0], "compressed_context":c} + compression_ratios.append(fin_dict) + for c in compressed_context: + n_compressed_token += self.get_token_length(c, use_oai_tokenizer=True) + saving = (n_original_token - n_compressed_token) * 0.06 / 1000 + ratio = 1 if n_compressed_token == 0 else n_original_token / n_compressed_token + res = { + "compressed_prompt": "\n\n".join(compressed_context), + "compressed_prompt_list": compressed_context, + "compression_ratios":compression_ratios, + "compressed_prompt_list_2":compressed_prompt_list_2, + "entropy": entropy, + "origin_tokens": n_original_token, + "compressed_tokens": n_compressed_token, + "ratio": f"{ratio:.1f}x", + "rate": f"{1 / ratio * 100:.1f}%", + "saving": f", Saving ${saving:.1f} in GPT-4.", + "actions": actions, + "old_log_probs": old_log_probs, + "old_logits": old_logits, + } + if return_word_label: 
+ words = [] + labels = [] + for w_list, l_list in zip(word_list, word_label_list): + words.extend(w_list) + labels.extend(l_list) + + word_label_lines = word_sep.join( + [f"{word}{label_sep}{label}" for word, label in zip(words, labels)] + ) + res["fn_labeled_original_prompt"] = word_label_lines + return res + + def __chunk_context(self, origin_text, chunk_end_tokens): + # leave 2 token for CLS and SEP + max_len = self.max_seq_len - 2 + origin_list = [] + origin_tokens = self.tokenizer.tokenize(origin_text) + n = len(origin_tokens) + st = 0 + while st < n: + if st + max_len > n - 1: + chunk = self.tokenizer.convert_tokens_to_string(origin_tokens[st:n]) + origin_list.append(chunk) + break + else: + ed = st + max_len + for j in range(0, ed - st): + if origin_tokens[ed - j] in chunk_end_tokens: + ed = ed - j + break + chunk = self.tokenizer.convert_tokens_to_string( + origin_tokens[st : ed + 1] + ) + origin_list.append(chunk) + st = ed + 1 + return origin_list + + def __merge_token_to_word_for_actions( + self, tokens, token_actions, force_tokens, token_map, force_reserve_digit + ): + """ + Merge tokens to words while tracking actions for RL training. 
+ """ + words = [] + word_actions = [] + word_actions_no_force = [] + + for token, action in zip(tokens, token_actions): + if token in self.special_tokens: + continue + # add a new word + elif is_begin_of_new_word(token, self.model_name, force_tokens, token_map): + pure_token = get_pure_token(token, self.model_name) + action_no_force = action + if pure_token in force_tokens or pure_token in set(token_map.values()): + action = 1 + token = replace_added_token(token, token_map) + words.append(token) + word_actions.append( + [ + 1 + if force_reserve_digit and bool(re.search(r"\d", token)) + else action + ] + ) + word_actions_no_force.append([action_no_force]) + # concatenate with previous token + else: + pure_token = get_pure_token(token, self.model_name) + words[-1] += pure_token + word_actions[-1].append( + 1 + if force_reserve_digit and bool(re.search(r"\d", token)) + else action + ) + word_actions_no_force[-1].append(action) + + return words, word_actions, word_actions_no_force + + def __token_action_to_word_action(self, token_actions, convert_mode="all"): + """ + Convert token actions to word actions for RL training. + """ + if convert_mode == "all": + word_action = [all(p) for p in token_actions] + elif convert_mode == "any": + word_action = [any(p) for p in token_actions] + else: + raise NotImplementedError() + + return word_action + + def __compress( + self, + context_list: list, + reduce_rate: float = 0.5, + token_to_word: str = "mean", + force_tokens: List[str] = [], + token_map: dict = {}, + force_reserve_digit: bool = False, + drop_consecutive: bool = False, + ): + """ + Override the compression method to track RL-specific information. 
+ """ + def split_string_to_words(input_string): + pattern = r'\b\w+\b|[<>=/!@#$%^&*()?":{}|\\`~;_+-]' + result = re.findall(pattern, input_string) + return result + + if reduce_rate <= 0: + words, word_labels = [], [] + for i in range(len(context_list)): + chunk_list = context_list[i] + chunk_words = [] + chunk_word_labels = [] + for j in range(len(chunk_list)): + # replace to original token + for ori_token, new_token in token_map.items(): + chunk_list[j] = chunk_list[j].replace(new_token, ori_token) + ws = split_string_to_words(chunk_list[j]) + chunk_words.extend(ws) + chunk_word_labels.extend([1 for _ in range(len(ws))]) + context_list[i] = "".join(chunk_list) + words.append(chunk_words) + word_labels.append(chunk_word_labels) + return context_list, words, word_labels + + chunk_list = [] + for chunks in context_list: + for c in chunks: + chunk_list.append(c) + + dataset = TokenClfDataset( + chunk_list, tokenizer=self.tokenizer, max_len=self.max_seq_len + ) + dataloader = DataLoader( + dataset, batch_size=self.max_batch_size, shuffle=False, drop_last=False + ) + + compressed_chunk_list = [] + word_list = [] + word_label_list = [] + actions, old_log_probs, old_logits, entropy = [], [], None, [] + compressed_prompt_list_2 = [] + + for batch in dataloader: + ids = batch["ids"].type(torch.long).to(self.model.device) + mask = batch["mask"].type(torch.long).to(self.model.device) == 1 + + outputs = self.model(input_ids=ids, attention_mask=mask) + + loss, logits = outputs.loss, outputs.logits + probs = F.softmax(logits, dim=-1) + + old_probs = probs + for logits_, old_probs_ in zip(logits, old_probs): + dist = Categorical(probs=old_probs_) + action = dist.sample() + log_prob = dist.log_prob(action) + actions.append(action) + old_log_probs.append(log_prob) + entropy.append(dist.entropy()) + + # Create compressed prompt list from actions + for j in range(ids.shape[0]): + chunk_actions = actions[j] + chunk_ids = ids[j] + chunk_mask = mask[j] + active_actions = 
torch.masked_select(chunk_actions, chunk_mask) + active_ids = torch.masked_select(chunk_ids, chunk_mask) + + tokens = self.tokenizer.convert_ids_to_tokens( + active_ids.squeeze().tolist() + ) + token_actions = [action for action in active_actions.cpu().detach().numpy()] + + words, valid_token_actions, _ = self.__merge_token_to_word_for_actions( + tokens=tokens, + token_actions=token_actions, + force_tokens=force_tokens, + token_map=token_map, + force_reserve_digit=force_reserve_digit, + ) + word_actions = self.__token_action_to_word_action( + valid_token_actions, convert_mode="all" + ) + + if drop_consecutive: + is_token_between = False + prev = None + for i, (word, word_action) in enumerate(zip(words, word_actions)): + if word in force_tokens: + if is_token_between: + is_token_between = False + elif not is_token_between and word == prev: + word_actions[i] = 0 + prev = word + else: + is_token_between |= word_action + + keep_words = [] + for word, word_action in zip(words, word_actions): + if word_action: + keep_words.append(word) + keep_str = self.tokenizer.convert_tokens_to_string(keep_words) + compressed_prompt_list_2.append(keep_str) + + compressed_context_list = [] + original_word_list = [] + original_word_label_list = [] + prev_idx = 0 + for chunk_list in context_list: + n_chunk = len(chunk_list) + compressed_context_list.append( + "".join(compressed_prompt_list_2[prev_idx : prev_idx + n_chunk]) + ) + original_word_list.append([]) + original_word_label_list.append([]) + prev_idx = prev_idx + n_chunk + + return compressed_context_list, original_word_list, original_word_label_list, actions, old_log_probs, old_logits, compressed_prompt_list_2, entropy + + def get_rl_info(self): + """ + Get reinforcement learning information for training. + + Returns: + dict: Dictionary containing RL-specific information including actions, log probabilities, and entropy. 
+ """ + return { + "actions": self.actions, + "old_log_probs": self.old_log_probs, + "old_logits": self.old_logits, + "entropy": self.entropy, + "compressed_prompt_list_2": self.compressed_prompt_list_2, + "compression_ratios": self.compression_ratios + } + + def clear_rl_info(self): + """ + Clear stored reinforcement learning information. + """ + self.actions = [] + self.old_log_probs = [] + self.old_logits = None + self.entropy = [] + self.compressed_prompt_list_2 = [] + self.compression_ratios = [] + \ No newline at end of file From 76b4b13704012b45dbf6fc3adb340055d9faa570 Mon Sep 17 00:00:00 2001 From: Shivam Shandilya Date: Tue, 1 Jul 2025 19:51:14 +0000 Subject: [PATCH 2/2] added dependencies for TACO-RL --- experiments/taco-rl/README.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/experiments/taco-rl/README.md b/experiments/taco-rl/README.md index ca171f6..6266ef5 100644 --- a/experiments/taco-rl/README.md +++ b/experiments/taco-rl/README.md @@ -15,6 +15,22 @@ experiments/taco-rl/ └── logs/ # Training logs (created during training) ``` +## Requirements + +### Core Dependencies +- llmlingua + +### Additional Dependencies +Install the following packages for TACO-RL experiments: + +```bash +pip install openai evaluate csv_logger hydra-core rouge_score +``` + +### API Access +- OpenAI API access (for GPT-3.5 teacher model) +- Azure OpenAI API access (if using Azure services) + ## Quick Start ### 1. Prepare Your Data