Skip to content

Commit e0e9d99

Browse files
authored
Merge pull request #230 from liyucheng09/slingua
add securitylingua
2 parents e4e172a + ba1463a commit e0e9d99

File tree

9 files changed

+826
-6
lines changed

9 files changed

+826
-6
lines changed

README.md

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,11 @@ LLMLingua-2, a small-size yet powerful prompt compression method trained via dat
4848
- [LLMLingua-2: Data Distillation for Efficient and Faithful Task-Agnostic Prompt Compression](https://aclanthology.org/2024.findings-acl.57/) (ACL 2024 Findings)<br>
4949
_Zhuoshi Pan, Qianhui Wu, Huiqiang Jiang, Menglin Xia, Xufang Luo, Jue Zhang, Qingwei Lin, Victor Ruhle, Yuqing Yang, Chin-Yew Lin, H. Vicky Zhao, Lili Qiu, Dongmei Zhang_
5050

51+
SecurityLingua is a safety guardrail model that uses security-aware prompt compression to reveal the malicious intention behind jailbreak attacks, enabling LLMs to detect attacks and generate safe responses. Thanks to its highly efficient prompt compression, the defense adds negligible overhead and incurs 100x lower token costs compared to state-of-the-art LLM guardrail approaches.
52+
53+
- [SecurityLingua: Efficient Defense of LLM Jailbreak Attacks via Security-Aware Prompt Compression](https://openreview.net/forum?id=tybbSo6wba) (CoLM 2025)<br>
54+
_Yucheng Li, Surin Ahn, Huiqiang Jiang, Amir H. Abdi, Yuqing Yang and Lili Qiu_
55+
5156
## 🎥 Overview
5257

5358
![Background](./images/LLMLingua_motivation.png)
@@ -133,6 +138,16 @@ If you find this repo helpful, please cite the following papers:
133138
}
134139
```
135140

141+
```bibtex
142+
@inproceedings{li2025securitylingua,
143+
title={{S}ecurity{L}ingua: Efficient Defense of {LLM} Jailbreak Attacks via Security-Aware Prompt Compression},
144+
author={Yucheng Li and Surin Ahn and Huiqiang Jiang and Amir H. Abdi and Yuqing Yang and Lili Qiu},
145+
booktitle={Second Conference on Language Modeling},
146+
year={2025},
147+
url={https://openreview.net/forum?id=tybbSo6wba}
148+
}
149+
```
150+
136151
## 🎯 Quick Start
137152

138153
#### 1. **Installing LLMLingua:**
@@ -205,6 +220,20 @@ llm_lingua = PromptCompressor(
205220
)
206221
```
207222

223+
To try **SecurityLingua** in your scenarios, you can use
224+
225+
```python
226+
from llmlingua import PromptCompressor
227+
228+
securitylingua = PromptCompressor(
229+
model_name="SecurityLingua/securitylingua-xlm-s2s",
230+
use_slingua=True
231+
)
232+
intention = securitylingua.compress_prompt(malicious_prompt)
233+
```
234+
235+
For more details about SecurityLingua, please refer to [securitylingua readme](./experiments/securitylingua/readme.md).
236+
208237
#### 3. **Advanced usage - Structured Prompt Compression:**
209238

210239
Split text into sections, decide on whether to compress and its rate. Use `<llmlingua></llmlingua>` tags for context segmentation, with optional rate and compress parameters.
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
conda create -n llmlingua python=3.10 -y && conda activate llmlingua
2+
pip install -e .
3+
pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu121
4+
pip install accelerate wandb
5+
pip install openai==0.28
6+
7+
pip install spacy
8+
python -m spacy download en_core_web_sm
9+
pip install scikit-learn
10+
pip install tensorboard
11+
pip install datasets hf_transfer
12+
13+
unset WANDB_RUN_ID WANDB_RUN_GROUP WANDB_PROJECT WANDB_NOTES WANDB_NAME
Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
# Copyright (c) 2023 Microsoft
2+
# Licensed under The MIT License [see LICENSE for details]
3+
4+
import argparse
5+
from collections import defaultdict
6+
from typing import Dict, List, Tuple, DefaultDict
7+
import numpy as np
8+
import torch
9+
10+
def parse_arguments() -> argparse.Namespace:
11+
"""Parse command line arguments"""
12+
parser = argparse.ArgumentParser(description="Filter compressed prompts based on metrics.")
13+
parser.add_argument(
14+
"--load_path",
15+
help="path to load data",
16+
default="../../../results/meetingbank/gpt-4-32k_comp/annotation_cs512_meetingbank_train_formated.pt",
17+
)
18+
parser.add_argument(
19+
"--save_path",
20+
help="path to save filtered data",
21+
default="../../../results/meetingbank/gpt-4-32k_comp/annotation_kept_cs512_meetingbank_train_formated.pt",
22+
)
23+
parser.add_argument(
24+
"--percentile",
25+
help="percentile threshold for filtering",
26+
default=90,
27+
type=int
28+
)
29+
return parser.parse_args()
30+
31+
def filter_by_metric(
32+
data: DefaultDict[str, List],
33+
metric_name: str,
34+
percentile: float
35+
) -> Tuple[DefaultDict[str, List], DefaultDict[str, List]]:
36+
"""
37+
Filter data based on a specific metric and percentile threshold
38+
39+
Args:
40+
data: Dictionary containing all data points and their metrics
41+
metric_name: Name of the metric to filter by
42+
percentile: Percentile threshold for filtering
43+
44+
Returns:
45+
Tuple of (kept_data, filtered_data)
46+
"""
47+
metric_list = data[metric_name]
48+
threshold = np.percentile(metric_list, percentile)
49+
50+
kept = defaultdict(list)
51+
filtered = defaultdict(list)
52+
53+
# List of all metrics to transfer
54+
metrics = [
55+
"labels", "origin", "comp", "retrieval", "comp_rate",
56+
"variation_rate", "hitting_rate", "matching_rate", "alignment_gap"
57+
]
58+
59+
for values in zip(*(data[metric] for metric in metrics)):
60+
# Create a dictionary of current values
61+
current = dict(zip(metrics, values))
62+
63+
# Determine which container to use based on the metric threshold
64+
target = filtered if current[metric_name] >= threshold else kept
65+
66+
# Add values to appropriate container
67+
for metric, value in current.items():
68+
target[metric].append(value)
69+
70+
return kept, filtered
71+
72+
def main():
73+
"""Main function to run the filtering process"""
74+
args = parse_arguments()
75+
76+
# Load data
77+
res_pt = torch.load(args.load_path, weights_only=False)
78+
print(f"Initial sample count: {len(res_pt['variation_rate'])}")
79+
80+
# First filtering stage: variation rate
81+
kept, filtered = filter_by_metric(
82+
data=res_pt,
83+
metric_name="variation_rate",
84+
percentile=args.percentile
85+
)
86+
87+
# Second filtering stage: alignment gap
88+
final_kept, additional_filtered = filter_by_metric(
89+
data=kept,
90+
metric_name="alignment_gap",
91+
percentile=args.percentile
92+
)
93+
94+
# Save filtered results
95+
torch.save(final_kept, args.save_path)
96+
97+
# Print statistics
98+
print(f"Samples after first filter: {len(kept['variation_rate'])}")
99+
print(f"Final kept samples: {len(final_kept['variation_rate'])}")
100+
101+
if __name__ == "__main__":
102+
main()

0 commit comments

Comments
 (0)