-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathNLI_filter.py
More file actions
65 lines (44 loc) · 2.1 KB
/
NLI_filter.py
File metadata and controls
65 lines (44 loc) · 2.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import jsonlines
import argparse
def clean_sentence(text):
    """Normalize whitespace in *text*.

    Replaces embedded newlines with single spaces and trims leading/
    trailing whitespace. Non-string values are passed through untouched.
    """
    if not isinstance(text, str):
        return text
    flattened = text.replace("\n", " ")
    return flattened.strip()
def process_filtering_cleaning(input_file, output_file, threshold=0.95):
    """Filter NLI-scored entries by entailment probability, then clean and deduplicate.

    Reads a JSONL file of entries, keeps those whose entailment probability
    (index 1 of "nli_probabilities") is >= ``threshold``, normalizes
    whitespace in their sentences, removes duplicate
    (sentence, subject, relation, object) tuples, and writes the survivors
    to ``output_file``.

    Args:
        input_file: Path to the input JSONL file with NLI probabilities.
        output_file: Path for the filtered, cleaned JSONL output.
        threshold: Minimum entailment probability required to keep an entry.
    """
    # Read all data into memory (no temporary files).
    with jsonlines.open(input_file, mode='r') as reader:
        entries = list(reader)

    # Step 1: Filter by entailment.
    # BUGFIX: entry["sentence"].strip() raised KeyError on entries missing
    # the key and AttributeError on non-string values, even though the later
    # steps handle those cases defensively — skip such entries instead.
    filtered_entries = []
    for entry in entries:
        sentence = entry.get("sentence")
        if not isinstance(sentence, str) or not sentence.strip():
            continue  # Skip if empty or not a usable string
        entailment_prob = entry["nli_probabilities"][1]  # Index 1 corresponds to entailment
        if entailment_prob >= threshold:
            filtered_entries.append(entry)

    # Step 2: Clean sentences (Step 1 guarantees the key exists and is a str).
    for entry in filtered_entries:
        entry["sentence"] = clean_sentence(entry["sentence"])

    # Step 3: Remove duplicates, keyed on the stripped fact tuple; the first
    # occurrence wins, preserving input order.
    seen = set()
    unique_entries = []
    for entry in filtered_entries:
        entry_key = (entry.get("sentence", "").strip(),
                     entry.get("subject", "").strip(),
                     entry.get("relation", "").strip(),
                     entry.get("object", "").strip())
        if entry_key not in seen:
            seen.add(entry_key)
            unique_entries.append(entry)

    # Write final output
    with jsonlines.open(output_file, mode='w') as writer:
        writer.write_all(unique_entries)
    print(f"✅ Entailment filtering and cleaning complete! {len(unique_entries)} entries saved to {output_file}")
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Entailment filtering and cleaning")
parser.add_argument("input_file", type=str, help="Path to the input jsonl data file with NLI.")
parser.add_argument("-o", "--output_file", type=str, default="dataset_NLI_Final.jsonl", help="Final dataset file path")
args = parser.parse_args()
process_filtering_cleaning(args.input_file, args.output_file)