-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathNLI_filter.py
More file actions
65 lines (44 loc) · 2.1 KB
/
NLI_filter.py
File metadata and controls
65 lines (44 loc) · 2.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import jsonlines
import argparse
def clean_sentence(text):
    """Normalize whitespace in *text*.

    Replaces embedded newlines with single spaces and trims leading/
    trailing whitespace. Non-string values are passed through untouched.
    """
    if not isinstance(text, str):
        return text
    flattened = text.replace("\n", " ")
    return flattened.strip()
def process_filtering_cleaning(input_file, output_file, threshold=0.95):
    """Filter NLI-scored entries by entailment probability, then clean and deduplicate.

    Reads a JSONL file of entries, keeps those whose entailment probability
    (index 1 of "nli_probabilities") is >= ``threshold``, normalizes
    whitespace in their sentences, removes duplicate
    (sentence, subject, relation, object) tuples, and writes the survivors
    to ``output_file``.

    Args:
        input_file: Path to the input JSONL file with NLI probabilities.
        output_file: Path for the filtered, cleaned JSONL output.
        threshold: Minimum entailment probability required to keep an entry.
    """
    # Read all data into memory (no temporary files).
    with jsonlines.open(input_file, mode='r') as reader:
        entries = list(reader)

    # Step 1: Filter by entailment.
    # BUGFIX: entry["sentence"].strip() raised KeyError on entries missing
    # the key and AttributeError on non-string values, even though the later
    # steps handle those cases defensively — skip such entries instead.
    filtered_entries = []
    for entry in entries:
        sentence = entry.get("sentence")
        if not isinstance(sentence, str) or not sentence.strip():
            continue  # Skip if empty or not a usable string
        entailment_prob = entry["nli_probabilities"][1]  # Index 1 corresponds to entailment
        if entailment_prob >= threshold:
            filtered_entries.append(entry)

    # Step 2: Clean sentences (Step 1 guarantees the key exists and is a str).
    for entry in filtered_entries:
        entry["sentence"] = clean_sentence(entry["sentence"])

    # Step 3: Remove duplicates, keyed on the stripped fact tuple; the first
    # occurrence wins, preserving input order.
    seen = set()
    unique_entries = []
    for entry in filtered_entries:
        entry_key = (entry.get("sentence", "").strip(),
                     entry.get("subject", "").strip(),
                     entry.get("relation", "").strip(),
                     entry.get("object", "").strip())
        if entry_key not in seen:
            seen.add(entry_key)
            unique_entries.append(entry)

    # Write final output
    with jsonlines.open(output_file, mode='w') as writer:
        writer.write_all(unique_entries)
    print(f"✅ Entailment filtering and cleaning complete! {len(unique_entries)} entries saved to {output_file}")
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Entailment filtering and cleaning")
parser.add_argument("input_file", type=str, help="Path to the input jsonl data file with NLI.")
parser.add_argument("-o", "--output_file", type=str, default="dataset_NLI_Final.jsonl", help="Final dataset file path")
args = parser.parse_args()
process_filtering_cleaning(args.input_file, args.output_file)