RelEx-PT/filter_triples.py at main · TomasCCPinto/RelEx-PT · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import csv
import pandas as pd
import argparse


def filter_Qids(input_file,output_file):
    """
    Copy the triples whose object is a Q-identifier from one TSV file to another.

    The input is parsed as tab-separated rows. A row is written to the output
    only if it has exactly three fields and its third field is the letter
    ``Q`` followed by at least one digit; every other row is dropped.

    Args:
        input_file (str): Path to the tab-separated file to read (UTF-8).
        output_file (str): Path to the tab-separated file to write (UTF-8).
    """
    def _is_qid(value):
        # A bare "Q" is rejected because "".isdigit() is False.
        return value.startswith("Q") and value[1:].isdigit()

    with open(input_file, "r", encoding="utf-8") as source:
        with open(output_file, "w", encoding="utf-8", newline="") as target:
            triples = csv.reader(source, delimiter='\t')
            sink = csv.writer(target, delimiter='\t')
            # Rows are streamed lazily, so they are written in input order.
            sink.writerows(
                triple for triple in triples
                if len(triple) == 3 and _is_qid(triple[2])
            )


def filter_and_range_relations(input_file, output_file, relations_to_keep, start=0, end=200):
    """
    Keep only the requested relations and a positional slice of rows for each.

    The input is read as a headerless, tab-separated file with three columns
    (subject, relation, object). Rows whose relation is not in
    ``relations_to_keep`` are dropped. The remaining rows are grouped by
    relation; every group holding more than ``start`` rows contributes the
    rows at positions ``start`` (inclusive) to ``end`` (exclusive), and
    smaller groups contribute nothing. Groups are written in sorted relation
    order with rows in input order, tab-separated, without header or index.

    Args:
        input_file (str): Path to the input tab-separated file.
        output_file (str): Path to the output tab-separated file.
        relations_to_keep (set): Set of relations to filter by.
        start (int): First row position kept within each relation (default: 0).
        end (int): Row position where each relation's slice stops, exclusive
            (default: 200).
    """
    # Load the triples into a DataFrame
    df = pd.read_csv(input_file, sep='\t', header=None, names=['subject', 'relation', 'object'])

    # Step 1: keep only the rows of the requested relations
    filtered_df = df[df['relation'].isin(relations_to_keep)]

    # Step 2: slice every relation group.
    # The groups are iterated explicitly instead of going through
    # groupby().apply(): whether apply() hands the grouping column to the
    # callback depends on the pandas release, and losing it would silently
    # drop the 'relation' column from the output.
    pieces = []
    for _, group in filtered_df.groupby('relation'):
        if len(group) <= start:
            continue  # group is too small to reach the requested range
        piece = group.iloc[start:end]
        if not piece.empty:
            pieces.append(piece)

    if pieces:
        ranged_df = pd.concat(pieces, ignore_index=True)
    else:
        # Nothing selected: keep the column layout but no rows
        ranged_df = filtered_df.iloc[0:0]

    # Save the final DataFrame as a headerless tab-separated file
    ranged_df.to_csv(output_file, sep='\t', index=False, header=False)


if __name__ == "__main__":
    # Script entry point: keep the triples whose object is a Q-id, then either
    # restrict them to the requested relations (with a per-relation row range)
    # or pass all of them through to the final file.
    import os  # local import: only the script entry point needs it

    # Command-line argument parsing
    parser = argparse.ArgumentParser(description="Triples filtering")
    parser.add_argument("file_path", type=str, help="Path to the non filtered csv file.")

    # Range for the number of triples per relation (BEGIN and END).
    # NOTE: the range is only applied when --codes is given.
    parser.add_argument("-s", "--start", type=int, default=0, help="Start of the range for the number of triples per property/relation (default: 0)")
    parser.add_argument("-e", "--end", type=int, default=200, help="End of the range for the number of triples per property/relation (default: 200)")

    # Wikidata property/relation codes, given as a comma-separated string.
    # An empty value means no specific relations, so all of them are kept.
    parser.add_argument("-c", "--codes", type=str, default="", help="Comma-separated list of wikidata properties/relation codes (e.g., P135,P136,P137)")

    # The final file produced after the filtering steps
    parser.add_argument("-f", "--file_end", type=str, default="wikidata_triples_final.csv" ,help="Path to the end csv file after filterings")

    args = parser.parse_args()

    # Derive the intermediate file name from the extension only. A plain
    # str.replace('.csv', ...) returns the input path unchanged when the name
    # has no '.csv' in it, and the Q-id filter would then truncate its own
    # input; it would also rewrite a '.csv' found in a directory name.
    root, extension = os.path.splitext(args.file_path)
    filtered_filename = f"{root}_filtered{extension or '.csv'}"

    filter_Qids(args.file_path, filtered_filename)

    if args.codes:
        # Convert to a set (split by comma and drop empty strings)
        code_set = set(filter(None, args.codes.split(',')))
        print(f"Code set: {code_set}")
        filter_and_range_relations(filtered_filename, args.file_end, code_set, args.start, args.end)
    else:
        print("Did not specify relations")
        # The filtered file is tab-separated and has no header row, so it is
        # read and written that way to match the output of the --codes branch.
        df = pd.read_csv(filtered_filename, sep='\t', header=None)
        df.to_csv(args.file_end, sep='\t', index=False, header=False)