-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfilter_triples.py
More file actions
87 lines (60 loc) · 3.55 KB
/
filter_triples.py
File metadata and controls
87 lines (60 loc) · 3.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import csv
import pandas as pd
import argparse
def _is_qid(value):
    """Return True if *value* is "Q" followed by one or more ASCII digits."""
    suffix = value[1:]
    # str.isdigit() on its own also accepts non-ASCII digit characters such
    # as "²" or "٣"; requiring ASCII keeps only well-formed identifiers.
    return value.startswith("Q") and suffix.isascii() and suffix.isdigit()


def filter_Qids(input_file, output_file):
    """Copy the three-column rows whose object is a Q-identifier.

    Reads ``input_file`` as tab-separated values and writes to
    ``output_file`` (also tab-separated) every row that has exactly three
    fields and whose third field is ``Q`` followed by ASCII digits.
    Every other row is dropped.

    Args:
        input_file (str): Path of the tab-separated file to read (UTF-8).
        output_file (str): Path of the file to write (UTF-8). It is
            overwritten if it already exists.
    """
    # newline="" on both handles lets the csv module do its own line-ending
    # handling, as the csv documentation recommends.
    with open(input_file, "r", encoding="utf-8", newline="") as infile, \
            open(output_file, "w", encoding="utf-8", newline="") as outfile:
        reader = csv.reader(infile, delimiter='\t')  # Adjust delimiter if necessary
        writer = csv.writer(outfile, delimiter='\t')
        for row in reader:
            # Rows that are not exactly (subject, property, object) are skipped.
            if len(row) != 3:
                continue
            if _is_qid(row[2]):  # Keep only Qxxx objects
                writer.writerow(row)
def filter_and_range_relations(input_file, output_file, relations_to_keep, start=0, end=200):
    """
    Combines filtering by relations and selecting a range of rows for each relation.

    The input is read as a header-less, tab-separated file with the columns
    (subject, relation, object). Rows whose relation is not in
    ``relations_to_keep`` are dropped. For every remaining relation, the rows
    at positions ``start`` (inclusive) to ``end`` (exclusive) are kept, in
    their original order; a relation with ``start`` rows or fewer contributes
    nothing. The result is written header-less and tab-separated, with the
    relations in sorted order.

    Args:
        input_file (str): Path to the input CSV file.
        output_file (str): Path to the output CSV file.
        relations_to_keep (set): Set of relations to filter by.
        start (int): Starting index for the range (default: 0).
        end (int): Ending index for the range (default: 200).
    """
    columns = ['subject', 'relation', 'object']

    # Load the CSV file into a DataFrame. Every field is read as a verbatim
    # string so values such as "NA" or "1e5" are written back unchanged
    # instead of being turned into NaN / floats.
    try:
        df = pd.read_csv(input_file, sep='\t', header=None, names=columns,
                         dtype=str, keep_default_na=False)
    except pd.errors.EmptyDataError:
        # An empty input simply produces an empty output.
        df = pd.DataFrame(columns=columns)

    # Step 1: Filter the DataFrame by the specified relations
    filtered_df = df[df['relation'].isin(relations_to_keep)]

    # Step 2: Slice each relation's rows. Iterating over the groups (instead
    # of groupby.apply) keeps the 'relation' column in every slice regardless
    # of how the installed pandas treats grouping columns inside apply().
    pieces = [
        group.iloc[start:end]
        for _, group in filtered_df.groupby('relation')
        if len(group) > start  # Skip groups that are too small
    ]

    if pieces:
        ranged_df = pd.concat(pieces, ignore_index=True)
    else:
        # pd.concat refuses an empty list; fall back to an empty frame.
        ranged_df = filtered_df.iloc[0:0]

    # Save the final DataFrame to a new CSV file
    ranged_df.to_csv(output_file, sep='\t', index=False, header=False)
if __name__ == "__main__":
    # Imported here so the script block carries its own dependency.
    import os

    # Command-line argument parsing
    parser = argparse.ArgumentParser(description="Triples filtering")
    parser.add_argument("file_path", type=str, help="Path to the non filtered csv file.")

    # Range for the number of triples (BEGIN and END)
    parser.add_argument("-s", "--start", type=int, default=0, help="Start of the range for the number of triples per propertie/relation (default: 0)")
    parser.add_argument("-e", "--end", type=int, default=200, help="End of the range for the number of triples per propertie/relation (default: 200)")

    # Wikidata property/relation codes, given as one comma-separated string.
    # An empty value means "no specific relations": every relation is kept.
    parser.add_argument("-c", "--codes", type=str, default="", help="Comma-separated list of wikidata properties/relation codes (e.g., P135,P136,P137)")

    # The final file produced after all filtering steps
    parser.add_argument("-f", "--file_end", type=str, default="wikidata_triples_final.csv" ,help="Path to the end csv file after filterings")

    args = parser.parse_args()

    # Derive the intermediate file name from the input path. Splitting off
    # the extension guarantees the result differs from the input even when
    # the input does not end in ".csv"; otherwise filter_Qids would open the
    # same file for reading and writing and truncate it.
    root, ext = os.path.splitext(args.file_path)
    filtered_filename = f"{root}_filtered{ext}"

    filter_Qids(args.file_path, filtered_filename)

    # Convert to a set: split on commas, trim surrounding whitespace and
    # drop empty entries (so "P135, P136," works as expected).
    code_set = {code.strip() for code in args.codes.split(',') if code.strip()}

    if code_set:
        print(f"Code set: {code_set}")
        filter_and_range_relations(filtered_filename,args.file_end,code_set,args.start,args.end)
    else:
        print("Did not specify relations")
        # Copy the Q-id-filtered triples unchanged. The file is header-less
        # and tab-separated, so it is read and written that way, with every
        # field kept as a verbatim string.
        try:
            df = pd.read_csv(filtered_filename, sep='\t', header=None,
                             dtype=str, keep_default_na=False)
        except pd.errors.EmptyDataError:
            # Nothing survived the Q-id filter: emit an empty final file.
            df = pd.DataFrame()
        df.to_csv(args.file_end, sep='\t', index=False, header=False)