-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfile_functions.py
More file actions
186 lines (123 loc) · 7.9 KB
/
file_functions.py
File metadata and controls
186 lines (123 loc) · 7.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
from useful_functions import tokenize
from files_path_variables import all_cleaned_manually_text_files, all_cleaned_corr_text_files
import json, pdb, re
from pathlib import Path
def modify_neighborhoods(document, neighborhoods_dict, start_indexes_dict, end_indexes_dict, text_data, processed_documents, number_of_saved_neighborhoods, modified=False):
    """Splice each extracted neighborhood back into the tokenized text of *document*.

    Parameters:
        document: key identifying the document in the dicts below.
        neighborhoods_dict: {document: [neighborhood_string, ...]} extracted neighborhoods.
        start_indexes_dict / end_indexes_dict: parallel dicts giving, per neighborhood,
            the token slice [start, end) inside text_data[document].
        text_data: {document: [token, ...]} tokenized text per document.
        processed_documents: list used only for progress reporting (its length is printed).
        number_of_saved_neighborhoods: running counter of replaced neighborhoods.
        modified: caller's "file needs saving" flag; becomes True if any neighborhood changed.

    Returns:
        (text_data, modified, number_of_saved_neighborhoods)
    """
    #Go through all the neighborhoods in the document
    for n, neighborhood in enumerate(neighborhoods_dict[document]):
        #Get the [start, end) token slice of the neighborhood in the original text
        start_index = start_indexes_dict[document][n]
        end_index = end_indexes_dict[document][n]
        original_neighborhood = text_data[document][start_index:end_index]
        #Tokenize the extracted neighborhood so it can be spliced into the tokenized text
        extracted_neighborhood = tokenize(neighborhood)
        #Only splice when the neighborhood actually changed
        if original_neighborhood != extracted_neighborhood:
            #Save this file
            modified = True
            print(f"\nReplacing neighborhood number {n} of document {document}. Number of processed documents: {len(processed_documents)}")
            number_of_saved_neighborhoods += 1
            print(f"Total number of changed neighborhoods: {number_of_saved_neighborhoods}")
            #BUGFIX: the comparison above uses the half-open slice [start_index:end_index],
            #so the kept tail must start at end_index. The original used end_index+1,
            #silently dropping one token on every replacement.
            #NOTE(review): if the tokenized replacement differs in length from the original
            #span, later start/end indexes for this document become stale — confirm the
            #index dicts are built to tolerate this.
            text_data[document] = text_data[document][:start_index] + extracted_neighborhood + text_data[document][end_index:]
    return text_data, modified, number_of_saved_neighborhoods
def insert_neighborhoods_into_files(neighborhoods_dict, start_indexes_dict, end_indexes_dict, databases=[all_cleaned_manually_text_files, all_cleaned_corr_text_files], single_document = None):
    """Write modified neighborhoods back into the JSON database files.

    Iterates the database file lists in order (cleaner files first), loads each
    JSON file, splices the extracted neighborhoods into the matching documents
    via modify_neighborhoods(), and rewrites the file if anything changed.

    Parameters:
        neighborhoods_dict / start_indexes_dict / end_indexes_dict: see modify_neighborhoods().
        databases: lists of Path-like JSON files to process, in priority order.
        single_document: if given, only that document is updated and processing
            stops after the file that contains it.

    Returns:
        the text_data dict of the last file processed (or of the file containing
        single_document when that mode is used).
    """
    #Documents that have neighborhoods available
    neighborhoods_documents_list = list(neighborhoods_dict.keys())
    #Track processed documents/files and how many neighborhoods were replaced
    processed_documents = []
    #Maybe define it as set in case a document is repeated but check first to see if there are repeated cases
    processed_files = []
    number_of_saved_neighborhoods = 0
    #Per-file flag controlling whether the file gets rewritten
    modified = False
    #We want to insert the neighborhoods in the files that are cleaner first so the databases list uses first the files that are cleaner
    for d in databases:
        for file in d:
            #Open a file and extract its content
            with open(file) as json_file:
                text_data = json.loads(json_file.read())
            single_document_found = False
            #If only one document to save was provided
            if single_document:
                #Only modify it when it exists both in this file and in the neighborhoods list
                if single_document in text_data and single_document in neighborhoods_documents_list:
                    single_document_found = True
                    print(f"\nModifying document {single_document} in file {file}.\n")
                    text_data, modified, number_of_saved_neighborhoods = modify_neighborhoods(single_document, neighborhoods_dict, start_indexes_dict, end_indexes_dict, text_data, processed_documents, number_of_saved_neighborhoods)
            #Else process all documents
            else:
                #Loop the documents in the file
                for document in text_data.keys():
                    #Check the document hasn't already been processed and that it has neighborhoods to check for
                    if document not in processed_documents and document in neighborhoods_documents_list:
                        text_data, modified, number_of_saved_neighborhoods = modify_neighborhoods(document, neighborhoods_dict, start_indexes_dict, end_indexes_dict, text_data, processed_documents, number_of_saved_neighborhoods, modified)
                        #Add the document as processed
                        processed_documents.append(document)
            #Save file if any of the documents was changed
            if modified:
                with open(file, "w") as json_file:
                    json.dump(text_data, json_file)
                print(f"\nFile {file} saved successfully!")
            #BUGFIX: the original returned after the FIRST file whenever single_document
            #was set, even when that file did not contain the document. Stop only once
            #the requested document has actually been found and processed.
            if single_document and single_document_found:
                print("\nProcessing finished.")
                return text_data
            #Reset the per-file modification flag
            modified = False
            processed_files.append(file.name)
    print("Finished processing all files!")
    return text_data
# Create a function to check and modify strings
def check_and_modify_string(input_string, term):
    """Surround every case-insensitive occurrence of *term* in *input_string*
    with spaces (so a term glued inside a longer word is split off), then
    collapse runs of whitespace into single spaces.

    Note that matches are replaced with *term* as given, so the occurrence's
    original casing is normalized to the casing of *term*.

    BUGFIX: the original pattern wrapped the term in \\b word boundaries, which
    can never match a term embedded inside a longer word (there is no word
    boundary between two letters), so separate_term()'s documented purpose —
    turning "grandenaturaleza" into "grande naturaleza" — was unachievable.
    The boundaries are removed so embedded occurrences are found too.
    """
    # Escape the term so regex metacharacters in it are matched literally
    pattern = re.compile(re.escape(term), re.IGNORECASE)
    # Pad each match with spaces so it becomes a separate token
    modified_str = pattern.sub(' ' + term + ' ', input_string)
    # Collapse the extra spaces introduced by the substitution
    return ' '.join(modified_str.split())
#Separates the term inside each word if it exists
#For example if the term to look for is naturaleza and it finds the word "grandenaturaleza" in one of the words
#then it returns "grande naturaleza" as two separate terms and saves the file with the separated terms
def separate_term(term, databases=[all_cleaned_manually_text_files, all_cleaned_corr_text_files]):
    """Split *term* out of any word that contains it, across all JSON database files.

    Each file is loaded, every list-valued document has its items rewritten via
    check_and_modify_string(), and the file is saved back only if something changed.
    A summary of processed/modified files and documents is printed at the end.

    Parameters:
        term: the term to separate wherever it appears inside other words.
        databases: lists of Path-like JSON files to process, cleaner files first.
    """
    #Track which documents have already been processed (first occurrence wins)
    processed_documents = []
    #Track which files and documents were actually modified, for the summary
    modified_files = []
    modified_documents = []
    #Maybe define it as set in case a document is repeated but check first to see if there are repeated cases
    processed_files = []
    #We want to insert the neighborhoods in the files that are cleaner first so the databases list uses first the files that are cleaner
    for d in databases:
        for file in d:
            modified = False # Flag to track if any modification was made in this file
            print(f"Processing file {file.name} in directory {file.parent.name}.")
            #Open a file and extract its content
            with open(file) as json_file:
                text_data = json.loads(json_file.read())
            #Loop the documents in the file
            for document, text in text_data.items():
                #Skip documents already handled in a cleaner file
                if document not in processed_documents:
                    #NOTE(review): non-list values are silently skipped — confirm all documents store lists
                    if isinstance(text, list):
                        # Rewrite each item in place when the term is found
                        for i, item in enumerate(text):
                            modified_item = check_and_modify_string(item, term)
                            if modified_item != item:
                                text[i] = modified_item
                                modified = True
                                #BUGFIX: the original appended the document once per
                                #modified item, inflating the summary count of
                                #modified documents with duplicates.
                                if document not in modified_documents:
                                    modified_documents.append(document)
                    processed_documents.append(document)
            # Save the modified dictionary back to the file if any modification was made
            if modified:
                with open(file, 'w') as json_file:
                    json.dump(text_data, json_file)
                modified_files.append(file)
            processed_files.append(file)
    # Print the summary and the list of modified files
    print(f"\n{len(processed_documents)} documents were processed from {len(processed_files)} JSON files. This is the list of {len(modified_files)} modified files where {len(modified_documents)} modified documents where the term {term} was found and separated:\n")
    for fil in modified_files:
        print(fil)