From 8015712552f349f9bf26a8125a3e7893d1c153d2 Mon Sep 17 00:00:00 2001
From: Mayank Musaddi <32260560+mayankmusaddi@users.noreply.github.com>
Date: Thu, 15 Sep 2022 00:48:11 +0530
Subject: [PATCH 1/2] bugfix: mentions in wiki table getting missed as
 wiki_anchor_files are not in sorted order

Some mentions were getting missed because last_processed_id was not
being reset to -1 for each anchor file.
---
 REL/wikipedia_yago_freq.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/REL/wikipedia_yago_freq.py b/REL/wikipedia_yago_freq.py
index 8c95456..5844a47 100644
--- a/REL/wikipedia_yago_freq.py
+++ b/REL/wikipedia_yago_freq.py
@@ -236,9 +236,6 @@ def __wiki_counts(self):
 
         print("Calculating Wikipedia mention/entity occurrences")
 
-        last_processed_id = -1
-        exist_id_found = False
-
         wiki_anchor_files = os.listdir(
             os.path.join(self.base_url, self.wiki_version, "basic_data/anchor_files/")
         )
@@ -249,6 +246,9 @@ def __wiki_counts(self):
                 "basic_data/anchor_files/",
                 wiki_anchor,
             )
+
+            last_processed_id = -1
+            exist_id_found = False
 
             with open(wiki_file, "r", encoding="utf-8") as f:
                 for line in f:

From 4ebdefe4a9d128fc1aefbb582a76123268b08589 Mon Sep 17 00:00:00 2001
From: Mayank Musaddi <32260560+mayankmusaddi@users.noreply.github.com>
Date: Thu, 15 Sep 2022 01:29:04 +0530
Subject: [PATCH 2/2] bugfix: errors due to mismatch in doc_name

---
 REL/generate_train_test.py | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/REL/generate_train_test.py b/REL/generate_train_test.py
index 46ce8c5..f601e14 100644
--- a/REL/generate_train_test.py
+++ b/REL/generate_train_test.py
@@ -74,8 +74,17 @@ def process_wned(self, dataset):
 
         contents = {}
         exist_doc_names = []
+        cnt_replaced = 0
         for doc in root:
             doc_name = doc.attrib["docName"].replace("&amp;", "&")
+
+            # Replace characters to make up for the doc_name mismatch between the annotation XML and the RawText folder
+            doc_name = doc_name.replace("'", "_").replace("É", "Р").replace("í", "б").replace(
+                "_&_", "___").replace("ü", "Б").replace("â", "Г").replace("ö", "Ф").replace(
+                "é", "В").replace("ó", "в")
+            if doc_name.endswith("."):
+                doc_name = doc_name[:-1] + "_"
+
             if doc_name in exist_doc_names:
                 print(
                     "Duplicate document found, will be removed later in the process: {}".format(
@@ -87,13 +96,17 @@ def process_wned(self, dataset):
             doc_path = os.path.join(
                 self.wned_path, "{}/RawText/{}".format(dataset, doc_name)
             )
-            with open(doc_path, "r", encoding="utf-8") as cf:
-                doc_text = " ".join(cf.readlines())
-                cf.close()
+
+            try:
+                with open(doc_path, "r", encoding="utf-8") as cf:
+                    doc_text = " ".join(cf.readlines())
+            except FileNotFoundError:
+                print("Couldn't find: ", doc_path)
+                continue
+
             doc_text = doc_text.replace("&amp;", "&")
             split_text = re.split(r"{}".format(split), doc_text)
 
-            cnt_replaced = 0
             sentences = {}
             mentions_gt = {}
             total_gt = 0
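
For context on PATCH 1/2: os.listdir() gives no ordering guarantee, so page ids
are not monotonically increasing across anchor files, and a last_processed_id
carried over from the previous file can silently skip valid mentions. A minimal
sketch of the per-file reset pattern the patch applies; the anchor-file layout
(one tab-separated "page_id<TAB>mention" pair per line) and the count_mentions
helper are assumptions for illustration, not REL's actual format or API:

    import os

    def count_mentions(anchor_dir):
        # Hypothetical layout: "<page_id>\t<mention>" per line, with ids
        # increasing within a file but unordered across files, since
        # os.listdir() returns names in arbitrary order.
        counts = {}
        for name in os.listdir(anchor_dir):
            # The fix from PATCH 1/2: reset the cursor per file. A stale
            # last_processed_id from a file with larger ids would make the
            # skip check below drop valid mentions from the next file.
            last_processed_id = -1
            with open(os.path.join(anchor_dir, name), encoding="utf-8") as f:
                for line in f:
                    page_id, mention = line.rstrip("\n").split("\t", 1)
                    page_id = int(page_id)
                    if page_id <= last_processed_id:
                        continue  # already-processed page within this file
                    counts[mention] = counts.get(mention, 0) + 1
                    last_processed_id = page_id
        return counts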
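For PATCH 2/2, the same character fix-ups could be kept in one table rather
than a chain of .replace() calls. A sketch under that assumption; the
normalize_doc_name helper and DOC_NAME_FIXES table are hypothetical, with
the mapping itself copied verbatim from the patch:

    # Hypothetical table-driven equivalent of the chained .replace() calls:
    # each pair maps how the annotation XML spells a character to how the
    # RawText file names on disk spell it.
    DOC_NAME_FIXES = [
        ("'", "_"), ("É", "Р"), ("í", "б"), ("_&_", "___"),
        ("ü", "Б"), ("â", "Г"), ("ö", "Ф"), ("é", "В"), ("ó", "в"),
    ]

    def normalize_doc_name(doc_name: str) -> str:
        for xml_form, file_form in DOC_NAME_FIXES:
            doc_name = doc_name.replace(xml_form, file_form)
        if doc_name.endswith("."):
            # Per the patch, on-disk names end in "_" where the XML has "."
            doc_name = doc_name[:-1] + "_"
        return doc_name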