From 8015712552f349f9bf26a8125a3e7893d1c153d2 Mon Sep 17 00:00:00 2001
From: Mayank Musaddi <32260560+mayankmusaddi@users.noreply.github.com>
Date: Thu, 15 Sep 2022 00:48:11 +0530
Subject: [PATCH 1/2] bugfix: mentions in wiki table getting missed as
 wiki_anchor_files are not in sorted order

Some mentions were getting missed because last_processed_id was not
being reset to -1 for each anchor file.
---
 REL/wikipedia_yago_freq.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/REL/wikipedia_yago_freq.py b/REL/wikipedia_yago_freq.py
index 8c95456..5844a47 100644
--- a/REL/wikipedia_yago_freq.py
+++ b/REL/wikipedia_yago_freq.py
@@ -236,9 +236,6 @@ def __wiki_counts(self):
 
         print("Calculating Wikipedia mention/entity occurrences")
 
-        last_processed_id = -1
-        exist_id_found = False
-
         wiki_anchor_files = os.listdir(
             os.path.join(self.base_url, self.wiki_version, "basic_data/anchor_files/")
         )
@@ -249,6 +246,9 @@ def __wiki_counts(self):
                 "basic_data/anchor_files/",
                 wiki_anchor,
             )
+
+            last_processed_id = -1
+            exist_id_found = False
 
             with open(wiki_file, "r", encoding="utf-8") as f:
                 for line in f:

From 4ebdefe4a9d128fc1aefbb582a76123268b08589 Mon Sep 17 00:00:00 2001
From: Mayank Musaddi <32260560+mayankmusaddi@users.noreply.github.com>
Date: Thu, 15 Sep 2022 01:29:04 +0530
Subject: [PATCH 2/2] bugfix: errors due to mismatch in doc_name

---
 REL/generate_train_test.py | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/REL/generate_train_test.py b/REL/generate_train_test.py
index 46ce8c5..f601e14 100644
--- a/REL/generate_train_test.py
+++ b/REL/generate_train_test.py
@@ -74,8 +74,17 @@ def process_wned(self, dataset):
 
         contents = {}
         exist_doc_names = []
+        cnt_replaced = 0
         for doc in root:
             doc_name = doc.attrib["docName"].replace("&amp;", "&")
+
+            # Replace characters to make up for the doc_name mismatch between the annotation XML and the RawText folder
+            doc_name = doc_name.replace("'", "_").replace("É", "Р").replace("í", "б").replace(
+                "_&_", "___").replace("ü", "Б").replace("â", "Г").replace("ö", "Ф").replace(
+                "é", "В").replace("ó", "в")
+            if doc_name.endswith("."):
+                doc_name = doc_name[:-1] + "_"
+
             if doc_name in exist_doc_names:
                 print(
                     "Duplicate document found, will be removed later in the process: {}".format(
@@ -87,13 +96,17 @@ def process_wned(self, dataset):
             doc_path = os.path.join(
                 self.wned_path, "{}/RawText/{}".format(dataset, doc_name)
             )
-            with open(doc_path, "r", encoding="utf-8") as cf:
-                doc_text = " ".join(cf.readlines())
-                cf.close()
+
+            try:
+                with open(doc_path, "r", encoding="utf-8") as cf:
+                    doc_text = " ".join(cf.readlines())
+            except FileNotFoundError:
+                print("Couldn't find: ", doc_path)
+                continue
+
             doc_text = doc_text.replace("&amp;", "&")
             split_text = re.split(r"{}".format(split), doc_text)
 
-            cnt_replaced = 0
             sentences = {}
             mentions_gt = {}
             total_gt = 0
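
For context on PATCH 1/2: os.listdir() gives no ordering guarantee, so page ids
are not monotonically increasing across anchor files, and a last_processed_id
carried over from the previous file can silently skip valid mentions. A minimal
sketch of the per-file reset pattern the patch applies; the anchor-file layout
(one tab-separated "page_id<TAB>mention" pair per line) and the count_mentions
helper are assumptions for illustration, not REL's actual format or API:

    import os

    def count_mentions(anchor_dir):
        # Hypothetical layout: "<page_id>\t<mention>" per line, with ids
        # increasing within a file but unordered across files, since
        # os.listdir() returns names in arbitrary order.
        counts = {}
        for name in os.listdir(anchor_dir):
            # The fix from PATCH 1/2: reset the cursor per file. A stale
            # last_processed_id from a file with larger ids would make the
            # skip check below drop valid mentions from the next file.
            last_processed_id = -1
            with open(os.path.join(anchor_dir, name), encoding="utf-8") as f:
                for line in f:
                    page_id, mention = line.rstrip("\n").split("\t", 1)
                    page_id = int(page_id)
                    if page_id <= last_processed_id:
                        continue  # already-processed page within this file
                    counts[mention] = counts.get(mention, 0) + 1
                    last_processed_id = page_id
        return counts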
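For PATCH 2/2, the same character fix-ups could be kept in one table rather
than a chain of .replace() calls. A sketch under that assumption; the
normalize_doc_name helper and DOC_NAME_FIXES table are hypothetical, with
the mapping itself copied verbatim from the patch:

    # Hypothetical table-driven equivalent of the chained .replace() calls:
    # each pair maps how the annotation XML spells a character to how the
    # RawText file names on disk spell it.
    DOC_NAME_FIXES = [
        ("'", "_"), ("É", "Р"), ("í", "б"), ("_&_", "___"),
        ("ü", "Б"), ("â", "Г"), ("ö", "Ф"), ("é", "В"), ("ó", "в"),
    ]

    def normalize_doc_name(doc_name: str) -> str:
        for xml_form, file_form in DOC_NAME_FIXES:
            doc_name = doc_name.replace(xml_form, file_form)
        if doc_name.endswith("."):
            # Per the patch, on-disk names end in "_" where the XML has "."
            doc_name = doc_name[:-1] + "_"
        return doc_name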