Binary file added cat.zip
Binary file not shown.
93 changes: 35 additions & 58 deletions cat/dataset.py
@@ -5,14 +5,29 @@

def loader(instance_path,
           label_path,
           label_multi_path,
           subset_labels_path,
           split_labels=False,
           mapping=None):
    # subset_labels = set(subset_labels)

    multi_labels = []
    with open(label_multi_path, 'r') as file:
        for line in file:
            # each line holds a list literal; literal_eval is safer than eval
            # (assumes "import ast" among the module imports above this hunk)
            current_array = ast.literal_eval(line.strip())
            multi_labels.append(current_array)

    # multi_labels = open(label_multi_path)
    # multi_labels = [x for x in multi_labels]

    labels = open(label_path)
    labels = [x.strip().lower().split() for x in labels]

    subset_labels = open(subset_labels_path)
    # subset_labels = open(subset_labels_path)
    subset_labels = []
    with open(subset_labels_path, 'r', encoding='utf-8') as file:
        for line in file:
            subset_labels.append(line.strip())
    subset_labels = {x.lower() for x in subset_labels}
    # print(subset_labels)

@@ -21,15 +36,15 @@ def loader(instance_path,
    # subset_labels = {'wine', 'place', 'food'}

    instances = []
    for line in open(instance_path):
    for line in open(instance_path, encoding='utf-8'):
        instances.append(line.strip().lower().split())

    if split_labels:
        labels = [[l.split("#")[0] for l in x] for x in labels]

    instances, gold = zip(*[(x, y[0]) for x, y in zip(instances, labels)
                            if len(y) == 1])
                            # y[0] in subset_labels])
                            # and y[0] in subset_labels])

    if mapping is not None:
        gold = [mapping.get(x, x) for x in gold]
@@ -38,63 +53,25 @@
    y = le.fit_transform(gold)
    label_set = le.classes_.tolist()

    return instances, y, label_set, subset_labels, gold


rest_14_test = partial(loader,
                       instance_path="data/restaurant_test_2014_tok.txt",  # noqa
                       label_path="data/labels_restaurant_test_2014.txt",  # noqa
                       subset_labels={"ambience",
                                      "service",
                                      "food"})


rest_14_train = partial(loader,
                        instance_path="data/restaurant_train_2014.txt",  # noqa
                        label_path="data/labels_restaurant_train_2014.txt",  # noqa
                        subset_labels={"ambience",
                                       "service",
                                       "food"})


ganu_test = partial(loader,
                    instance_path="data/test_tok.txt",
                    label_path="data/test_label.txt",
                    subset_labels={"ambience",
                                   "staff",
                                   "food"})


rest_15_train = partial(loader,
                        instance_path="data/restaurant_train_2015_tok.txt",
                        label_path="data/labels_restaurant_train_2015.txt",
                        subset_labels={"ambience",
                                       "service",
                                       "food"},
                        split_labels=True)

rest_15_test = partial(loader,
                       instance_path="data/restaurant_test_2015_tok.txt",
                       label_path="data/labels_restaurant_test_2015.txt",
                       subset_labels={"ambience",
                                      "service",
                                      "food"},
                       split_labels=True)
    return instances, y, label_set, subset_labels, gold, multi_labels

toy_test = partial(loader,
                   instance_path="../data/0/toy_test.txt",
                   label_path="../data/0/toy_test_label.txt",
                   # note: loader now also requires label_multi_path, which this
                   # partial does not bind, so calling toy_test() will fail without it
                   subset_labels_path="../data/toy_train_label.txt",
                   split_labels=True)

# rest_15_test = partial(loader,
#                        instance_path="data/restaurant_test_2015_tok.txt",
#                        label_path="data/labels_restaurant_test_2015.txt",
#                        subset_labels={"ambience",
#                                       "service",
#                                       "food"},
#                        split_labels=True)

def restaurants_train():
    yield rest_14_train()
    yield rest_15_train()

def test(f, dataset):
    # build a loader partial for each hidden-aspect ratio h (0%, 10%, ..., 100%);
    # data_test is rebound on every iteration and is local to this function, so
    # restaurants_test below can only resolve it if the two share a scope
    for h in range(0, 101, 10):
        data_test = partial(loader,
                            instance_path=f"../data/{dataset}/test/{h}/test.txt",
                            label_path=f"../data/{dataset}/test/{h}/test_label.txt",
                            label_multi_path=f"../data/{dataset}/test/{h}/test_label_multi.txt",
                            subset_labels_path=f"../data/{dataset}/train/{f}/train_label.txt",
                            split_labels=True)

def restaurants_test():
    yield toy_test()
    # yield rest_14_test()
    # yield rest_15_test()
    # yield ganu_test()
    yield data_test()
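A minimal sketch of how the reworked loader might be driven, assuming the module is importable and that files exist at the illustrative paths below; label_multi_path is the newly required argument, and the six-name unpacking mirrors the new return statement.

from functools import partial
from cat.dataset import loader

# every path below is a hypothetical stand-in for real data files
demo = partial(loader,
               instance_path="../data/0/toy_test.txt",
               label_path="../data/0/toy_test_label.txt",
               label_multi_path="../data/0/toy_test_label_multi.txt",
               subset_labels_path="../data/toy_train_label.txt",
               split_labels=True)

instances, y, label_set, subset_labels, gold, multi_labels = demo()
print(len(instances), "instances,", len(label_set), "distinct labels")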
2 changes: 1 addition & 1 deletion cat/utils.py
@@ -5,7 +5,7 @@

def conll2text(paths, outpath):
    """Write a conll file to a text file."""
    with open(outpath, 'w') as f:
    with open(outpath, 'w', encoding='utf-8') as f:
        for path in paths:
            for sent in pyconll.iter_from_file(path):
                txt = []
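A quick usage sketch for the touched helper, with hypothetical file names; the tail of the function is collapsed in this view, but the signature suggests it flattens one or more CoNLL files into a single text file.

from cat.utils import conll2text

# hypothetical paths; pyconll streams the sentences from each input file
conll2text(["reviews_train.conllu", "reviews_dev.conllu"], "reviews_train.txt")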
Empty file added cmn/__init__.py
Empty file.
93 changes: 93 additions & 0 deletions cmn/mams.py
@@ -0,0 +1,93 @@
import os, spacy
from tqdm import tqdm
import xml.etree.ElementTree as ET

from cmn.review import Review

class MAMSReview(Review):
    def __init__(self, id, sentences, time, author, aos):
        # super() already binds self; passing it explicitly would duplicate it
        super().__init__(id, sentences, time, author, aos)

    @staticmethod
    def xmlloader(path):
        reviews_list = []
        nlp = spacy.load("en_core_web_sm")
        tree = ET.parse(path)
        sentences = tree.getroot()
        i = -1
        for sentence in sentences:  # each sentence is an individual review, unlike SemEval16
            i += 1

            text = ""
            tokens = []
            aos_list_list = []

            for data in sentence:
                if data.tag == "text":  # separate each aspect term from adjacent punctuation
                    raw_text = data.text
                    current_text = raw_text
                    opinion_text = sentence.findall(".//aspectTerm")
                    for o in opinion_text:
                        aspect = o.attrib["term"]
                        aspect_list = aspect.split()
                        if len(aspect_list) == 0:  # skip empty aspect terms (the MAMS dataset has no NULL aspects)
                            continue
                        letter_index_tuple = (int(o.attrib['from']), int(o.attrib['to']))
                        current_text = current_text.replace('\xa0', ' ')  # normalize non-breaking spaces to plain spaces
                        current_text = current_text[0:letter_index_tuple[0]] + ' ' + aspect + ' ' + current_text[letter_index_tuple[1]+1:]
                        # print("processing text:" + str(current_text))
                        tokens = current_text.split()

                if data.tag == "aspectTerms":
                    aos_list = []
                    for o in data:  # each o is an aspectTerm

                        sentiment = o.attrib["polarity"].replace('positive', '+1').replace('negative', '-1').replace('neutral', '0')

                        aspect = o.attrib["term"]
                        aspect_list = aspect.split()  # the aspect may consist of more than one word
                        if len(aspect_list) == 0:
                            continue

                        letter_index_tuple = (int(o.attrib['from']), int(o.attrib['to']))

                        # find which occurrence of the aspect phrase in the raw text this term refers to
                        # print(tokens)

                        text_incidences = [i for i in range(len(raw_text))
                                           if raw_text.startswith(aspect, i)
                                           and (i == 0 or not raw_text[i-1].isalpha())
                                           and (i + len(aspect) >= len(raw_text) or not raw_text[i+len(aspect)].isalpha())]
                        # print("text incidences: " + str(text_incidences))
                        idx_of_from = text_incidences.index(letter_index_tuple[0])
                        # print("index of from: " + str(idx_of_from))

                        # find the location of the aspect token
                        start_token_of_aspect = [i for i in range(len(tokens))
                                                 if i + len(aspect_list) <= len(tokens)
                                                 and tokens[i:i + len(aspect_list)] == aspect_list]

                        # print("start token of aspect: " + str(start_token_of_aspect))

                        idx_start_token_of_aspect = start_token_of_aspect[idx_of_from]

                        idx_aspect_list = list(
                            range(idx_start_token_of_aspect, idx_start_token_of_aspect + len(aspect_list)))

                        # compile the final aos 3-tuple for each aspect;
                        # sentiment is '+1', '-1', or '0', so int() parses all three
                        aos = (idx_aspect_list, [], int(sentiment))

                        if len(aos) != 0:
                            aos_list.append(aos)

                    if len(aos_list) != 0:
                        aos_list_list.append(aos_list)

            if len(aos_list_list) == 0:  # sentences with no aspect are not added
                continue

            reviews_list.append(
                Review(id=i, sentences=[[str(t).lower() for t in current_text.split()]], time=None,
                       author=None, aos=aos_list_list, lempos=""))

        return reviews_list
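A small driver sketch for the new MAMS loader; the XML path is illustrative, and it assumes Review exposes its constructor arguments (sentences, aos) as attributes.

from cmn.mams import MAMSReview

# hypothetical location of a MAMS XML split
reviews = MAMSReview.xmlloader("data/mams/train.xml")
print(len(reviews), "reviews with at least one aspect term")
for r in reviews[:3]:
    # aos holds (aspect-token-indices, [], sentiment) triples per sentence
    print(r.sentences[0], r.aos)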