Binary file added cat.zip
Binary file not shown.
93 changes: 35 additions & 58 deletions cat/dataset.py
@@ -5,14 +5,29 @@

def loader(instance_path,
           label_path,
           label_multi_path,
           subset_labels_path,
           split_labels=False,
           mapping=None):
    # subset_labels = set(subset_labels)

    multi_labels = []
    with open(label_multi_path, 'r') as file:
        for line in file:
            # each line holds a list literal; literal_eval is safer than eval
            # (assumes "import ast" among the module imports above this hunk)
            current_array = ast.literal_eval(line.strip())
            multi_labels.append(current_array)

    # multi_labels = open(label_multi_path)
    # multi_labels = [x for x in multi_labels]

    labels = open(label_path)
    labels = [x.strip().lower().split() for x in labels]

    subset_labels = open(subset_labels_path)
    # subset_labels = open(subset_labels_path)
    subset_labels = []
    with open(subset_labels_path, 'r', encoding='utf-8') as file:
        for line in file:
            subset_labels.append(line.strip())
    subset_labels = {x.lower() for x in subset_labels}
    # print(subset_labels)

@@ -21,15 +36,15 @@ def loader(instance_path,
    # subset_labels = {'wine', 'place', 'food'}

    instances = []
    for line in open(instance_path):
    for line in open(instance_path, encoding='utf-8'):
        instances.append(line.strip().lower().split())

    if split_labels:
        labels = [[l.split("#")[0] for l in x] for x in labels]

    instances, gold = zip(*[(x, y[0]) for x, y in zip(instances, labels)
                            if len(y) == 1])
                            # y[0] in subset_labels])
                            # and y[0] in subset_labels])

    if mapping is not None:
        gold = [mapping.get(x, x) for x in gold]
@@ -38,63 +53,25 @@
    y = le.fit_transform(gold)
    label_set = le.classes_.tolist()

    return instances, y, label_set, subset_labels, gold


rest_14_test = partial(loader,
                       instance_path="data/restaurant_test_2014_tok.txt",  # noqa
                       label_path="data/labels_restaurant_test_2014.txt",  # noqa
                       subset_labels={"ambience",
                                      "service",
                                      "food"})


rest_14_train = partial(loader,
                        instance_path="data/restaurant_train_2014.txt",  # noqa
                        label_path="data/labels_restaurant_train_2014.txt",  # noqa
                        subset_labels={"ambience",
                                       "service",
                                       "food"})


ganu_test = partial(loader,
                    instance_path="data/test_tok.txt",
                    label_path="data/test_label.txt",
                    subset_labels={"ambience",
                                   "staff",
                                   "food"})


rest_15_train = partial(loader,
                        instance_path="data/restaurant_train_2015_tok.txt",
                        label_path="data/labels_restaurant_train_2015.txt",
                        subset_labels={"ambience",
                                       "service",
                                       "food"},
                        split_labels=True)

rest_15_test = partial(loader,
                       instance_path="data/restaurant_test_2015_tok.txt",
                       label_path="data/labels_restaurant_test_2015.txt",
                       subset_labels={"ambience",
                                      "service",
                                      "food"},
                       split_labels=True)
    return instances, y, label_set, subset_labels, gold, multi_labels

toy_test = partial(loader,
                   instance_path="../data/0/toy_test.txt",
                   label_path="../data/0/toy_test_label.txt",
                   # note: loader now also requires label_multi_path, which this
                   # partial does not bind, so calling toy_test() will fail without it
                   subset_labels_path="../data/toy_train_label.txt",
                   split_labels=True)

# rest_15_test = partial(loader,
#                        instance_path="data/restaurant_test_2015_tok.txt",
#                        label_path="data/labels_restaurant_test_2015.txt",
#                        subset_labels={"ambience",
#                                       "service",
#                                       "food"},
#                        split_labels=True)

def restaurants_train():
    yield rest_14_train()
    yield rest_15_train()

def test(f, dataset):
    # build a loader partial for each hidden-aspect ratio h (0%, 10%, ..., 100%);
    # data_test is rebound on every iteration and is local to this function, so
    # restaurants_test below can only resolve it if the two share a scope
    for h in range(0, 101, 10):
        data_test = partial(loader,
                            instance_path=f"../data/{dataset}/test/{h}/test.txt",
                            label_path=f"../data/{dataset}/test/{h}/test_label.txt",
                            label_multi_path=f"../data/{dataset}/test/{h}/test_label_multi.txt",
                            subset_labels_path=f"../data/{dataset}/train/{f}/train_label.txt",
                            split_labels=True)

def restaurants_test():
    yield toy_test()
    # yield rest_14_test()
    # yield rest_15_test()
    # yield ganu_test()
    yield data_test()
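A minimal sketch of how the reworked loader might be driven, assuming the module is importable and that files exist at the illustrative paths below; label_multi_path is the newly required argument, and the six-name unpacking mirrors the new return statement.

from functools import partial
from cat.dataset import loader

# every path below is a hypothetical stand-in for real data files
demo = partial(loader,
               instance_path="../data/0/toy_test.txt",
               label_path="../data/0/toy_test_label.txt",
               label_multi_path="../data/0/toy_test_label_multi.txt",
               subset_labels_path="../data/toy_train_label.txt",
               split_labels=True)

instances, y, label_set, subset_labels, gold, multi_labels = demo()
print(len(instances), "instances,", len(label_set), "distinct labels")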
2 changes: 1 addition & 1 deletion cat/utils.py
@@ -5,7 +5,7 @@

def conll2text(paths, outpath):
    """Write a conll file to a text file."""
    with open(outpath, 'w') as f:
    with open(outpath, 'w', encoding='utf-8') as f:
        for path in paths:
            for sent in pyconll.iter_from_file(path):
                txt = []
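A quick usage sketch for the touched helper, with hypothetical file names; the tail of the function is collapsed in this view, but the signature suggests it flattens one or more CoNLL files into a single text file.

from cat.utils import conll2text

# hypothetical paths; pyconll streams the sentences from each input file
conll2text(["reviews_train.conllu", "reviews_dev.conllu"], "reviews_train.txt")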
Empty file added cmn/__init__.py
Empty file.
93 changes: 93 additions & 0 deletions cmn/mams.py
@@ -0,0 +1,93 @@
import os, spacy
from tqdm import tqdm
import xml.etree.ElementTree as ET

from cmn.review import Review

class MAMSReview(Review):
    def __init__(self, id, sentences, time, author, aos):
        # super() already binds self; passing it explicitly would duplicate it
        super().__init__(id, sentences, time, author, aos)

    @staticmethod
    def xmlloader(path):
        reviews_list = []
        nlp = spacy.load("en_core_web_sm")
        tree = ET.parse(path)
        sentences = tree.getroot()
        i = -1
        for sentence in sentences:  # each sentence is an individual review, unlike SemEval16
            i += 1

            text = ""
            tokens = []
            aos_list_list = []

            for data in sentence:
                if data.tag == "text":  # separate each aspect term from adjacent punctuation
                    raw_text = data.text
                    current_text = raw_text
                    opinion_text = sentence.findall(".//aspectTerm")
                    for o in opinion_text:
                        aspect = o.attrib["term"]
                        aspect_list = aspect.split()
                        if len(aspect_list) == 0:  # skip empty aspect terms (the MAMS dataset has no NULL aspects)
                            continue
                        letter_index_tuple = (int(o.attrib['from']), int(o.attrib['to']))
                        current_text = current_text.replace('\xa0', ' ')  # normalize non-breaking spaces to plain spaces
                        current_text = current_text[0:letter_index_tuple[0]] + ' ' + aspect + ' ' + current_text[letter_index_tuple[1]+1:]
                        # print("processing text:" + str(current_text))
                        tokens = current_text.split()

                if data.tag == "aspectTerms":
                    aos_list = []
                    for o in data:  # each o is an aspectTerm

                        sentiment = o.attrib["polarity"].replace('positive', '+1').replace('negative', '-1').replace('neutral', '0')

                        aspect = o.attrib["term"]
                        aspect_list = aspect.split()  # the aspect may consist of more than one word
                        if len(aspect_list) == 0:
                            continue

                        letter_index_tuple = (int(o.attrib['from']), int(o.attrib['to']))

                        # find which occurrence of the aspect phrase in the raw text this term refers to
                        # print(tokens)

                        text_incidences = [i for i in range(len(raw_text))
                                           if raw_text.startswith(aspect, i)
                                           and (i == 0 or not raw_text[i-1].isalpha())
                                           and (i + len(aspect) >= len(raw_text) or not raw_text[i+len(aspect)].isalpha())]
                        # print("text incidences: " + str(text_incidences))
                        idx_of_from = text_incidences.index(letter_index_tuple[0])
                        # print("index of from: " + str(idx_of_from))

                        # find the location of the aspect token
                        start_token_of_aspect = [i for i in range(len(tokens))
                                                 if i + len(aspect_list) <= len(tokens)
                                                 and tokens[i:i + len(aspect_list)] == aspect_list]

                        # print("start token of aspect: " + str(start_token_of_aspect))

                        idx_start_token_of_aspect = start_token_of_aspect[idx_of_from]

                        idx_aspect_list = list(
                            range(idx_start_token_of_aspect, idx_start_token_of_aspect + len(aspect_list)))

                        # compile the final aos 3-tuple for each aspect;
                        # sentiment is '+1', '-1', or '0', so int() parses all three
                        aos = (idx_aspect_list, [], int(sentiment))

                        if len(aos) != 0:
                            aos_list.append(aos)

                    if len(aos_list) != 0:
                        aos_list_list.append(aos_list)

            if len(aos_list_list) == 0:  # sentences with no aspect are not added
                continue

            reviews_list.append(
                Review(id=i, sentences=[[str(t).lower() for t in current_text.split()]], time=None,
                       author=None, aos=aos_list_list, lempos=""))

        return reviews_list
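A small driver sketch for the new MAMS loader; the XML path is illustrative, and it assumes Review exposes its constructor arguments (sentences, aos) as attributes.

from cmn.mams import MAMSReview

# hypothetical location of a MAMS XML split
reviews = MAMSReview.xmlloader("data/mams/train.xml")
print(len(reviews), "reviews with at least one aspect term")
for r in reviews[:3]:
    # aos holds (aspect-token-indices, [], sentiment) triples per sentence
    print(r.sentences[0], r.aos)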