absa.py
from bertopic import BERTopic
import pandas as pd
from sentence_transformers import SentenceTransformer
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
class Absa():
    def __init__(self, data):
        # BERTopic vectorizer, clustering model, and topic model initialisation
        self.vectorizer_model = CountVectorizer(stop_words="english")
        self.cluster_model = KMeans(n_clusters=2)
        self.topic_model = BERTopic(
            n_gram_range=(1, 2),
            vectorizer_model=self.vectorizer_model,
            hdbscan_model=self.cluster_model,
            nr_topics='auto',
            min_topic_size=5,
            top_n_words=5,
            calculate_probabilities=True)
        self.model_embedding = SentenceTransformer('all-MiniLM-L6-v2')
        # ABSA tokenizer and model initialisation
        self.absa_model_name = "yangheng/deberta-v3-base-absa-v1.1"
        self.absa_tokenizer = AutoTokenizer.from_pretrained(self.absa_model_name)
        self.absa_model = AutoModelForSequenceClassification.from_pretrained(self.absa_model_name)
        # Input DataFrame (expects a "Dialogue" column)
        self.data = data
    def get_topics(self):
        # Embed the dialogues, fit BERTopic, and attach each document's topic representation
        convs = self.data.Dialogue.to_list()
        corpus_embeddings = self.model_embedding.encode(convs)
        self.topic_model = self.topic_model.fit(convs, corpus_embeddings)
        topics, probabilities = self.topic_model.transform(convs, corpus_embeddings)
        # new_topics = self.topic_model.reduce_outliers(convs, topics, strategy="c-tf-idf")
        # self.topic_model.update_topics(convs, topics=new_topics, vectorizer_model=self.vectorizer_model)
        print(self.topic_model.get_topic_freq())
        doc_info = self.topic_model.get_document_info(convs)
        doc_info = doc_info[['Document', 'Representation']]
        doc_info = doc_info.rename(columns={'Document': 'Dialogue'})
        # Representation (top words) of the most frequent topic, shared by every row
        top_topic_words = self.topic_model.get_topic_info()['Representation'][0]
        doc_info['FreqTopic'] = [top_topic_words] * len(doc_info)
        self.data = pd.merge(self.data, doc_info, on="Dialogue", how="inner")
        return self.data
    def get_aspect_scores(self, row):
        # Get sentiment scores for a dialogue against its own document-level aspect list
        text = row['Dialogue']
        aspect_list = row['Representation']
        sentiment_aspect = {}
        for aspect in aspect_list:
            # Classify the (text, aspect) pair and keep the top label with its probability
            inputs = self.absa_tokenizer(text, aspect, return_tensors="pt")
            with torch.inference_mode():
                outputs = self.absa_model(**inputs)
            scores = F.softmax(outputs.logits[0], dim=-1)
            label_id = torch.argmax(scores).item()
            sentiment_aspect[aspect] = (self.absa_model.config.id2label[label_id], scores[label_id].item())
        return sentiment_aspect
    def get_top_aspect_scores(self, row):
        # Get sentiment scores for a dialogue against the most frequent topic's aspect words
        text = row['Dialogue']
        aspect_list = row['FreqTopic']
        sentiment_aspect = {}
        for aspect in aspect_list:
            inputs = self.absa_tokenizer(text, aspect, return_tensors="pt")
            with torch.inference_mode():
                outputs = self.absa_model(**inputs)
            scores = F.softmax(outputs.logits[0], dim=-1)
            label_id = torch.argmax(scores).item()
            sentiment_aspect[aspect] = (self.absa_model.config.id2label[label_id], scores[label_id].item())
        return sentiment_aspect
    def get_absa(self):
        # Main entry point: derive topics, then score each dialogue against its aspects
        self.data = self.get_topics()
        self.data['SentimentScores'] = self.data.apply(self.get_aspect_scores, axis=1)
        self.data['TopSentimentScores'] = self.data.apply(self.get_top_aspect_scores, axis=1)
        self.data = self.data.drop(['Representation', 'FreqTopic'], axis=1)
        return self.data
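

# Example usage: a minimal sketch, not part of the original module. It assumes a CSV with a
# "Dialogue" column (the file name "dialogues.csv" is hypothetical) and a corpus large enough
# for BERTopic's default UMAP step (more documents than its default n_neighbors setting).
if __name__ == "__main__":
    frame = pd.read_csv("dialogues.csv")  # hypothetical input file with a "Dialogue" column
    absa = Absa(frame)
    scored = absa.get_absa()  # adds 'SentimentScores' and 'TopSentimentScores' columns
    print(scored[['Dialogue', 'SentimentScores', 'TopSentimentScores']].head())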