Multi-LLM-Agent-Debate/TextFileTransformer.py at master · animeshsg/Multi-LLM-Agent-Debate · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
from bert_score_eval import get_bertscore
from perpexlity_score import perplexity_score
from absa import Absa

class TextFileTransformer(BaseEstimator, TransformerMixin):
    '''
    Turns a conversation transcript file into a DataFrame of per-turn metrics.

    Input - Text file in which character names and dialogues alternate, every
            piece separated from the next by the marker "#####"
    Output - Pandas Dataframe with one row per (Character, Dialogue) pair plus
             the metric columns added by the calculate_* methods
    '''

    # Marker that separates character names and dialogues in the input file.
    _SEPARATOR = '#####'

    def __init__(self, file_path):
        '''
        Input : file_path - path of the transcript file to read
        '''
        self.file_path = file_path
        # Filled in by transform(); the calculate_* methods read and update it.
        self.data = None

    def fit(self, X, y=None):
        '''
        No-op: nothing is learned from X or y.
        Output : self
        '''
        return self

    def transform(self, X):
        '''
        Reads self.file_path and runs every metric step in order.

        Input : X is never used (pass None); the data comes from self.file_path
        Output : whatever Absa(self.data).get_absa() returns, built from the
                 Dataframe with conversation and evaluation scores
        Raises : ValueError if read_file() could not read the file
        '''
        print("1. Reading File into Dataframe")
        self.data = self.read_file()
        if self.data is None:
            # read_file() has already printed the cause. Fail here with a clear
            # message instead of an opaque "'NoneType' object ..." TypeError
            # from the first metric step.
            raise ValueError(
                f"Could not read conversation file '{self.file_path}'."
            )

        print("2. Calculating pairwise Bert Metrics")
        self.data = self.calculate_pairwise_bert_score()

        print("2.5 Calculate bert score wrt seed prompt")
        self.data = self.calculate_seed_bert_score()

        print("3. Calculating Perplexity Score Metrics")
        self.data = self.calculate_perplexity_score()

        print("4. Calculating aspect based Sentiments Metrics")
        absa = Absa(self.data)
        self.data = absa.get_absa()
        return self.data

    def read_file(self):
        """
        Input: File with conversations separated by #####
        Output : Dataframe with 'Character' and 'Dialogue' columns, or None if
                 the file could not be read (the reason is printed)
        """
        # Only the read itself can fail in a way worth reporting; keeping the
        # try body this small stops it from hiding unrelated errors.
        try:
            with open(self.file_path, 'r') as file:
                file_content = file.read()
        except FileNotFoundError:
            print(f"Error: File '{self.file_path}' not found.")
            return None
        except Exception as e:
            # Best-effort contract kept from the original: report and return None.
            print(f"Error: {e}")
            return None

        # Split on the marker and trim whitespace/newlines around every piece.
        chunks = [chunk.strip() for chunk in file_content.split(self._SEPARATOR)]

        # A file that ends with the marker leaves one empty trailing piece.
        # Drop only that piece, so a file that does not end with the marker
        # keeps its final dialogue.
        if chunks and not chunks[-1]:
            chunks.pop()

        # Pieces alternate name, dialogue, name, dialogue, ...; a name with no
        # following dialogue is paired with an empty string.
        if len(chunks) % 2:
            chunks.append("")
        rows = [
            {'Character': chunks[i], 'Dialogue': chunks[i + 1]}
            for i in range(0, len(chunks), 2)
        ]

        # Naming the columns keeps them present even when there are no rows,
        # so the metric steps can still look up 'Dialogue'.
        return pd.DataFrame(rows, columns=['Character', 'Dialogue'])

    def calculate_perplexity_score(self):
        '''
        Adds a 'Perplexity' column by applying perplexity_score().calculate to
        every dialogue.
        Output : self.data (modified in place)
        '''
        px = perplexity_score()
        self.data['Perplexity'] = self.data['Dialogue'].apply(px.calculate)
        return self.data

    def calculate_pairwise_bert_score(self):
        '''
        Adds an 'F1_Score' column: the third value returned by get_bertscore
        for each dialogue (candidate) against the dialogue in the row before
        it (reference). The first row has no predecessor and keeps 0.0.
        Output : self.data (modified in place)
        '''
        self.data['F1_Score'] = 0.0

        # Read by position so that reads and writes address the same rows
        # whatever the index labels are.
        dialogues = self.data['Dialogue'].tolist()

        # Start at 1: every row from the second to the last has a predecessor.
        for i in range(1, len(dialogues)):
            _, _, f1_score = get_bertscore(dialogues[i], dialogues[i - 1])
            self.data.loc[self.data.index[i], 'F1_Score'] = f1_score
        return self.data

    def calculate_seed_bert_score(self):
        '''
        Adds an 'F1_Score_Seed' column: the third value returned by
        get_bertscore for each dialogue from row 3 onwards against a fixed
        reference row. Odd rows are compared with row 1, even rows with row 2.
        Rows 0-2 keep 0.0.
        Output : self.data (modified in place)
        '''
        # NOTE(review): presumably row 0 is the seed prompt and rows 1 and 2
        # are the two agents' opening turns - confirm against the file writer.
        self.data['F1_Score_Seed'] = 0.0

        dialogues = self.data['Dialogue'].tolist()

        # With fewer than four rows the range is empty, so rows 1 and 2 are
        # only read when they exist.
        for i in range(3, len(dialogues)):
            reference_dialogue = dialogues[1] if i % 2 == 1 else dialogues[2]
            _, _, f1_score = get_bertscore(dialogues[i], reference_dialogue)
            self.data.loc[self.data.index[i], 'F1_Score_Seed'] = f1_score
        return self.data


# Example usage:
#
#     file_path = 'your_file.txt'  # Replace with the actual file path
#     text_transformer = TextFileTransformer(file_path)
#
#     # Transform the file (the argument is not used; the file is read from file_path)
#     df_result = text_transformer.transform(None)
#
#     # Display the resulting DataFrame
#     print(df_result)