-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathspider_topics.py
More file actions
117 lines (90 loc) · 4.35 KB
/
spider_topics.py
File metadata and controls
117 lines (90 loc) · 4.35 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import joblib
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity
import json
from topic_modeling_script import AdvancedTextMatcher # Import the class
class CustomUnpickler(joblib.numpy_pickle.Unpickler):
    """Unpickler that tolerates a missing AdvancedTextMatcher definition.

    When the pickled payload references ``AdvancedTextMatcher`` and the
    normal class lookup fails with AttributeError, a no-op factory is
    substituted so loading can proceed; any other missing name is
    re-raised unchanged.
    """

    def find_class(self, module, name):
        try:
            return super().find_class(module, name)
        except AttributeError:
            if name != 'AdvancedTextMatcher':
                raise
            # Stand-in factory so deserialization can continue without
            # the real class being importable.
            return lambda: None
# Load the fitted TF-IDF vectorizer and NMF model artifacts from disk.
# NOTE(review): despite the original comment, the CustomUnpickler defined
# above is never used here -- joblib.load runs with its default unpickler.
vectorizer_path = 'vectorizer.joblib'
nmf_model_path = 'nmf_model.joblib'
with open(vectorizer_path, 'rb') as f:
    vectorizer = joblib.load(f)
with open(nmf_model_path, 'rb') as f:
    loaded_data = joblib.load(f)
# The NMF artifact was saved as a tuple; the fitted model is presumably its
# first element -- confirm against the training script that produced it.
nmf_model = loaded_data[0]
def get_top_words(vectorizer, nmf_model, n_top_words=10):
    """Map each NMF topic index to its ``n_top_words`` strongest terms.

    Args:
        vectorizer: fitted vectorizer exposing ``get_feature_names_out()``.
        nmf_model: fitted NMF model exposing ``n_components`` and
            ``components_``.
        n_top_words: number of terms to keep per topic.

    Returns:
        dict mapping topic index -> array of top terms, strongest first.
        Each topic's terms are also printed as a side effect.
    """
    vocab = vectorizer.get_feature_names_out()
    top_words = {}
    for topic_idx in range(nmf_model.n_components):
        weights = nmf_model.components_[topic_idx]
        # Descending sort of term weights, truncated to the strongest terms.
        strongest = weights.argsort()[::-1][:n_top_words]
        top_words_per_topic = vocab[strongest]
        top_words[topic_idx] = top_words_per_topic
        print(f"Topic {topic_idx}: {top_words_per_topic}")
    return top_words
top_words = get_top_words(vectorizer, nmf_model)
def calculate_scores(df, vectorizer, nmf_model, top_words):
    """Compute each item's cosine similarity against every NMF topic.

    Args:
        df: DataFrame with 'Preprocessed_Text' and 'Item_Number' columns.
        vectorizer: fitted TF-IDF vectorizer (provides ``transform``).
        nmf_model: fitted NMF model (provides ``components_``).
        top_words: accepted for interface compatibility; not used here.

    Returns:
        dict mapping item number -> 1-D array of per-topic similarity scores.
    """
    scores = {}
    # Transform the 'Preprocessed_Text' column to a document-term matrix.
    X = vectorizer.transform(df['Preprocessed_Text'])
    # Iterate positionally: rows of X are ordered by position, while
    # df.iterrows() yields the label index, which can diverge from the
    # position after filtering/reindexing -- the original X[index] lookup
    # would then mismatch or raise.
    for pos, (_, row) in enumerate(df.iterrows()):
        item_number = row['Item_Number']
        description_vector = X[pos]
        topic_scores = cosine_similarity(description_vector.reshape(1, -1), nmf_model.components_)
        scores[item_number] = topic_scores.flatten()
    return scores
def prepare_spider_chart_data(df, vectorizer, nmf_model, top_words, n_top_words=5):
    """Build per-item spider-chart axes (words) and values (TF-IDF weights).

    For each row, the item's TF-IDF vector is compared (cosine similarity)
    against every NMF topic; the words of the ``n_top_words`` best-matching
    topics are pooled, scored by the item's own TF-IDF weights, and the
    ``n_top_words`` highest-scoring words become the chart categories.

    Args:
        df: DataFrame with 'Preprocessed_Text' and 'Item_Number' columns.
        vectorizer: fitted TF-IDF vectorizer (``transform``, ``vocabulary_``).
        nmf_model: fitted NMF model (provides ``components_``).
        top_words: dict mapping topic index -> iterable of top terms,
            as produced by ``get_top_words``.
        n_top_words: number of topics to pool and number of axes to keep.

    Returns:
        dict mapping item number -> {'categories': [str], 'values': [float]}.
    """
    spider_chart_data = {}
    # Transform the 'Preprocessed_Text' column to a document-term matrix.
    X = vectorizer.transform(df['Preprocessed_Text'])
    # Iterate positionally: rows of X follow row position, while
    # df.iterrows() yields the label index, which can diverge from the
    # position after filtering/reindexing -- the original X[index] lookup
    # would then mismatch or raise.
    for pos, (_, row) in enumerate(df.iterrows()):
        item_number = row['Item_Number']
        description_vector = X[pos]
        topic_scores = cosine_similarity(description_vector.reshape(1, -1), nmf_model.components_)
        # Indices of the n_top_words best-matching topics, best first.
        top_topics = topic_scores.argsort()[0][-n_top_words:][::-1]
        # Pool candidate words from those topics, de-duplicated in order.
        item_top_words = []
        for topic_idx in top_topics:
            item_top_words.extend(top_words[topic_idx])
        item_top_words = list(dict.fromkeys(item_top_words))
        # Score each candidate word by this item's own TF-IDF weight;
        # words absent from the vocabulary are skipped.
        word_scores = {}
        for word in item_top_words:
            word_index = vectorizer.vocabulary_.get(word)
            if word_index is not None:
                word_scores[word] = description_vector[0, word_index]
        # Sort words by score, strongest first.
        sorted_word_scores = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)
        # Keep the strongest words as chart axes; cast values to plain
        # floats so the result is safely JSON-serializable regardless of
        # the numpy scalar type returned by sparse indexing.
        categories = [word for word, score in sorted_word_scores[:n_top_words]]
        values = [float(score) for word, score in sorted_word_scores[:n_top_words]]
        spider_chart_data[item_number] = {
            'categories': categories,
            'values': values
        }
    return spider_chart_data
# Script driver: load the dataset through the project's AdvancedTextMatcher
# helper, score every item against the topics, and persist spider-chart data.
# NOTE(review): assumes 'dataset.csv' exists in the working directory -- confirm.
matcher = AdvancedTextMatcher('dataset.csv')
df = matcher.load_and_preprocess_dataset()
# Per-item topic-similarity scores.
# NOTE(review): df_scores is computed but never used afterwards -- confirm
# whether it should be saved or can be dropped.
df_scores = calculate_scores(df, vectorizer, nmf_model, top_words)
# Top words and their TF-IDF weights per item, shaped for a radar chart.
spider_chart_data = prepare_spider_chart_data(df, vectorizer, nmf_model, top_words)
# Save spider chart data to a JSON file for the front-end renderer.
with open('spider_chart_data.json', 'w') as f:
    json.dump(spider_chart_data, f)
print("Spider chart data prepared and saved to spider_chart_data.json")