-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathspider_topics.py
More file actions
117 lines (90 loc) · 4.35 KB
/
spider_topics.py
File metadata and controls
117 lines (90 loc) · 4.35 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import joblib
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity
import json
from topic_modeling_script import AdvancedTextMatcher # Import the class
class CustomUnpickler(joblib.numpy_pickle.Unpickler):
    """Unpickler that tolerates a missing AdvancedTextMatcher definition.

    When the pickled payload references ``AdvancedTextMatcher`` and the
    normal class lookup fails with AttributeError, a no-op factory is
    substituted so loading can proceed; any other missing name is
    re-raised unchanged.
    """

    def find_class(self, module, name):
        try:
            return super().find_class(module, name)
        except AttributeError:
            if name != 'AdvancedTextMatcher':
                raise
            # Stand-in factory so deserialization can continue without
            # the real class being importable.
            return lambda: None
# Load the fitted TF-IDF vectorizer and NMF model artifacts from disk.
# NOTE(review): despite the original comment, the CustomUnpickler defined
# above is never used here -- joblib.load runs with its default unpickler.
vectorizer_path = 'vectorizer.joblib'
nmf_model_path = 'nmf_model.joblib'
with open(vectorizer_path, 'rb') as f:
    vectorizer = joblib.load(f)
with open(nmf_model_path, 'rb') as f:
    loaded_data = joblib.load(f)
# The NMF artifact was saved as a tuple; the fitted model is presumably its
# first element -- confirm against the training script that produced it.
nmf_model = loaded_data[0]
def get_top_words(vectorizer, nmf_model, n_top_words=10):
    """Map each NMF topic index to its ``n_top_words`` strongest terms.

    Args:
        vectorizer: fitted vectorizer exposing ``get_feature_names_out()``.
        nmf_model: fitted NMF model exposing ``n_components`` and
            ``components_``.
        n_top_words: number of terms to keep per topic.

    Returns:
        dict mapping topic index -> array of top terms, strongest first.
        Each topic's terms are also printed as a side effect.
    """
    vocab = vectorizer.get_feature_names_out()
    top_words = {}
    for topic_idx in range(nmf_model.n_components):
        weights = nmf_model.components_[topic_idx]
        # Descending sort of term weights, truncated to the strongest terms.
        strongest = weights.argsort()[::-1][:n_top_words]
        top_words_per_topic = vocab[strongest]
        top_words[topic_idx] = top_words_per_topic
        print(f"Topic {topic_idx}: {top_words_per_topic}")
    return top_words
top_words = get_top_words(vectorizer, nmf_model)
def calculate_scores(df, vectorizer, nmf_model, top_words):
    """Compute each item's cosine similarity against every NMF topic.

    Args:
        df: DataFrame with 'Preprocessed_Text' and 'Item_Number' columns.
        vectorizer: fitted TF-IDF vectorizer (provides ``transform``).
        nmf_model: fitted NMF model (provides ``components_``).
        top_words: accepted for interface compatibility; not used here.

    Returns:
        dict mapping item number -> 1-D array of per-topic similarity scores.
    """
    scores = {}
    # Transform the 'Preprocessed_Text' column to a document-term matrix.
    X = vectorizer.transform(df['Preprocessed_Text'])
    # Iterate positionally: rows of X are ordered by position, while
    # df.iterrows() yields the label index, which can diverge from the
    # position after filtering/reindexing -- the original X[index] lookup
    # would then mismatch or raise.
    for pos, (_, row) in enumerate(df.iterrows()):
        item_number = row['Item_Number']
        description_vector = X[pos]
        topic_scores = cosine_similarity(description_vector.reshape(1, -1), nmf_model.components_)
        scores[item_number] = topic_scores.flatten()
    return scores
def prepare_spider_chart_data(df, vectorizer, nmf_model, top_words, n_top_words=5):
    """Build per-item spider-chart axes (words) and values (TF-IDF weights).

    For each row, the item's TF-IDF vector is compared (cosine similarity)
    against every NMF topic; the words of the ``n_top_words`` best-matching
    topics are pooled, scored by the item's own TF-IDF weights, and the
    ``n_top_words`` highest-scoring words become the chart categories.

    Args:
        df: DataFrame with 'Preprocessed_Text' and 'Item_Number' columns.
        vectorizer: fitted TF-IDF vectorizer (``transform``, ``vocabulary_``).
        nmf_model: fitted NMF model (provides ``components_``).
        top_words: dict mapping topic index -> iterable of top terms,
            as produced by ``get_top_words``.
        n_top_words: number of topics to pool and number of axes to keep.

    Returns:
        dict mapping item number -> {'categories': [str], 'values': [float]}.
    """
    spider_chart_data = {}
    # Transform the 'Preprocessed_Text' column to a document-term matrix.
    X = vectorizer.transform(df['Preprocessed_Text'])
    # Iterate positionally: rows of X follow row position, while
    # df.iterrows() yields the label index, which can diverge from the
    # position after filtering/reindexing -- the original X[index] lookup
    # would then mismatch or raise.
    for pos, (_, row) in enumerate(df.iterrows()):
        item_number = row['Item_Number']
        description_vector = X[pos]
        topic_scores = cosine_similarity(description_vector.reshape(1, -1), nmf_model.components_)
        # Indices of the n_top_words best-matching topics, best first.
        top_topics = topic_scores.argsort()[0][-n_top_words:][::-1]
        # Pool candidate words from those topics, de-duplicated in order.
        item_top_words = []
        for topic_idx in top_topics:
            item_top_words.extend(top_words[topic_idx])
        item_top_words = list(dict.fromkeys(item_top_words))
        # Score each candidate word by this item's own TF-IDF weight;
        # words absent from the vocabulary are skipped.
        word_scores = {}
        for word in item_top_words:
            word_index = vectorizer.vocabulary_.get(word)
            if word_index is not None:
                word_scores[word] = description_vector[0, word_index]
        # Sort words by score, strongest first.
        sorted_word_scores = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)
        # Keep the strongest words as chart axes; cast values to plain
        # floats so the result is safely JSON-serializable regardless of
        # the numpy scalar type returned by sparse indexing.
        categories = [word for word, score in sorted_word_scores[:n_top_words]]
        values = [float(score) for word, score in sorted_word_scores[:n_top_words]]
        spider_chart_data[item_number] = {
            'categories': categories,
            'values': values
        }
    return spider_chart_data
# Script driver: load the dataset through the project's AdvancedTextMatcher
# helper, score every item against the topics, and persist spider-chart data.
# NOTE(review): assumes 'dataset.csv' exists in the working directory -- confirm.
matcher = AdvancedTextMatcher('dataset.csv')
df = matcher.load_and_preprocess_dataset()
# Per-item topic-similarity scores.
# NOTE(review): df_scores is computed but never used afterwards -- confirm
# whether it should be saved or can be dropped.
df_scores = calculate_scores(df, vectorizer, nmf_model, top_words)
# Top words and their TF-IDF weights per item, shaped for a radar chart.
spider_chart_data = prepare_spider_chart_data(df, vectorizer, nmf_model, top_words)
# Save spider chart data to a JSON file for the front-end renderer.
with open('spider_chart_data.json', 'w') as f:
    json.dump(spider_chart_data, f)
print("Spider chart data prepared and saved to spider_chart_data.json")