-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathRSS_train_model.py
More file actions
99 lines (71 loc) · 3.51 KB
/
RSS_train_model.py
File metadata and controls
99 lines (71 loc) · 3.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
# Training and testing + evaluating the tenders model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report, confusion_matrix
from scipy.sparse import hstack, csr_matrix
import json
import numpy as np
import pickle
import joblib
# Location of the tender JSON file that the __main__ block loads for training.
# NOTE(review): absolute, machine-specific path; it will only resolve on the
# original author's machine. Confirm or adjust before running elsewhere.
FILE_PATH = "/Users/tristanblackledge/TenderAusAgent/ai-agent-project/tender_scraper/RSS_tenders_data.json"
def loads_tenders_data(file_path):
    """Read the JSON file at ``file_path`` and return its decoded content.

    The file is opened as UTF-8 text and whatever object the JSON decoder
    produces is handed back unchanged.
    """
    with open(file_path, mode="r", encoding="utf-8") as source:
        return json.load(source)
def prepare_data(tenders):
    """Build TF-IDF + keyword-score features and split them 60/20/20.

    Each tender's ``'Title'`` and ``'description'`` values are joined into a
    single text string, vectorised with a unigram+bigram TF-IDF that is
    fitted on the training split only, and extended with one extra column
    holding the tender's ``'keyword_score'``.

    Args:
        tenders: Sequence of dict-like tender records. Every record must
            carry a non-null ``'is_relevant'`` label. ``'Title'``,
            ``'description'`` and ``'keyword_score'`` may be missing or null;
            they then default to an empty string / zero.

    Returns:
        ``(X_train, X_val, X_test, y_train, y_val, y_test, vectorizer)``:
        the three sparse feature matrices, the three matching label arrays
        and the fitted ``TfidfVectorizer``.

    Raises:
        ValueError: If any tender lacks an ``'is_relevant'`` label.
            ``train_test_split`` may also raise when the data cannot be
            split (for example, empty input).
    """
    # Feature extraction.
    # NOTE(review): the two keys differ in case ('Title' vs 'description');
    # confirm both match the schema of the scraped JSON.
    # `or ''` also covers keys that are present but null, which
    # `.get(key, '')` alone would pass through as None.
    full_text = [
        (tender.get('Title') or '') + ' ' + (tender.get('description') or '')
        for tender in tenders
    ]
    raw_scores = [tender.get('keyword_score') for tender in tenders]
    keyword_scores = np.array(
        [0 if score is None else score for score in raw_scores]
    ).reshape(-1, 1)

    # Target feature (labels). Fail early with a clear message instead of
    # letting None labels cause an obscure error inside the stratified split.
    raw_labels = [tender.get('is_relevant') for tender in tenders]
    unlabelled = [i for i, label in enumerate(raw_labels) if label is None]
    if unlabelled:
        raise ValueError(
            f"{len(unlabelled)} tender(s) have no 'is_relevant' label "
            f"(first at index {unlabelled[0]}); label or remove them "
            "before training."
        )
    labels = np.array(raw_labels)

    # Split indices rather than the features themselves so that the text and
    # the keyword scores stay aligned across the splits.
    indices = np.arange(len(tenders))
    train_idx, temp_idx, y_train, y_temp = train_test_split(
        indices, labels, random_state=42, test_size=0.4, stratify=labels)
    # Halve the 40% hold-out into validation and test sets.
    val_idx, test_idx, y_val, y_test = train_test_split(
        temp_idx, y_temp, random_state=42, test_size=0.5, stratify=y_temp)

    # Fit the vectorizer on training text only, so no vocabulary or IDF
    # statistics come from the validation/test documents.
    full_vectorizer = TfidfVectorizer(ngram_range=(1, 2))
    train_text = [full_text[i] for i in train_idx]
    full_vectorizer.fit(train_text)

    # Transform all splits using the fitted vectorizer.
    X_train = full_vectorizer.transform(train_text)
    X_val = full_vectorizer.transform([full_text[i] for i in val_idx])
    X_test = full_vectorizer.transform([full_text[i] for i in test_idx])

    # Keyword scores for each split, selected with the same indices.
    X_train_kw = keyword_scores[train_idx]
    X_val_kw = keyword_scores[val_idx]
    X_test_kw = keyword_scores[test_idx]

    # Append the keyword score as one extra sparse column.
    X_train = hstack([X_train, csr_matrix(X_train_kw)])
    X_val = hstack([X_val, csr_matrix(X_val_kw)])
    X_test = hstack([X_test, csr_matrix(X_test_kw)])

    return X_train, X_val, X_test, y_train, y_val, y_test, full_vectorizer
def train_and_evaluate(X_train, X_test, y_train, y_test):
    """Fit a logistic regression classifier and report its test metrics.

    The classifier is trained on ``X_train``/``y_train`` and then scored
    against ``X_test``/``y_test``. Accuracy, precision, recall and the
    confusion matrix are printed; precision and recall treat the label
    ``True`` as the positive class.

    Returns:
        The fitted ``LogisticRegression`` instance.
    """
    classifier = LogisticRegression(
        max_iter=1000,
        solver='liblinear',
        random_state=42,
        C=0.5,
    )
    classifier.fit(X_train, y_train)

    predicted = classifier.predict(X_test)

    # Both binary metrics share the same averaging mode and positive label.
    binary_options = {'average': 'binary', 'pos_label': True}
    accuracy = accuracy_score(y_test, predicted)
    precision = precision_score(y_test, predicted, **binary_options)
    recall = recall_score(y_test, predicted, **binary_options)

    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print("CM\n", confusion_matrix(y_test, predicted))

    return classifier
if __name__ == "__main__":
    # Load the raw tender records from disk.
    tenders_data = loads_tenders_data(FILE_PATH)

    # Build the feature matrices and labels. The validation split is
    # returned as well but is not used by this script.
    splits = prepare_data(tenders_data)
    X_train, x_val, X_test, y_train, y_val, y_test, vectorizer = splits

    # Train on the training split and print metrics for the test split.
    final_model = train_and_evaluate(X_train, X_test, y_train, y_test)

    # Persist the model first, then the vectorizer needed to reproduce the
    # text features.
    artifacts = {
        'RSS_tender_relevance_model.pkl': final_model,
        'RSS_tfidf_vectorizer.pkl': vectorizer,
    }
    for filename, artifact in artifacts.items():
        joblib.dump(artifact, filename)