UnleashTenderAgent/RSSmodel.py at main · tblackledge97/UnleashTenderAgent · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
import os
import json
import requests
import xml.etree.ElementTree as ET
from datetime import datetime
from dotenv import load_dotenv
from bs4 import BeautifulSoup
import joblib
import numpy as np
from scipy.sparse import hstack, csr_matrix
from keywords_file import keywords
from datetime import date


load_dotenv()
OUTPUT_DIR = "TenderAusAgent_logs"
os.makedirs(OUTPUT_DIR, exist_ok=True)
LOG_FILE = os.path.join(OUTPUT_DIR, f"tender_log_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt")

HUBSPOT_TOKEN = os.getenv("HUBSPOT_TOKEN")
NOTION_TOKEN = os.getenv("NOTION_TOKEN", "").strip()
NOTION_DB_ID = os.getenv("NOTION_DB_ID", "").strip()

RSS_URLS = ["https://www.tenders.gov.au/public_data/rss/rss.xml"]

# Load ML model and vectorizer
MODEL_PATH = "RSS_tender_relevance_model.pkl"
VECTORIZER_PATH = "RSS_tfidf_vectorizer.pkl"

model = joblib.load(MODEL_PATH)
vectorizer = joblib.load(VECTORIZER_PATH)


HEADERS = {
    'User-Agent': 'Mozilla/5.0 (compatible; TenderBot/1.0; +https://unleashlive.com)'
}

# logging functions (from gpt)
def log(message: str):
    """Simple console + file logger."""
    print(message)
    with open(LOG_FILE, "a") as f:
        f.write(f"[{datetime.now()}] {message}\n")


def safe_get(element, tag, default=""):
    """Safely extract text from an XML element."""
    try:
        node = element.find(tag)
        return node.text.strip() if node is not None and node.text else default
    except Exception:
        return default


def calculate_keyword_scores(tender):
    """Calculate weighted keyword score."""
    text = (tender.get('title', '') + " " + tender.get('description', '')).lower()
    matched = []
    total = 0

    # changes KEYWORDS to keywords to bring from other file
    for keyword, weight in keywords.items():
        if keyword in text:
            matched.append(f"{keyword} (x{weight})")
            total += weight

    return total, matched


def get_RSS():
    """
    Get the RSS feed and filter the data that come through it.
    """
    found_matches = []
    processed_log = "processed_links.txt"

    # look for processed links file, this is how we know if the tenders are new or not
    # via the tender url link (as unique, acts as primary key)
    try:
        with open(processed_log, "r") as f:
            processed_links = {line.strip() for line in f}
    except FileNotFoundError:
        processed_links = set()
        log("No previous log file found, starting fresh.")

    # go through RSS urls and get the data
    for rss_url in RSS_URLS:
        log(f"Fetching RSS feed: {rss_url}")
        try:
            resp = requests.get(rss_url, headers=HEADERS, timeout=15)
            resp.raise_for_status()
        except requests.RequestException as e:
            log(f"ERROR fetching RSS feed: {e}")
            continue

        try:
            root = ET.fromstring(resp.text)
            items = root.findall(".//item")
        except ET.ParseError as e:
            log(f"ERROR parsing RSS XML: {e}")
            continue

        # from the RSS take the information available
        # from the RSS we only get a summary, a link and a title
        # start filtering the data
        for item in items:
            try:
                link = safe_get(item, "link")
                if not link or link in processed_links:
                    continue

                title = safe_get(item, "title")
                description = safe_get(item, "description")

                total_score, matched = calculate_keyword_scores({
                    "title": title,
                    "description": description
                })

                if total_score >= 10:
                    found_matches.append({
                        "title": title,
                        "description": description,
                        "url": link,
                        "total_score": total_score,
                        "matched_keywords": matched
                    })
                    processed_links.add(link)

            except Exception as e:
                log(f"ERROR processing item: {e}")
                continue

    # Update log file of processed links
    try:
        with open(processed_log, "a") as f:
            for tender in found_matches:
                f.write(tender["url"] + "\n")
    except Exception as e:
        log(f"ERROR updating processed links file: {e}")

    log(f"Found {len(found_matches)} new tenders.")
    return sorted(found_matches, key=lambda x: x["total_score"], reverse=True)


def post_to_hubspot(tender):
    if not HUBSPOT_TOKEN:
        log("Missing HubSpot token.")
        return False

    url = "https://api.hubspot.com/crm/v3/objects/deals"
    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {HUBSPOT_TOKEN}"}

    # for title I want to only display after the :
    # for desctiption I want to remove the <p> from the front and the </p> from the back
    payload = {
        "properties": {
            # split the title into 2 parts, before the : and after (if there is one)
            # return the second half
            "dealname": tender.get("Title", "No Title").split(":", 1)[1].strip() if ":" in tender.get("Title", "") else tender.get("Title", "No Title"),
            "dealstage": "appointmentscheduled",
            # remove the <p> and </p> if they are present
            "tender_description": tender.get("description", "")[3:-4] if tender.get("description", "").startswith("<p>") and tender.get("description", "").endswith("</p>") else tender.get("description", ""),
            "keyword_score": float(tender.get("keyword_score", 0)),

            "url": tender.get("url", "N/A"),

            "ml_recommendation": str(tender['ml_recommendation']),

            "date_added": date.today().isoformat()
        }
    }

    try:
        resp = requests.post(url, headers=headers, json=payload, timeout=15)
        if resp.status_code == 201:
            log(f"HubSpot: Created deal for '{tender['Title']}'")
            return True
        else:
            log(f"HubSpot error {resp.status_code}: {resp.text}")
            return False
    except requests.RequestException as e:
        log(f"ERROR posting to HubSpot: {e}")
        return False


def post_to_notion(tender):
    if not NOTION_TOKEN:
        log("Missing Notion token.")
        return False

    url = "https://api.notion.com/v1/pages"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {NOTION_TOKEN}",
        "Notion-Version": "2022-06-28"
    }

    # for title I want to only display after the :
    # for desctiption I want to remove the <p> from the front and the </p> from the back
    payload = {
        "parent": {"database_id": NOTION_DB_ID},
        "properties": {
            "Title": {
                "title": [
                    {
                        "text": {
                            "content": (
                                tender.get("Title", "No Title").split(":", 1)[1].strip()
                                if ":" in tender.get("Title", "")
                                else tender.get("Title", "No Title")
                            )
                        }
                    }
                ]
            },
            "tenderdecription": {
                "rich_text": [
                    {
                        "text": {
                            "content": (
                                tender.get("description", "")
                                .removeprefix("<p>")
                                .removesuffix("</p>")[:2000]
                            )
                        }
                    }
                ]
            },
            "keyword_score": {"number": float(tender.get("keyword_score", 0))},
            "url": {"url": tender.get("url", "N/A")},

            "MLRecommendation": {
                "rich_text": [{"text": {"content": str(tender.get("ml_recommendation", ""))}}]
            },

            "date_added": {
                "date": {"start": date.today().isoformat()}
            }
        }
    }

    try:
        resp = requests.post(url, headers=headers, json=payload, timeout=15)
        if resp.status_code in (200, 201):
            log(f"Notion: Added '{tender['Title']}'")
            return True
        else:
            log(f"Notion error {resp.status_code}: {resp.text}")
            return False
    except requests.RequestException as e:
        log(f"ERROR posting to Notion: {e}")
        return False


def formatTender(tender):
    """Format the data so it is better readable for a machine learning model"""
    # Remove unwanted HTML tags etc from the description
    soup = BeautifulSoup(tender.get("description", ""), "html.parser")
    description = soup.get_text(strip=True)

    formatted_tender = {
        # title
        "Title": (
            tender.get("title", "No Title").split(":", 1)[1].strip()
            if ":" in tender.get("title", "")
            else tender.get("title", "No Title")
        ),

        # description
        "description": description[:2000],

        # keyword score
        "keyword_score": float(tender.get("total_score", 0)),
        # url
        "url": tender.get("url", "N/A"),
    }
    return formatted_tender


def predict_tender_relevance(tender, model, vectorizer):
    """
    Given a tender dict with title, description, and keyword_score,
    return True/False for whether it's relevant.
    """
    text = tender.get('Title', '') + ' ' + tender.get('description', '')
    X_text = vectorizer.transform([text])
    X_kw = np.array([[tender.get('keyword_score', 0)]])  # keyword score feature
    X_combined = hstack([X_text, csr_matrix(X_kw)])

    prediction = model.predict(X_combined)[0]
    # prob = model.predict_proba(X_combined)[0][1]  # optional probability
    return bool(prediction)


def main():
    log("=== Tender Keyword Scanner Started ===")
    tenders = get_RSS()

    if not tenders:
        log("No tenders found in RSS feed.")
        return

    for tender in tenders:
        formatted = formatTender(tender)
        is_relevant = predict_tender_relevance(formatted, model, vectorizer)
        formatted["ml_recommendation"] = f"{'True' if is_relevant else 'False'}"

        log(f"Prediction for '{tender['title']}': {is_relevant}")

        if is_relevant or tender.get('total_score', 0) > 10:
            post_to_hubspot(formatted)
            post_to_notion(formatted)

    log("=== Tender Keyword Scanner Completed ===")


if __name__ == "__main__":
    main()