From fc9c2a22f48b7598046eafaa0faf8f0f0e0870d1 Mon Sep 17 00:00:00 2001
From: Corinne H
Date: Sat, 1 Nov 2025 22:59:42 +0000
Subject: [PATCH 1/3] initial work

---
 api/app/services/context_aggregator.py | 11 +++++++++++
 api/requirements.txt                   |  1 +
 2 files changed, 12 insertions(+)

diff --git a/api/app/services/context_aggregator.py b/api/app/services/context_aggregator.py
index e1bf49d..268749c 100644
--- a/api/app/services/context_aggregator.py
+++ b/api/app/services/context_aggregator.py
@@ -5,6 +5,8 @@ from collections.abc import Mapping
 from datetime import date
 from typing import Any, Optional
 
+from openai import OpenAI
+
 import requests
 from dotenv import load_dotenv
 
@@ -48,6 +50,15 @@ def _estimate_season(target_date: date) -> str:
         return "summer"
     return "autumn"
 
+# Starter function for twitter data
+def get_events_from_twitter():
+    tweets = _tweets_get_tweets(30)
+    response = OpenAI().responses.create(
+        model="gpt-5",
+        input=f"Given these tweets {tweets}, can you return a list in the format..."
+    )
+    # TODO: add coordinates?
+
 class ContextAggregator:
     """Collects context data used to tailor event recommendations."""
 
diff --git a/api/requirements.txt b/api/requirements.txt
index 7886e66..693600f 100644
--- a/api/requirements.txt
+++ b/api/requirements.txt
@@ -5,6 +5,7 @@ requests>=2.31.0
 python-dotenv>=1.0.0
 langchain>=1.0.3
 langchain-openai>=1.0.1
+openai>=1.0.0
 beautifulsoup4>=4.12.0
 lxml>=4.9.0
 python-dateutil>=2.8.2

From c74ee3f2534146ed258fd52c4e27cde1705b9278 Mon Sep 17 00:00:00 2001
From: Corinne H
Date: Sun, 2 Nov 2025 00:14:49 +0000
Subject: [PATCH 2/3] fixed import error

---
 api/app/services/context_aggregator.py  | 26 ++++++++++++++-----------
 api/app/utils/scrapers/scrape_tweets.py | 12 ++++++++----
 2 files changed, 23 insertions(+), 15 deletions(-)

diff --git a/api/app/services/context_aggregator.py b/api/app/services/context_aggregator.py
index ec99692..e74c049 100644
--- a/api/app/services/context_aggregator.py
+++ b/api/app/services/context_aggregator.py
@@ -5,8 +5,6 @@ from collections.abc import Mapping
 from datetime import date
 from typing import Any, Optional
 
-from openai import OpenAI
-
 import requests
 from dotenv import load_dotenv
 
@@ -50,15 +48,6 @@ def _estimate_season(target_date: date) -> str:
         return "summer"
     return "autumn"
 
-# Starter function for twitter data
-def get_events_from_twitter():
-    tweets = _tweets_get_tweets(30)
-    response = OpenAI().responses.create(
-        model="gpt-5",
-        input=f"Given these tweets {tweets}, can you return a list in the format..."
-    )
-    # TODO: add coordinates?
-
 class ContextAggregator:
     """Collects context data used to tailor event recommendations."""
 
@@ -205,6 +194,19 @@ def gather_context(
         else:
             logger.debug("Eventbrite scraper not available; skipping Eventbrite events")
 
+        # Optionally include tweets when the optional scraper
+        # dependency is available and credentials are configured.
+        tweets: list[Any] = []
+        tweets_error: Optional[str] = None
+        if _tweets_get_tweets is not None:
+            try:
+                tweets = _tweets_get_tweets()
+            except Exception as exc:  # pragma: no cover - network/env dependent
+                logger.warning("Tweet fetch failed: %s", exc)
+                tweets_error = str(exc)
+        else:
+            logger.debug("Tweet scraper not available; skipping tweets")
+
         return {
             "date": resolved_date.isoformat(),
             "preferences": normalized_preferences,
@@ -214,4 +216,6 @@
             "festival_events": festival_events,
             "eventbrite_events": eventbrite_events,
             "eventbrite_error": eventbrite_error,
+            "tweets": tweets,
+            "tweets_error": tweets_error,
         }
diff --git a/api/app/utils/scrapers/scrape_tweets.py b/api/app/utils/scrapers/scrape_tweets.py
index 19e8816..b2badba 100644
--- a/api/app/utils/scrapers/scrape_tweets.py
+++ b/api/app/utils/scrapers/scrape_tweets.py
@@ -12,15 +12,14 @@
 
 # Add parent directory to path to enable imports
 script_dir = Path(__file__).resolve().parent
-api_dir = script_dir.parent.parent
+api_dir = script_dir.parent.parent.parent
 sys.path.insert(0, str(api_dir))
 
 from app.core.database import SessionLocal, engine, Base
 from app.models.tweet import Tweet
 
 # Load .env file from the root directory
-env_path = api_dir.parent / ".env"
-load_dotenv(dotenv_path=env_path)
+load_dotenv()
 
 API_URL = "https://api.x.com/2/tweets/search/recent"
 
@@ -287,7 +286,7 @@ def get_tweets(limit=10, threshold_hours_for_refresh=2):
 
     # Refresh data if needed
     if needs_refresh:
-        write_tweets_to_db(limit)
+        try:
+            write_tweets_to_db(limit)
+        except ValueError as e:
+            # API key not configured, return empty list
+            print(f"Cannot fetch tweets: {e}")
+            return []
 
     # Fetch and return tweets from database
     db = SessionLocal()

From ed65ffc755c9e68892955af1f0d5538efb077ec9 Mon Sep 17 00:00:00 2001
From: Corinne H
Date: Sun, 2 Nov 2025 00:33:04 +0000
Subject: [PATCH 3/3] tried to get twitter data to work

---
 api/app/services/context_aggregator.py  |  29 +++--
 api/app/utils/scrapers/scrape_tweets.py | 143 +++++++++++++++++++++---
 2 files changed, 140 insertions(+), 32 deletions(-)

diff --git a/api/app/services/context_aggregator.py b/api/app/services/context_aggregator.py
index e74c049..ef7480b 100644
--- a/api/app/services/context_aggregator.py
+++ b/api/app/services/context_aggregator.py
@@ -3,7 +3,7 @@
 import logging
 import os
 from collections.abc import Mapping
-from datetime import date
+from datetime import date, datetime
 from typing import Any, Optional
 
 import requests
@@ -21,10 +21,10 @@
 
 try:  # pragma: no cover - optional dependency may fail at import time
     from ..utils.scrapers.scrape_tweets import (  # type: ignore[attr-defined]
-        get_tweets as _tweets_get_tweets,
+        get_events as _tweets_get_events,
     )
 except Exception as exc:  # pragma: no cover
-    _tweets_get_tweets = None  # type: ignore[assignment]
+    _tweets_get_events = None  # type: ignore[assignment]
     logger.debug("Tweet scraper import failed: %s", exc)
 
 from .scrapers import (
@@ -48,7 +48,6 @@ def _estimate_season(target_date: date) -> str:
         return "summer"
     return "autumn"
 
-
 class ContextAggregator:
     """Collects context data used to tailor event recommendations."""
 
@@ -193,19 +192,19 @@ def gather_context(
             eventbrite_error = str(exc)
         else:
             logger.debug("Eventbrite scraper not available; skipping Eventbrite events")
-
-        # Optionally include tweets when the optional scraper
+
+        # Optionally include events from Twitter API when the optional scraper
         # dependency is available and credentials are configured.
-        tweets: list[Any] = []
-        tweets_error: Optional[str] = None
-        if _tweets_get_tweets is not None:
+        twitter_events: list[Any] = []
+        twitter_error: Optional[str] = None
+        if _tweets_get_events is not None:
             try:
-                tweets = _tweets_get_tweets()
+                twitter_events = _tweets_get_events()
             except Exception as exc:  # pragma: no cover - network/env dependent
-                logger.warning("Tweet fetch failed: %s", exc)
-                tweets_error = str(exc)
+                logger.warning("Twitter fetch failed: %s", exc)
+                twitter_error = str(exc)
         else:
-            logger.debug("Tweet scraper not available; skipping tweets")
+            logger.debug("Twitter scraper not available; skipping Twitter events")
 
         return {
             "date": resolved_date.isoformat(),
@@ -216,6 +215,6 @@
             "festival_events": festival_events,
             "eventbrite_events": eventbrite_events,
             "eventbrite_error": eventbrite_error,
-            "tweets": tweets,
-            "tweets_error": tweets_error,
+            "twitter_events": twitter_events,
+            "twitter_error": twitter_error,
         }
diff --git a/api/app/utils/scrapers/scrape_tweets.py b/api/app/utils/scrapers/scrape_tweets.py
index b2badba..46d27e3 100644
--- a/api/app/utils/scrapers/scrape_tweets.py
+++ b/api/app/utils/scrapers/scrape_tweets.py
@@ -1,19 +1,24 @@
 #!/usr/bin/env python3
 """Scrape tweets from X API and store them in the database."""
 
+import argparse
+import json
+import logging
 import os
 import sys
-import argparse
-import requests
 import time
-from datetime import datetime, timezone
+from datetime import datetime, timezone, date
 from pathlib import Path
+
+import requests
 from dotenv import load_dotenv
+from openai import OpenAI
 
 # Add parent directory to path to enable imports
 script_dir = Path(__file__).resolve().parent
 api_dir = script_dir.parent.parent.parent
 sys.path.insert(0, str(api_dir))
+logger = logging.getLogger(__name__)
 
 from app.core.database import SessionLocal, engine, Base
 from app.models.tweet import Tweet
@@ -30,14 +35,21 @@ def _require_api_key() -> str:
         raise ValueError("Please set the X_API_KEY environment variable.")
     return api_key
 
+def _require_openai_api_key() -> str:
+    """Return the configured OPENAI API key or raise a helpful error."""
+    api_key = os.getenv("OPENAI_API_KEY")
+    if not api_key:
+        raise ValueError("Please set the OPENAI_API_KEY environment variable.")
+    return api_key
+
 # Keywords that tend to indicate real-world activities
 # Prioritized list kept under query length limit (512 chars)
 # Multi-word terms are stored without quotes - will be quoted in build_query()
 ACTIVITY_KEYWORDS = [
     "event", "festival", "concert", "show",
-    "comedy", "market", "food market", "brunch", "dinner",
-    "exhibition", "museum", "gallery", "workshop", "meetup",
-    "hike", "walk", "running",
+    "today", "market", "restaurant",
+    "museum", "gallery", "workshop", "meetup",
+    "hike", "walk", "park", "tour", "popup",
 ]
 
@@ -161,18 +173,18 @@ def write_tweets_to_db(limit=10):
     # Validate query length (X API v2 has a 512 character limit)
     query_length = len(query)
     if query_length > 512:
-        print(f"āš ļø Warning: Query length ({query_length} chars) exceeds X API limit (512 chars)")
-        print(f"Query preview: {query[:100]}...")
-        print("Attempting to send anyway, but may fail.\n")
+        logger.warning(f"Query length ({query_length} chars) exceeds X API limit (512 chars)")
+        logger.warning(f"Query preview: {query[:100]}...")
+        logger.warning("Attempting to send anyway, but may fail.")
     else:
-        print(f"āœ“ Query length: {query_length} characters (limit: 512)\n")
+        logger.info(f"Query length: {query_length} characters (limit: 512)")
 
     total = 0
     stored = 0
     next_token = None
     seen = set()
 
-    print(f"šŸ” Searching for up to {limit} tweets about activities in Edinburgh...\n")
+    logger.info(f"Searching for up to {limit} tweets about activities in Edinburgh...")
 
     # Create database session
     db = SessionLocal()
@@ -215,7 +227,7 @@
                 db.add(tweet)
                 stored += 1
 
-                print(f"@{username}: {text}\n→ {url}\n")
+                logger.debug(f"@{username}: {text} → {url}")
 
                 total += 1
                 if total >= limit:
                     break
@@ -228,11 +240,11 @@
             if not next_token:
                 break
 
-        print(f"Done — fetched {total} tweets, stored {stored} in database.\n")
+        logger.info(f"Done — fetched {total} tweets, stored {stored} in database.")
 
     except Exception as e:
         db.rollback()
-        print(f"Error: {e}")
+        logger.error(f"Error: {e}")
         raise
     finally:
         db.close()
@@ -269,7 +281,7 @@
 
     needs_refresh = False
     if last_scrape is None:
-        print("No tweets in database. Fetching new tweets...")
+        logger.info("No tweets in database. Fetching new tweets...")
         needs_refresh = True
     else:
         # Calculate time since last scrape
@@ -281,7 +293,7 @@
         hours_since_scrape = time_since_scrape.total_seconds() / 3600
 
         if hours_since_scrape > threshold_hours_for_refresh:
-            print(f"Last scrape was {hours_since_scrape:.1f} hours ago (threshold: {threshold_hours_for_refresh} hours). Fetching new tweets...")
+            logger.info(f"Last scrape was {hours_since_scrape:.1f} hours ago (threshold: {threshold_hours_for_refresh} hours). Fetching new tweets...")
             needs_refresh = True
 
     # Refresh data if needed
@@ -290,7 +302,7 @@
             write_tweets_to_db(limit)
         except ValueError as e:
             # API key not configured, return empty list
-            print(f"Cannot fetch tweets: {e}")
+            logger.warning(f"Cannot fetch tweets: {e}")
             return []
 
     # Fetch and return tweets from database
@@ -301,6 +313,103 @@
     finally:
         db.close()
 
+def get_events(limit=30, threshold_hours_for_refresh=2):
+    """
+    Convert tweets to structured event data using OpenAI API.
+
+    Args:
+        limit: Number of tweets to process (default: 30)
+        threshold_hours_for_refresh: Number of hours before data is considered stale (default: 2)
+
+    Returns:
+        List of event dictionaries with location_name, activity_name, time, and url fields
+    """
+    try:
+        # Get tweets from database
+        tweets = get_tweets(limit, threshold_hours_for_refresh)
+
+        if not tweets:
+            logger.debug("No tweets retrieved")
+            return []
+
+        # Get OpenAI API key
+        openai_api_key = _require_openai_api_key()
+
+        # Initialize OpenAI client
+        client = OpenAI(api_key=openai_api_key)
+
+        # Get today's date for filtering
+        today = date.today().isoformat()
+
+        # Convert tweets to JSON dictionaries
+        tweets_json = [
+            {
+                "text": tweet.text,
+                "like_count": tweet.like_count,
+                "retweet_count": tweet.retweet_count,
+                "created_at": tweet.created_at.isoformat() if tweet.created_at else None,
+                "scraped_at": tweet.scraped_at.isoformat() if tweet.scraped_at else None,
+            }
+            for tweet in tweets
+        ]
+
+        # Format tweets as JSON string for the prompt
+        tweets_json_str = json.dumps(tweets_json, indent=2)
+
+        # Create prompt for OpenAI
+        prompt = f"""Given the following tweets (in JSON format), extract any events mentioned that are happening TODAY ({today}).
+
+For each event, return a JSON object with these fields:
+- location_name: The venue or location name (optional)
+- activity_name: The event name/title (optional)
+- time: (optional) The event time in ISO 8601 format (YYYY-MM-DDTHH:MM:SS+00:00). If the time is not available in the tweet, omit this field or set it to null.
+- url: The event URL if available, otherwise null
+
+If the event includes a date, make sure that it's for today's date ({today}). If the tweet does not include a date and is an activity that could be done any day (such as a bar or restaurant), include it and set the time field to null.
+Exclude any events that explicitly include a date that is not today's date ({today}).
+Return ONLY a JSON object with a single "events" key containing an array of event objects, with no additional text or formatting.
+If any of these fields are missing, just return the information you can find.
+
+Tweets:
+{tweets_json_str}"""
+
+        # Call OpenAI API
+        response = client.chat.completions.create(
+            model="gpt-4o",
+            messages=[
+                {"role": "system", "content": "You are a helpful assistant that extracts structured event data from tweets. Return only valid JSON."},
+                {"role": "user", "content": prompt}
+            ],
+            temperature=0.3,
+            response_format={"type": "json_object"}
+        )
+
+        # Parse the response
+        content = response.choices[0].message.content
+        if not content:
+            logger.debug("Empty response from OpenAI")
+            return []
+
+        # The response_format ensures we get JSON, but it might be wrapped
+        parsed = json.loads(content)
+
+        # Handle different response formats
+        if isinstance(parsed, list):
+            events = parsed
+        elif isinstance(parsed, dict) and "events" in parsed:
+            events = parsed["events"]
+        else:
+            logger.debug(f"Unexpected response format from OpenAI: {parsed}")
+            return []
+
+        logger.debug(f"Extracted {len(events)} events from tweets")
+        return events
+
+    except Exception as exc:
+        logger.debug(f"Failed to extract events from tweets: {exc}")
+        return []
+
+
 def main():
     """Main function to scrape tweets and store in database."""
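A note for reviewers on the new get_events() contract: the returned events are model-generated JSON, so downstream consumers should treat every field as optional and validate before use. Below is a minimal sketch of the kind of defensive normalization a caller such as gather_context could apply. It is not part of the patches, and the _normalize_event helper name is hypothetical.

    from datetime import datetime
    from typing import Any, Optional

    def _normalize_event(raw: dict[str, Any]) -> Optional[dict[str, Any]]:
        """Hypothetical helper: coerce one model-extracted event into a safe shape.

        Returns None when the entry has neither a location nor an activity name,
        since such an event is not actionable for recommendations.
        """
        location = raw.get("location_name") or None
        activity = raw.get("activity_name") or None
        if not (location or activity):
            return None

        # The prompt asks for ISO 8601 times, but the model may still emit
        # free-form text; keep the value only if it actually parses.
        parsed_time = None
        time_value = raw.get("time")
        if isinstance(time_value, str):
            try:
                parsed_time = datetime.fromisoformat(time_value).isoformat()
            except ValueError:
                parsed_time = None

        url = raw.get("url")
        return {
            "location_name": location,
            "activity_name": activity,
            "time": parsed_time,
            "url": url if isinstance(url, str) else None,
        }

    # Example usage: the second entry is dropped because it has no name or location.
    raw_events = [
        {"activity_name": "Pop-up market", "time": "2025-11-02T10:00:00+00:00"},
        {"url": "https://example.com"},
    ]
    events = [e for e in (_normalize_event(r) for r in raw_events) if e is not None]

Dropping entries with neither a name nor a location keeps unusable extractions out of the recommendation context, and invalid timestamps degrade to null instead of raising downstream.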