22 changes: 18 additions & 4 deletions api/app/services/context_aggregator.py
@@ -3,7 +3,7 @@
import logging
import os
from collections.abc import Mapping
from datetime import date
from datetime import date, datetime
from typing import Any, Optional

import requests
@@ -21,10 +21,10 @@

try: # pragma: no cover - optional dependency may fail at import time
from ..utils.scrapers.scrape_tweets import ( # type: ignore[attr-defined]
get_tweets as _tweets_get_tweets,
get_events as _tweets_get_events,
)
except Exception as exc: # pragma: no cover
_tweets_get_tweets = None # type: ignore[assignment]
_tweets_get_events = None # type: ignore[assignment]
logger.debug("Tweet scraper import failed: %s", exc)

from .scrapers import (
@@ -48,7 +48,6 @@ def _estimate_season(target_date: date) -> str:
return "summer"
return "autumn"


class ContextAggregator:
"""Collects context data used to tailor event recommendations."""

@@ -193,6 +192,19 @@ def gather_context(
eventbrite_error = str(exc)
else:
logger.debug("Eventbrite scraper not available; skipping Eventbrite events")

# Optionally include events from the Twitter API when the scraper dependency
# is available and credentials are configured.
twitter_events: list[Any] = []
twitter_error: Optional[str] = None
if _tweets_get_events is not None:
try:
twitter_events = _tweets_get_events()
except Exception as exc: # pragma: no cover - network/env dependent
logger.warning("Twitter fetch failed: %s", exc)
twitter_error = str(exc)
else:
logger.debug("Twitter scraper not available; skipping Twitter events")

return {
"date": resolved_date.isoformat(),
@@ -203,4 +215,6 @@ def gather_context(
"festival_events": festival_events,
"eventbrite_events": eventbrite_events,
"eventbrite_error": eventbrite_error,
"twitter_events": twitter_events,
"twitter_error": twitter_error
}
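For reference, a minimal sketch of how a caller might consume the two new keys returned by gather_context(). This is illustrative only: it assumes ContextAggregator can be instantiated and gather_context() called without arguments, which this diff does not show.

from app.services.context_aggregator import ContextAggregator

# Sketch: read the new twitter_events / twitter_error keys added above.
context = ContextAggregator().gather_context()
if context["twitter_error"]:
    # A failed or unavailable Twitter fetch is reported here instead of raising;
    # twitter_events stays an empty list in that case.
    print("Twitter fetch skipped or failed:", context["twitter_error"])
for event in context["twitter_events"]:
    # Each entry is a dict produced by get_events(); all fields are optional.
    print(event.get("activity_name"), event.get("location_name"), event.get("time"))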
153 changes: 133 additions & 20 deletions api/app/utils/scrapers/scrape_tweets.py
@@ -1,26 +1,30 @@
#!/usr/bin/env python3
"""Scrape tweets from X API and store them in the database."""

import argparse
import json
import logging
import os
import sys
import argparse
import requests
import time
from datetime import datetime, timezone
from datetime import datetime, timezone, date
from pathlib import Path

import requests
from dotenv import load_dotenv
from openai import OpenAI

# Add parent directory to path to enable imports
script_dir = Path(__file__).resolve().parent
api_dir = script_dir.parent.parent
api_dir = script_dir.parent.parent.parent
sys.path.insert(0, str(api_dir))
logger = logging.getLogger(__name__)

from app.core.database import SessionLocal, engine, Base
from app.models.tweet import Tweet

# Load .env file from the root directory
env_path = api_dir.parent / ".env"
load_dotenv(dotenv_path=env_path)
load_dotenv()

API_URL = "https://api.x.com/2/tweets/search/recent"

@@ -31,14 +35,21 @@ def _require_api_key() -> str:
raise ValueError("Please set the X_API_KEY environment variable.")
return api_key

def _require_openai_api_key() -> str:
"""Return the configured OPENAI API key or raise a helpful error."""
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
raise ValueError("Please set the OPENAI_API_KEY environment variable.")
return api_key

# Keywords that tend to indicate real-world activities
# Prioritized list kept under query length limit (512 chars)
# Multi-word terms are stored without quotes - will be quoted in build_query()
ACTIVITY_KEYWORDS = [
"event", "festival", "concert", "show",
"comedy", "market", "food market", "brunch", "dinner",
"exhibition", "museum", "gallery", "workshop", "meetup",
"hike", "walk", "running",
"today", "market", "market", "restauraunt",
"restauraunt", "museum", "gallery", "workshop", "meetup",
"hike", "walk",
"park", "tour", "popup"
]

@@ -162,18 +173,18 @@ def write_tweets_to_db(limit=10):
# Validate query length (X API v2 has a 512 character limit)
query_length = len(query)
if query_length > 512:
print(f"⚠️ Warning: Query length ({query_length} chars) exceeds X API limit (512 chars)")
print(f"Query preview: {query[:100]}...")
print("Attempting to send anyway, but may fail.\n")
logger.warning(f"Query length ({query_length} chars) exceeds X API limit (512 chars)")
logger.warning(f"Query preview: {query[:100]}...")
logger.warning("Attempting to send anyway, but may fail.")
else:
print(f"Query length: {query_length} characters (limit: 512)\n")
logger.info(f"Query length: {query_length} characters (limit: 512)")

total = 0
stored = 0
next_token = None
seen = set()

print(f"🔍 Searching for up to {limit} tweets about activities in Edinburgh...\n")
logger.info(f"Searching for up to {limit} tweets about activities in Edinburgh...")

# Create database session
db = SessionLocal()
@@ -216,7 +227,7 @@ def write_tweets_to_db(limit=10):
db.add(tweet)
stored += 1

print(f"@{username}: {text}\n→ {url}\n")
logger.debug(f"@{username}: {text} → {url}")
total += 1

if total >= limit:
@@ -229,11 +240,11 @@ def write_tweets_to_db(limit=10):
if not next_token:
break

print(f"Done — fetched {total} tweets, stored {stored} in database.\n")
logger.info(f"Done — fetched {total} tweets, stored {stored} in database.")

except Exception as e:
db.rollback()
print(f"Error: {e}")
logger.error(f"Error: {e}")
raise
finally:
db.close()
@@ -270,7 +281,7 @@ def get_tweets(limit=10, threshold_hours_for_refresh=2):
needs_refresh = False

if last_scrape is None:
print("No tweets in database. Fetching new tweets...")
logger.info("No tweets in database. Fetching new tweets...")
needs_refresh = True
else:
# Calculate time since last scrape
@@ -282,12 +293,17 @@ def get_tweets(limit=10, threshold_hours_for_refresh=2):
hours_since_scrape = time_since_scrape.total_seconds() / 3600

if hours_since_scrape > threshold_hours_for_refresh:
print(f"Last scrape was {hours_since_scrape:.1f} hours ago (threshold: {threshold_hours_for_refresh} hours). Fetching new tweets...")
logger.info(f"Last scrape was {hours_since_scrape:.1f} hours ago (threshold: {threshold_hours_for_refresh} hours). Fetching new tweets...")
needs_refresh = True

# Refresh data if needed
if needs_refresh:
write_tweets_to_db(limit)
try:
write_tweets_to_db(limit)
except ValueError as e:
# API key not configured, return empty list
logger.warning(f"Cannot fetch tweets: {e}")
return []

# Fetch and return tweets from database
db = SessionLocal()
@@ -297,6 +313,103 @@ def get_tweets(limit=10, threshold_hours_for_refresh=2):
finally:
db.close()

def get_events(limit=30, threshold_hours_for_refresh=2):
"""
Convert tweets to structured event data using OpenAI API.

Args:
limit: Number of tweets to process (default: 30)
threshold_hours_for_refresh: Number of hours before data is considered stale (default: 2)

Returns:
List of event dictionaries with location_name, activity_name, time, and url fields
"""
try:
# Get tweets from database
tweets = get_tweets(limit, threshold_hours_for_refresh)

if not tweets:
logger.debug("No tweets retrieved")
return []

# Get OpenAI API key
openai_api_key = _require_openai_api_key()

# Initialize OpenAI client
client = OpenAI(api_key=openai_api_key)

# Get today's date for filtering
today = date.today().isoformat()

# Convert tweets to JSON dictionaries
tweets_json = [
{
"text": tweet.text,
"like_count": tweet.like_count,
"retweet_count": tweet.retweet_count,
"created_at": tweet.created_at.isoformat() if tweet.created_at else None,
"scraped_at": tweet.scraped_at.isoformat() if tweet.scraped_at else None,
}
for tweet in tweets
]

# Format tweets as JSON string for the prompt
tweets_json_str = json.dumps(tweets_json, indent=2)

# Create prompt for OpenAI
prompt = f"""Given the following tweets (in JSON format), extract any events mentioned that are happening TODAY ({today}).

For each event, return a JSON object with these fields:
- location_name: The venue or location name (optional)
- activity_name: The event name/title (optional)
- time: (optional) The event time in ISO 8601 format (YYYY-MM-DDTHH:MM:SS+00:00). If the time is not available in the tweet, omit this field or set it to null.
- url: The event URL if available, otherwise null

If the event includes a date, make sure that it's for today's date ({today}). If the tweet does not include a date and is an activity that could be done any day (such as a bar or restaurant), include it and set the time field to null.
Exclude any events that explicitly include a date that is not today's date ({today}).
Return ONLY a JSON array of event objects, with no additional text or formatting.
If any of these fields are missing, just return the information you can find.

Tweets:
{tweets_json_str}"""

# Call OpenAI API
response = client.chat.completions.create(
model="gpt-4o",
messages=[
{"role": "system", "content": "You are a helpful assistant that extracts structured event data from tweets. Return only valid JSON arrays."},
{"role": "user", "content": prompt}
],
temperature=0.3,
response_format={"type": "json_object"}
)

# Parse the response
content = response.choices[0].message.content
if not content:
logger.debug("Empty response from OpenAI")
return []

# The response_format ensures we get JSON, but it might be wrapped
parsed = json.loads(content)

# Handle different response formats
if isinstance(parsed, list):
events = parsed
elif isinstance(parsed, dict) and "events" in parsed:
events = parsed["events"]
else:
logger.debug(f"Unexpected response format from OpenAI: {parsed}")
return []

logger.debug(f"Extracted {len(events)} events from tweets")
return events

except Exception as exc:
logger.debug(f"Failed to extract events from tweets: {exc}")
return []



def main():
"""Main function to scrape tweets and store in database."""
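For reference, a minimal usage sketch for the new get_events() helper. It assumes X_API_KEY and OPENAI_API_KEY are set in the environment and that the database is reachable; the example field values in the comment are hypothetical, not taken from real output.

from app.utils.scrapers.scrape_tweets import get_events

# Falls back to [] if tweets cannot be fetched, the OpenAI key is missing,
# or the model response cannot be parsed; otherwise returns dicts shaped like:
#   {"location_name": "Example Venue", "activity_name": "Pop-up food market",
#    "time": None, "url": "https://example.com/event"}
events = get_events(limit=30, threshold_hours_for_refresh=2)
for event in events:
    print(event.get("activity_name"), event.get("url"))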
1 change: 1 addition & 0 deletions api/requirements.txt
@@ -5,6 +5,7 @@ requests>=2.31.0
python-dotenv>=1.0.0
langchain>=1.0.3
langchain-openai>=1.0.1
openai>=1.0.0
beautifulsoup4>=4.12.0
lxml>=4.9.0
python-dateutil>=2.8.2