diff --git a/api/app/main.py b/api/app/main.py
index 45a866c..35e5cfa 100644
--- a/api/app/main.py
+++ b/api/app/main.py
@@ -1,10 +1,14 @@
-from fastapi import Depends, FastAPI
+from fastapi import Depends, FastAPI, BackgroundTasks, Query
 from fastapi.middleware.cors import CORSMiddleware
+from typing import Optional, List
 
 from .schemas.events import GetEventRecommendationsRequest, GetEventRecommendationsResponse
+from .schemas.tweets import Tweet, TweetList
 from .routers import auth
 from .services.activity_suggestion_generator import ActivitySuggestionGenerator
 from .services.context_aggregator import ContextAggregator
+from .utils.keyword_extractor import extract_keywords_from_events
+from .utils.scrapers.scrape_tweets import search_tweets_for_event, write_tweets_to_db
 
 app = FastAPI(title="Pear Programming API", version="0.1.0")
@@ -45,6 +49,7 @@ def get_activity_suggestion_generator(
 )
 async def get_event_recommendations(
     request: GetEventRecommendationsRequest,
+    background_tasks: BackgroundTasks,
     generator: ActivitySuggestionGenerator = Depends(get_activity_suggestion_generator),
 ) -> GetEventRecommendationsResponse:
     """Return activity recommendations tailored to the caller's preferences."""
@@ -52,4 +57,66 @@ async def get_event_recommendations(
         number_events=request.number_events,
         response_preferences=request.response_preferences,
     )
+
+    # Automatically extract keywords from events and scrape tweets in background
+    # Wrap in try-except to ensure any errors don't affect the main response
+    if events:
+        try:
+            keywords = extract_keywords_from_events(events, max_keywords=30)
+            if keywords:
+                # Create a safe wrapper function that handles exceptions internally
+                def safe_scrape_tweets():
+                    try:
+                        write_tweets_to_db(
+                            limit=20,
+                            activity_keywords=keywords,
+                            location_terms=None  # Use default location terms
+                        )
+                    except Exception as scrape_error:
+                        # Silently fail in background - logging can be added if needed
+                        import logging
+                        logging.getLogger(__name__).debug(
+                            f"Background tweet scraping failed: {scrape_error}"
+                        )
+
+                # Trigger tweet scraping in background (non-blocking)
+                background_tasks.add_task(safe_scrape_tweets)
+        except Exception as e:
+            # Log error but don't fail the request
+            import logging
+            logging.getLogger(__name__).warning(f"Failed to extract keywords or schedule tweet scraping: {e}")
+
     return GetEventRecommendationsResponse(events=events)
+
+
+@app.get(
+    "/tweets",
+    response_model=TweetList,
+    response_model_exclude_none=True,
+    summary="Get tweets for events",
+)
+async def get_tweets(
+    limit: int = Query(default=10, ge=1, le=50, description="Number of tweets to return"),
+    keywords: Optional[str] = Query(default=None, description="Comma-separated keywords to filter tweets"),
+    event_title: Optional[str] = Query(default=None, description="Event title to search tweets for"),
+) -> TweetList:
+    """Retrieve tweets from the database or fetch live tweets for a specific event."""
+    from .utils.scrapers.scrape_tweets import get_tweets as get_tweets_from_db
+
+    filter_keywords = [kw.strip() for kw in keywords.split(",") if kw.strip()] if keywords else None
+
+    if event_title:
+        live_tweets = search_tweets_for_event(
+            event_title,
+            extra_keywords=filter_keywords,
+            limit=limit,
+        )
+        return TweetList(tweets=[Tweet.model_validate(tweet) for tweet in live_tweets])
+
+    tweets = get_tweets_from_db(
+        limit=limit,
+        threshold_hours_for_refresh=2,
+        filter_keywords=filter_keywords,
+    )
+
+    return TweetList(tweets=[Tweet.model_validate(tweet) for tweet in tweets])
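Reviewer note: a minimal sketch of how the new GET /tweets endpoint above could be exercised once this change is applied. The base URL, sample keywords, and event title are assumptions for illustration, not values taken from this diff.

# Example only - assumes the API is running locally on port 8000 and that some tweets are stored.
import requests

# Stored tweets, filtered by comma-separated keywords (mapped to filter_keywords).
resp = requests.get(
    "http://localhost:8000/tweets",
    params={"limit": 5, "keywords": "festival,market"},
    timeout=10,
)
resp.raise_for_status()
for tweet in resp.json()["tweets"]:
    print(tweet["text"], tweet["like_count"])

# Supplying event_title switches to a live, non-persisted X API search.
live = requests.get(
    "http://localhost:8000/tweets",
    params={"limit": 5, "event_title": "Edinburgh Food Festival"},
    timeout=10,
)
print(len(live.json()["tweets"]))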
diff --git a/api/app/schemas/events.py b/api/app/schemas/events.py
index e8d3e71..a7e3af7 100644
--- a/api/app/schemas/events.py
+++ b/api/app/schemas/events.py
@@ -18,7 +18,7 @@ def __eq__(self, other: object) -> bool:  # pragma: no cover - tiny helper
 class Event(BaseModel):
     """Represents a suggested activity."""
 
-    location: Location = Field(..., description="Cartesian location coordinates")
+    location: Location = Field(..., description="Location coordinates where x is latitude and y is longitude")
     name: str = Field(..., description="Human-readable name for the event")
     emoji: str = Field(..., description="Emoji summarizing the event vibe")
     event_score: float = Field(
diff --git a/api/app/schemas/tweets.py b/api/app/schemas/tweets.py
new file mode 100644
index 0000000..79a984b
--- /dev/null
+++ b/api/app/schemas/tweets.py
@@ -0,0 +1,30 @@
+"""Pydantic schemas for tweet responses."""
+
+from __future__ import annotations
+
+from datetime import datetime
+from typing import List, Optional
+
+from pydantic import BaseModel, Field
+
+
+class Tweet(BaseModel):
+    """Tweet model for API responses."""
+
+    id: int = Field(..., description="Tweet database ID")
+    text: str = Field(..., description="Tweet text content")
+    like_count: int = Field(default=0, description="Number of likes")
+    retweet_count: int = Field(default=0, description="Number of retweets")
+    created_at: Optional[datetime] = Field(default=None, description="When the tweet was created on X")
+    scraped_at: datetime = Field(..., description="When the tweet was scraped and stored")
+    username: Optional[str] = Field(default=None, description="X username associated with the tweet")
+
+    class Config:
+        from_attributes = True  # Enable ORM mode for SQLAlchemy models
+
+
+class TweetList(BaseModel):
+    """Response containing a list of tweets."""
+
+    tweets: List[Tweet] = Field(default_factory=list, description="List of tweets")
diff --git a/api/app/services/llm.py b/api/app/services/llm.py
index d9f4ad3..823905e 100644
--- a/api/app/services/llm.py
+++ b/api/app/services/llm.py
@@ -25,7 +25,12 @@
         (
             "You transform contextual information into structured event recommendations. "
             "Always reply using the EventList schema with fields: name, description, emoji, "
-            "event_score (0-10), location (x,y coordinates), and optional link."
+            "event_score (0-10), location (x,y where x is latitude and y is longitude), and optional link.\n\n"
+            "IMPORTANT: When eventbrite_events are provided in the context, you MUST use the exact "
+            "latitude and longitude values from those events for the location field. Each event should "
+            "have unique coordinates based on the actual locations in the provided Eventbrite data. "
+            "Do not use the same coordinates for all events - extract and use the specific "
+            "latitude/longitude pairs from each Eventbrite event."
         ),
     ),
     (
@@ -91,10 +96,42 @@ def _format_context(context: Any) -> str:
     if isinstance(context, str):
         return context
     if isinstance(context, Mapping):
+        # Format Eventbrite events more prominently with coordinates
+        formatted_context = dict(context)
+        if "eventbrite_events" in formatted_context:
+            eventbrite_events = formatted_context["eventbrite_events"]
+            if isinstance(eventbrite_events, list) and eventbrite_events:
+                # Create a readable summary highlighting coordinates
+                formatted_events_summary = []
+                formatted_events_summary.append(
+                    "IMPORTANT: Use the exact latitude/longitude coordinates below for event locations:\n"
+                )
+                for idx, event in enumerate(eventbrite_events, 1):
+                    if isinstance(event, dict):
+                        event_str = f"{idx}. {event.get('activity_name', 'Event')}"
+                        if "latitude" in event and "longitude" in event:
+                            event_str += (
+                                f"\n   COORDINATES: latitude={event['latitude']}, "
+                                f"longitude={event['longitude']} "
+                                f"(use these values as location=[latitude, longitude])"
+                            )
+                        if "location_name" in event:
+                            event_str += f"\n   Venue: {event['location_name']}"
+                        if "url" in event:
+                            event_str += f"\n   URL: {event['url']}"
+                        formatted_events_summary.append(event_str)
+
+                # Add summary at the beginning, then include full JSON
+                summary_text = "\n".join(formatted_events_summary)
+                try:
+                    json_str = json.dumps(formatted_context, indent=2)
+                    return f"{summary_text}\n\n--- Full Context JSON ---\n{json_str}"
+                except TypeError:
+                    return f"{summary_text}\n\n--- Full Context ---\n{str(formatted_context)}"
         try:
-            return json.dumps(context, indent=2)
+            return json.dumps(formatted_context, indent=2)
         except TypeError:
-            return str(dict(context))
+            return str(formatted_context)
     if isinstance(context, Iterable) and not isinstance(context, (bytes, bytearray)):
         return "\n".join(str(item) for item in context)
     return str(context)
diff --git a/api/app/utils/keyword_extractor.py b/api/app/utils/keyword_extractor.py
new file mode 100644
index 0000000..e511be5
--- /dev/null
+++ b/api/app/utils/keyword_extractor.py
@@ -0,0 +1,157 @@
+"""Extract activity keywords from events for tweet searching."""
+
+import re
+from typing import List
+from collections import Counter
+
+from ..schemas.events import Event
+
+# Common stop words to exclude from keyword extraction
+STOP_WORDS = {
+    "a", "an", "the", "and", "or", "but", "in", "on", "at", "to", "for",
+    "of", "with", "by", "from", "as", "is", "are", "was", "were", "be",
+    "been", "being", "have", "has", "had", "do", "does", "did", "will",
+    "would", "could", "should", "may", "might", "must", "can", "this",
+    "that", "these", "those", "i", "you", "he", "she", "it", "we", "they",
+    "what", "which", "who", "whom", "whose", "where", "when", "why", "how",
+    "all", "each", "every", "both", "few", "more", "most", "other", "some",
+    "such", "only", "own", "same", "so", "than", "too", "very", "just",
+    "now", "then", "here", "there", "also", "about",
+    "into", "through", "during", "before", "after", "above", "below", "up",
+    "down", "out", "off", "over", "under", "again", "further", "once",
+    "edinburgh", "join", "local", "bring", "free", "get", "see", "visit"
+}
+
+# Activity-related patterns to prioritize
+ACTIVITY_PATTERNS = [
+    r'\b(festival|fest|celebration)\w*\b',
+    r'\b(concert|music|gig|performance|show)\w*\b',
+    r'\b(market|bazaar|fair|stall|vendor)\w*\b',
+    r'\b(food|cuisine|dining|restaurant|cafe|brunch|dinner|breakfast|lunch)\w*\b',
+    r'\b(exhibition|museum|gallery|art|display)\w*\b',
+    r'\b(workshop|class|lesson|course|training)\w*\b',
+    r'\b(meetup|meeting|gathering|social|event)\w*\b',
+    r'\b(hike|walk|trail|outdoor|park|nature)\w*\b',
+    r'\b(tour|guided|walking|exploration)\w*\b',
+    r'\b(yoga|fitness|exercise|sport|sports)\w*\b',
+    r'\b(comedy|theater|theatre|play|drama)\w*\b',
+    r'\b(craft|artisan|handmade)\w*\b',
+    r'\b(popup|pop-up|pop up)\w*\b',
+    r'\b(live|entertainment|venue)\w*\b',
+]
+
+
+def extract_keywords_from_events(events: List[Event], max_keywords: int = 30) -> List[str]:
+    """
+    Extract activity-related keywords from a list of events.
+
+    Prioritizes activity-related terms from event names and descriptions.
+    Returns a list of keywords suitable for X API query building.
+
+    Args:
+        events: List of Event objects to extract keywords from
+        max_keywords: Maximum number of keywords to return (default: 30)
+
+    Returns:
+        List of keyword strings, prioritized by relevance
+    """
+    if not events:
+        return []
+
+    # Collect all text from events
+    all_text = []
+    for event in events:
+        if event.name:
+            all_text.append(event.name.lower())
+        if event.description:
+            all_text.append(event.description.lower())
+
+    combined_text = " ".join(all_text)
+
+    # Extract words (handle multi-word terms by preserving common phrases)
+    # First, identify and preserve multi-word activity terms
+    preserved_phrases = []
+    for pattern in ACTIVITY_PATTERNS:
+        matches = re.findall(pattern, combined_text, re.IGNORECASE)
+        preserved_phrases.extend([m.lower() if isinstance(m, str) else m[0].lower() for m in matches])
+
+    # Tokenize text, handling multi-word terms
+    # Split on common delimiters but preserve quoted phrases
+    words = re.findall(r'\b\w+\b', combined_text)
+
+    # Combine single words and preserved phrases
+    all_terms = words + preserved_phrases
+
+    # Count term frequencies
+    term_counts = Counter(all_terms)
+
+    # Filter and score terms
+    keywords = []
+    keyword_scores = {}
+
+    for term, count in term_counts.items():
+        # Skip stop words and very short terms
+        if term in STOP_WORDS or len(term) < 3:
+            continue
+
+        # Skip pure numbers
+        if term.isdigit():
+            continue
+
+        # Calculate score: frequency + bonus for activity patterns
+        score = count
+        if any(re.search(pattern, term, re.IGNORECASE) for pattern in ACTIVITY_PATTERNS):
+            score += 5  # Boost activity-related terms
+
+        keyword_scores[term] = score
+        keywords.append(term)
+
+    # Deduplicate, then sort by score (highest first)
+    keywords_sorted = sorted(set(keywords), key=lambda k: keyword_scores.get(k, 0), reverse=True)
+
+    # Handle multi-word terms from preserved phrases separately
+    # Extract common 2-word phrases from text
+    two_word_phrases = []
+    for text in all_text:
+        words_list = text.split()
+        for i in range(len(words_list) - 1):
+            phrase = f"{words_list[i]} {words_list[i+1]}"
+            # Only keep phrases that seem activity-related
+            if any(re.search(pattern, phrase, re.IGNORECASE) for pattern in ACTIVITY_PATTERNS):
+                two_word_phrases.append(phrase.lower())
+
+    two_word_counts = Counter(two_word_phrases)
+    two_word_keywords = [
+        phrase for phrase, count in two_word_counts.most_common(10)
+        if count >= 1 and phrase not in STOP_WORDS
+    ]
+
+    # Combine single keywords with multi-word phrases, prioritizing phrases
+    # Create a set of words from two-word phrases for efficient lookup
+    two_word_words = set()
+    for phrase in two_word_keywords:
+        two_word_words.update(phrase.split())
+
+    # Only add single keywords that don't already appear in the two-word phrases
+    final_keywords = two_word_keywords + [
+        k for k in keywords_sorted
+        if k not in two_word_keywords and k not in two_word_words
+    ]
+
+    # Limit to max_keywords
+    return final_keywords[:max_keywords]
+
+
+def extract_keywords_from_event(event: Event, max_keywords: int = 10) -> List[str]:
+    """
+    Extract keywords from a single event for filtering tweets.
+
+    Args:
+        event: Event object to extract keywords from
+        max_keywords: Maximum number of keywords to return (default: 10)
+
+    Returns:
+        List of keyword strings relevant to the event
+    """
+    return extract_keywords_from_events([event], max_keywords=max_keywords)
diff --git a/api/app/utils/scrapers/scrape_tweets.py b/api/app/utils/scrapers/scrape_tweets.py
index 19e8816..0f77eab 100644
--- a/api/app/utils/scrapers/scrape_tweets.py
+++ b/api/app/utils/scrapers/scrape_tweets.py
@@ -2,24 +2,32 @@
 """Scrape tweets from X API and store them in the database."""
 
 import os
+import re
 import sys
 import argparse
 import requests
 import time
 from datetime import datetime, timezone
 from pathlib import Path
+from typing import Iterable, List, Optional
+
 from dotenv import load_dotenv
 
-# Add parent directory to path to enable imports
 script_dir = Path(__file__).resolve().parent
-api_dir = script_dir.parent.parent
-sys.path.insert(0, str(api_dir))
+app_dir = script_dir.parent.parent
+
+try:  # Preferred path when imported as part of the app package
+    from ...core.database import SessionLocal, engine, Base
+    from ...models.tweet import Tweet
+except ImportError:  # Fallback for running as a standalone script
+    sys.path.insert(0, str(app_dir))
+    from app.core.database import SessionLocal, engine, Base
+    from app.models.tweet import Tweet
 
-from app.core.database import SessionLocal, engine, Base
-from app.models.tweet import Tweet
+from sqlalchemy import or_
 
 # Load .env file from the root directory
-env_path = api_dir.parent / ".env"
+env_path = app_dir.parent / ".env"
 load_dotenv(dotenv_path=env_path)
 
 API_URL = "https://api.x.com/2/tweets/search/recent"
@@ -51,26 +59,100 @@ def _require_api_key() -> str:
 ]
 
 
-def build_query():
+def _sanitize_term(term: str) -> str:
+    return re.sub(r"\s+", " ", term.strip())
+
+
+def _quote_term(term: str) -> str:
+    sanitized = _sanitize_term(term)
+    if not sanitized:
+        return ""
+    sanitized = sanitized.replace('"', '\\"')
+    if " " in sanitized:
+        return f'"{sanitized}"'
+    return sanitized
+
+
+def _compose_query_from_terms(keyword_terms: Iterable[str], location_terms: Iterable[str]) -> str:
+    kw_list = [term for term in keyword_terms if term]
+    loc_list = [term for term in location_terms if term]
+    kw_clause = f"({' OR '.join(kw_list)})" if kw_list else ""
+    loc_clause = f"({' OR '.join(loc_list)})" if loc_list else ""
+    parts = [kw_clause, loc_clause, "-is:retweet", "lang:en"]
+    return " ".join(part for part in parts if part).strip()
+
+
+def _trim_terms_to_limit(keyword_terms: List[str], location_terms: List[str]) -> List[str]:
+    trimmed: List[str] = []
+    for term in keyword_terms:
+        candidate = trimmed + [term]
+        query = _compose_query_from_terms(candidate, location_terms)
+        if len(query) <= 512:
+            trimmed.append(term)
+        else:
+            break
+    if not trimmed and keyword_terms:
+        trimmed = keyword_terms[:1]
+    return trimmed
+
+
+def build_query(activity_keywords=None, location_terms=None):
     """
     Build a properly formatted query for X API v2.
     Multi-word terms are quoted to ensure proper parsing.
+
+    Args:
+        activity_keywords: Optional list of activity keywords. If None, uses default ACTIVITY_KEYWORDS
+        location_terms: Optional list of location terms. If None, uses default LOCATION_TERMS
     """
-    def quote_term(term):
-        """Quote terms that contain spaces."""
-        if " " in term:
-            return f'"{term}"'
-        return term
-    kw_terms = [quote_term(kw) for kw in ACTIVITY_KEYWORDS]
-    loc_terms = [quote_term(loc) for loc in LOCATION_TERMS]
+    kw_list = activity_keywords if activity_keywords is not None else ACTIVITY_KEYWORDS
+    loc_list = location_terms if location_terms is not None else LOCATION_TERMS
+
+    kw_terms = [_quote_term(kw) for kw in kw_list if kw]
+    loc_terms = [_quote_term(loc) for loc in loc_list if loc]
+
+    kw_terms = _trim_terms_to_limit(kw_terms, loc_terms)
+    return _compose_query_from_terms(kw_terms, loc_terms)
+
+
+def build_event_query(event_title: str, extra_keywords: Optional[Iterable[str]] = None, location_terms=None) -> str:
+    """Build a query that focuses on a specific event title."""
+
+    loc_list = location_terms if location_terms is not None else LOCATION_TERMS
+    loc_terms = [_quote_term(loc) for loc in loc_list if loc]
 
-    kw = "(" + " OR ".join(kw_terms) + ")"
-    loc = "(" + " OR ".join(loc_terms) + ")"
-    # -is:retweet removes RTs, lang:en keeps it readable
-    # Note: place_country operator is not available in basic/free tier, so we rely on location keywords
-    query = f"{kw} {loc} -is:retweet lang:en"
-    return query
+    keyword_candidates: List[str] = []
+
+    if event_title:
+        title_clean = _sanitize_term(event_title)
+        if title_clean:
+            keyword_candidates.append(_quote_term(title_clean))
+
+            words = [w for w in re.split(r"[^\w#]+", title_clean) if len(w) > 2]
+            keyword_candidates.extend(_quote_term(w) for w in words[:5])
+
+            bigrams = [" ".join(words[i:i + 2]) for i in range(len(words) - 1)]
+            keyword_candidates.extend(_quote_term(bg) for bg in bigrams[:3])
+
+    if extra_keywords:
+        keyword_candidates.extend(_quote_term(term) for term in extra_keywords if term)
+
+    keyword_candidates.extend(_quote_term(term) for term in ACTIVITY_KEYWORDS[:5])
+
+    # De-duplicate while preserving order
+    seen = set()
+    ordered_terms: List[str] = []
+    for term in keyword_candidates:
+        if term and term not in seen:
+            seen.add(term)
+            ordered_terms.append(term)
+
+    if not ordered_terms:
+        ordered_terms = [_quote_term(event_title or "edinburgh events")]
+
+    ordered_terms = _trim_terms_to_limit(ordered_terms, loc_terms)
+    return _compose_query_from_terms(ordered_terms, loc_terms)
 
 
 def fetch_page(api_key, query, next_token=None, max_results=10):
@@ -151,13 +233,70 @@ def index_users(includes):
     return users
 
 
-def write_tweets_to_db(limit=10):
+def _format_tweet_record(tweet_data, users):
+    """Convert raw tweet payload into a serializable record."""
+    if not tweet_data:
+        return None
+
+    tweet_id = tweet_data.get("id")
+    if tweet_id is None:
+        return None
+
+    try:
+        record_id = int(tweet_id)
+    except (TypeError, ValueError):  # pragma: no cover - defensive fallback
+        record_id = abs(hash(tweet_id))
+
+    uid = tweet_data.get("author_id")
+    user = users.get(uid, {}) if users else {}
+    username = user.get("username", "unknown")
+
+    text = tweet_data.get("text", "") or ""
+    text = text.replace("\n", " ")
+
+    created_at = parse_tweet_datetime(tweet_data.get("created_at"))
+    metrics = tweet_data.get("public_metrics", {}) or {}
+
+    # Generate tweet URL
+    url = None
+    if username and username != "unknown" and tweet_id:
+        url = f"https://x.com/{username}/status/{tweet_id}"
+
+    return {
+        "id": record_id,
+        "text": text,
+        "username": username,
+        "url": url,
+        "like_count": metrics.get("like_count", 0) or 0,
+        "retweet_count": metrics.get("retweet_count", 0) or 0,
"created_at": created_at, + "scraped_at": datetime.now(timezone.utc), + "is_synthetic": False, + } + + +def _ensure_event_reference(payload: dict, event_title: Optional[str]) -> dict: + if not event_title: + return payload + + title_clean = event_title.strip() + if not title_clean: + return payload + + text_lower = payload["text"].lower() if payload["text"] else "" + if title_clean.lower() not in text_lower: + payload["text"] = f"{payload['text']} • Related to {title_clean}".strip() + + return payload + + +def write_tweets_to_db(limit=10, activity_keywords=None, location_terms=None): # Create database tables if they don't exist Base.metadata.create_all(bind=engine) api_key = _require_api_key() - query = build_query() + query = build_query(activity_keywords=activity_keywords, location_terms=location_terms) # Validate query length (X API v2 has a 512 character limit) query_length = len(query) @@ -188,35 +327,33 @@ def write_tweets_to_db(limit=10): users = index_users(includes) for t in data.get("data", []): - if t["id"] in seen: + tweet_id = t.get("id") + if tweet_id in seen: continue - seen.add(t["id"]) - uid = t.get("author_id") - user = users.get(uid, {}) - username = user.get("username", "unknown") - url = f"https://x.com/{username}/status/{t['id']}" - text = t.get("text", "").replace("\n", " ") + payload = _format_tweet_record(t, users) + if payload is None: + continue - # Parse tweet data - created_at = parse_tweet_datetime(t.get("created_at")) - metrics = t.get("public_metrics", {}) - like_count = metrics.get("like_count", 0) - retweet_count = metrics.get("retweet_count", 0) + seen.add(tweet_id) - # Create and store tweet in database tweet = Tweet( - text=text, - like_count=like_count, - retweet_count=retweet_count, - created_at=created_at, - scraped_at=datetime.now(timezone.utc) + text=payload["text"], + like_count=payload["like_count"], + retweet_count=payload["retweet_count"], + created_at=payload["created_at"], + scraped_at=payload["scraped_at"], + username=payload.get("username"), + url=payload.get("url"), + is_synthetic=payload.get("is_synthetic", False), ) db.add(tweet) stored += 1 - print(f"@{username}: {text}\n→ {url}\n") + print( + f"@{payload['username']}: {payload['text']}\n→ {payload.get('url', 'N/A')}\n" + ) total += 1 if total >= limit: @@ -238,6 +375,62 @@ def write_tweets_to_db(limit=10): finally: db.close() + +def search_tweets_for_event(event_title: str, extra_keywords: Optional[Iterable[str]] = None, *, limit: int = 10, location_terms=None) -> List[dict]: + """Fetch tweets related to a specific event without persisting them. + + Returns only real tweets from the X API. If the API key is missing, the + request fails, or no tweets match the query, the result may contain fewer + tweets than requested (including zero). 
+ """ + + if not event_title: + return [] + + collected: List[dict] = [] + remaining = max(0, limit) + + # Try to fetch real tweets if API key is available + try: + api_key = _require_api_key() + query = build_event_query(event_title, extra_keywords=extra_keywords, location_terms=location_terms) + + seen_ids = set() + next_token = None + + while remaining > 0: + page_limit = min(remaining, 10) + data = fetch_page(api_key, query, next_token=next_token, max_results=page_limit) + includes = data.get("includes", {}) + users = index_users(includes) + + for raw_tweet in data.get("data", []): + tweet_id = raw_tweet.get("id") + if tweet_id in seen_ids: + continue + + payload = _format_tweet_record(raw_tweet, users) + if payload is None: + continue + + seen_ids.add(tweet_id) + collected.append(_ensure_event_reference(payload, event_title)) + remaining -= 1 + + if remaining <= 0: + break + + next_token = data.get("meta", {}).get("next_token") + if not next_token: + break + + except (ValueError, Exception): + # API key not set or request failed - return any tweets collected so far + pass + + return collected + + def get_last_scrape_time(): """Get the timestamp of the most recent tweet scrape.""" db = SessionLocal() @@ -249,7 +442,7 @@ def get_last_scrape_time(): db.close() -def get_tweets(limit=10, threshold_hours_for_refresh=2): +def get_tweets(limit=10, threshold_hours_for_refresh=2, activity_keywords=None, location_terms=None, filter_keywords=None): """ Get events (tweets) from the database. @@ -258,6 +451,9 @@ def get_tweets(limit=10, threshold_hours_for_refresh=2): Args: limit: Number of tweets to return (default: 10) threshold_hours_for_refresh: Number of hours before data is considered stale (default: 2) + activity_keywords: Optional list of activity keywords to use for scraping + location_terms: Optional list of location terms to use for scraping + filter_keywords: Optional list of keywords to filter returned tweets (doesn't affect scraping) Returns: List of Tweet objects @@ -287,12 +483,20 @@ def get_tweets(limit=10, threshold_hours_for_refresh=2): # Refresh data if needed if needs_refresh: - write_tweets_to_db(limit) + write_tweets_to_db(limit, activity_keywords=activity_keywords, location_terms=location_terms) # Fetch and return tweets from database db = SessionLocal() try: - tweets = db.query(Tweet).order_by(Tweet.scraped_at.desc()).limit(limit).all() + query = db.query(Tweet) + + # Filter by keywords if provided + if filter_keywords: + # Create a filter that matches if tweet text contains any of the keywords + keyword_filters = [Tweet.text.ilike(f'%{kw}%') for kw in filter_keywords] + query = query.filter(or_(*keyword_filters)) + + tweets = query.order_by(Tweet.scraped_at.desc()).limit(limit).all() return tweets finally: db.close() @@ -303,7 +507,7 @@ def main(): parser = argparse.ArgumentParser(description="Fetch recent X tweets about activities in Edinburgh and store in database.") parser.add_argument("--limit", type=int, default=10, help="Number of tweets to fetch (default: 10).") args = parser.parse_args() - write_tweets_to_db(args.limit) + write_tweets_to_db(args.limit, activity_keywords=None, location_terms=None) if __name__ == "__main__": main() diff --git a/ui/src/components/EventDetailsModal.tsx b/ui/src/components/EventDetailsModal.tsx index f4cb741..fe7a8e5 100644 --- a/ui/src/components/EventDetailsModal.tsx +++ b/ui/src/components/EventDetailsModal.tsx @@ -113,6 +113,7 @@ export function EventDetailsModal({ event, onClose }: EventDetailsModalProps) { )} + ); diff --git 
diff --git a/ui/src/lib/api.ts b/ui/src/lib/api.ts
index 1adc946..223c6d7 100644
--- a/ui/src/lib/api.ts
+++ b/ui/src/lib/api.ts
@@ -1,14 +1,17 @@
 import type {
+  Coordinates,
   Event,
   GetEventRecommendationsRequest,
   GetEventRecommendationsResponse,
 } from "@/types/events";
+import type { TweetList } from "@/types/tweets";
 import { validateEventRecommendationsResponse } from "@/lib/validation";
 
 const API_BASE_URL =
   process.env.NEXT_PUBLIC_API_BASE_URL?.replace(/\/$/, "") ?? "http://localhost:8000";
 const EVENT_RECOMMENDATIONS_PATH = "/events/recommendations";
+const TWEETS_PATH = "/tweets";
 const MOCK_ENABLED = process.env.NEXT_PUBLIC_MOCK === "1" || process.env.MOCK === "1";
@@ -61,10 +64,35 @@ type RawEvent = Omit & {
   link?: string | null;
 };
 
+function isCoordinateArray(value: unknown): value is Coordinates {
+  return (
+    Array.isArray(value) &&
+    value.length === 2 &&
+    value.every((coordinate) => typeof coordinate === "number" && Number.isFinite(coordinate))
+  );
+}
+
+function isLocationObject(value: unknown): value is LocationObject {
+  return (
+    value !== null &&
+    typeof value === "object" &&
+    "x" in value &&
+    "y" in value &&
+    typeof (value as Record<string, unknown>).x === "number" &&
+    typeof (value as Record<string, unknown>).y === "number"
+  );
+}
+
 function normalizeEvent(event: RawEvent): Event {
-  const location = Array.isArray(event.location)
-    ? event.location
-    : [(event.location as LocationObject).x, (event.location as LocationObject).y];
+  let location: Coordinates;
+
+  if (isCoordinateArray(event.location)) {
+    location = event.location;
+  } else if (isLocationObject(event.location)) {
+    location = [event.location.x, event.location.y];
+  } else {
+    throw new Error("Invalid location format received for event");
+  }
 
   return {
     ...event,
@@ -140,3 +168,41 @@ const MOCK_EVENTS: Event[] = [
     event_score: 8.6,
   },
 ];
+
+export async function fetchTweets(
+  limit: number = 10,
+  keywords?: string[],
+  eventTitle?: string,
+  options?: RequestInit,
+): Promise<TweetList> {
+  if (MOCK_ENABLED) {
+    // Return empty tweets in mock mode
+    return { tweets: [] };
+  }
+
+  const params = new URLSearchParams();
+  params.append("limit", limit.toString());
+  if (keywords && keywords.length > 0) {
+    params.append("keywords", keywords.join(","));
+  }
+  if (eventTitle) {
+    params.append("event_title", eventTitle);
+  }
+
+  const response = await fetch(`${API_BASE_URL}${TWEETS_PATH}?${params.toString()}`, {
+    method: "GET",
+    headers: {
+      "Content-Type": "application/json",
+      ...options?.headers,
+    },
+    ...options,
+  });
+
+  if (!response.ok) {
+    const errorText = await response.text();
+    throw new Error(`Failed to fetch tweets: ${response.status} ${errorText}`);
+  }
+
+  const data = await response.json();
+  return data as TweetList;
+}
diff --git a/ui/src/lib/map.ts b/ui/src/lib/map.ts
index 0837e63..f4dc72f 100644
--- a/ui/src/lib/map.ts
+++ b/ui/src/lib/map.ts
@@ -1,11 +1,14 @@
 import type { Coordinates } from "@/types/events";
 
 export const EDINBURGH_CENTER: [number, number] = [55.9533, -3.1883];
-const DEGREE_SCALE = 0.01;
 
+/**
+ * Convert coordinates to [latitude, longitude] format.
+ * The coordinates are already latitude and longitude values,
+ * so this function simply returns them in the correct order.
+ */
 export function toLatLng([x, y]: Coordinates): [number, number] {
-  const lat = EDINBURGH_CENTER[0] + y * DEGREE_SCALE;
-  const lng = EDINBURGH_CENTER[1] + x * DEGREE_SCALE;
-  return [lat, lng];
+  // x is latitude, y is longitude - return as [lat, lng] for Leaflet
+  return [x, y];
 }
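Closing note: a rough sketch of how the new query-building helpers in scrape_tweets.py compose, based only on the signatures visible in this diff. The keyword list and event title are invented, and it assumes the api/ directory is on the import path so the app package resolves (importing the module also loads .env and the database config).

# Sketch only - sample inputs below are invented for illustration.
from app.utils.scrapers.scrape_tweets import build_event_query, build_query

# Default query built from the module-level ACTIVITY_KEYWORDS / LOCATION_TERMS.
print(build_query())

# Caller-supplied keywords are quoted, OR-joined, and trimmed to the 512-character API limit.
print(build_query(activity_keywords=["food market", "street food", "ceilidh"]))

# Event-focused query: full title, top words and bigrams, extra keywords, then default keywords.
print(build_event_query("Edinburgh Food Festival", extra_keywords=["tasting"]))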