69 changes: 68 additions & 1 deletion api/app/main.py
@@ -1,10 +1,14 @@
- from fastapi import Depends, FastAPI
+ from fastapi import Depends, FastAPI, BackgroundTasks, Query
from fastapi.middleware.cors import CORSMiddleware
from typing import Optional, List

from .schemas.events import GetEventRecommendationsRequest, GetEventRecommendationsResponse
from .schemas.tweets import Tweet, TweetList
from .routers import auth
from .services.activity_suggestion_generator import ActivitySuggestionGenerator
from .services.context_aggregator import ContextAggregator
from .utils.keyword_extractor import extract_keywords_from_events
from .utils.scrapers.scrape_tweets import search_tweets_for_event, write_tweets_to_db

app = FastAPI(title="Pear Programming API", version="0.1.0")

@@ -45,11 +49,74 @@ def get_activity_suggestion_generator(
)
async def get_event_recommendations(
request: GetEventRecommendationsRequest,
background_tasks: BackgroundTasks,
generator: ActivitySuggestionGenerator = Depends(get_activity_suggestion_generator),
) -> GetEventRecommendationsResponse:
"""Return activity recommendations tailored to the caller's preferences."""
events = generator.generate_suggestions(
number_events=request.number_events,
response_preferences=request.response_preferences,
)

# Automatically extract keywords from events and scrape tweets in background
# Wrap in try-except to ensure any errors don't affect the main response
if events:
try:
keywords = extract_keywords_from_events(events, max_keywords=30)
if keywords:
# Create a safe wrapper function that handles exceptions internally
def safe_scrape_tweets():
try:
write_tweets_to_db(
limit=20,
activity_keywords=keywords,
location_terms=None # Use default location terms
)
except Exception as scrape_error:
                        # Log at debug level and swallow the error: background
                        # scraping failures must never surface to the client
import logging
logging.getLogger(__name__).debug(
f"Background tweet scraping failed: {scrape_error}"
)

# Trigger tweet scraping in background (non-blocking)
background_tasks.add_task(safe_scrape_tweets)
except Exception as e:
# Log error but don't fail the request
import logging
logging.getLogger(__name__).warning(f"Failed to extract keywords or schedule tweet scraping: {e}")

return GetEventRecommendationsResponse(events=events)
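
Reviewer note: FastAPI's BackgroundTasks runs each queued callable only after the response has been sent, so the wrapped scraping above cannot delay or break the recommendations payload. A minimal, self-contained sketch of the same pattern (the endpoint and task names here are illustrative, not part of this PR):

from fastapi import BackgroundTasks, FastAPI

demo_app = FastAPI()

def record_visit(message: str) -> None:
    # Runs after the response is sent; an exception here never reaches the client
    print(f"background: {message}")

@demo_app.get("/ping")
async def ping(background_tasks: BackgroundTasks) -> dict:
    background_tasks.add_task(record_visit, "pong delivered")
    return {"status": "ok"}  # sent to the client before record_visit executes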


@app.get(
"/tweets",
response_model=TweetList,
response_model_exclude_none=True,
summary="Get tweets for events",
)
async def get_tweets(
limit: int = Query(default=10, ge=1, le=50, description="Number of tweets to return"),
keywords: Optional[str] = Query(default=None, description="Comma-separated keywords to filter tweets"),
event_title: Optional[str] = Query(default=None, description="Event title to search tweets for"),
) -> TweetList:
"""Retrieve tweets from the database or fetch live tweets for a specific event."""
from .utils.scrapers.scrape_tweets import get_tweets as get_tweets_from_db

filter_keywords = [kw.strip() for kw in keywords.split(",") if kw.strip()] if keywords else None

if event_title:
live_tweets = search_tweets_for_event(
event_title,
extra_keywords=filter_keywords,
limit=limit,
)
return TweetList(tweets=[Tweet.model_validate(tweet) for tweet in live_tweets])

tweets = get_tweets_from_db(
limit=limit,
threshold_hours_for_refresh=2,
filter_keywords=filter_keywords,
)

return TweetList(tweets=[Tweet.model_validate(tweet) for tweet in tweets])
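
For reference, the new endpoint can be exercised like this; a sketch assuming the API is served locally on port 8000 and that httpx is available (any HTTP client would do):

import httpx

# Comma-separated keywords are split and trimmed server-side
resp = httpx.get(
    "http://localhost:8000/tweets",
    params={"limit": 5, "keywords": "festival, market"},
)
resp.raise_for_status()
for tweet in resp.json()["tweets"]:
    print(tweet["text"])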
2 changes: 1 addition & 1 deletion api/app/schemas/events.py
@@ -18,7 +18,7 @@ def __eq__(self, other: object) -> bool: # pragma: no cover - tiny helper
class Event(BaseModel):
"""Represents a suggested activity."""

-     location: Location = Field(..., description="Cartesian location coordinates")
+     location: Location = Field(..., description="Location coordinates where x is latitude and y is longitude")
name: str = Field(..., description="Human-readable name for the event")
emoji: str = Field(..., description="Emoji summarizing the event vibe")
event_score: float = Field(
30 changes: 30 additions & 0 deletions api/app/schemas/tweets.py
@@ -0,0 +1,30 @@
"""Pydantic schemas for tweet responses."""

from __future__ import annotations

from datetime import datetime
from typing import List, Optional

from pydantic import BaseModel, ConfigDict, Field


class Tweet(BaseModel):
"""Tweet model for API responses."""

id: int = Field(..., description="Tweet database ID")
text: str = Field(..., description="Tweet text content")
like_count: int = Field(default=0, description="Number of likes")
retweet_count: int = Field(default=0, description="Number of retweets")
created_at: Optional[datetime] = Field(default=None, description="When the tweet was created on X")
scraped_at: datetime = Field(..., description="When the tweet was scraped and stored")
username: Optional[str] = Field(default=None, description="X username associated with the tweet")

    # Pydantic v2 config: enable ORM mode so SQLAlchemy rows validate directly
    model_config = ConfigDict(from_attributes=True)


class TweetList(BaseModel):
"""Response containing a list of tweets."""

tweets: List[Tweet] = Field(default_factory=list, description="List of tweets")
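
With ORM mode enabled, Tweet.model_validate reads values via plain attribute access, which is what the /tweets handler relies on. A minimal sketch (the Row class below is an illustrative stand-in for a SQLAlchemy model, and the import path assumes the api package is importable):

from datetime import datetime, timezone

from api.app.schemas.tweets import Tweet

class Row:  # stand-in for a SQLAlchemy ORM row
    id = 1
    text = "Pop-up market on the Royal Mile this weekend!"
    like_count = 12
    retweet_count = 3
    created_at = None
    scraped_at = datetime.now(timezone.utc)
    username = "edi_events"

tweet = Tweet.model_validate(Row())  # works because ORM mode is enabled
print(tweet.model_dump(exclude_none=True))  # created_at dropped by exclude_none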

43 changes: 40 additions & 3 deletions api/app/services/llm.py
@@ -25,7 +25,12 @@
(
"You transform contextual information into structured event recommendations. "
"Always reply using the EventList schema with fields: name, description, emoji, "
"event_score (0-10), location (x,y coordinates), and optional link."
"event_score (0-10), location (x,y where x is latitude and y is longitude), and optional link.\n\n"
"IMPORTANT: When eventbrite_events are provided in the context, you MUST use the exact "
"latitude and longitude values from those events for the location field. Each event should "
"have unique coordinates based on the actual locations in the provided Eventbrite data. "
"Do not use the same coordinates for all events - extract and use the specific "
"latitude/longitude pairs from each Eventbrite event."
),
),
(
@@ -91,10 +96,42 @@ def _format_context(context: Any) -> str:
if isinstance(context, str):
return context
if isinstance(context, Mapping):
# Format Eventbrite events more prominently with coordinates
formatted_context = dict(context)
if "eventbrite_events" in formatted_context:
eventbrite_events = formatted_context["eventbrite_events"]
if isinstance(eventbrite_events, list) and eventbrite_events:
# Create a readable summary highlighting coordinates
formatted_events_summary = []
formatted_events_summary.append(
"IMPORTANT: Use the exact latitude/longitude coordinates below for event locations:\n"
)
for idx, event in enumerate(eventbrite_events, 1):
if isinstance(event, dict):
event_str = f"{idx}. {event.get('activity_name', 'Event')}"
if "latitude" in event and "longitude" in event:
event_str += (
f"\n COORDINATES: latitude={event['latitude']}, "
f"longitude={event['longitude']} "
f"(use these values as location=[latitude, longitude])"
)
if "location_name" in event:
event_str += f"\n Venue: {event['location_name']}"
if "url" in event:
event_str += f"\n URL: {event['url']}"
formatted_events_summary.append(event_str)

# Add summary at the beginning, then include full JSON
summary_text = "\n".join(formatted_events_summary)
try:
json_str = json.dumps(formatted_context, indent=2)
return f"{summary_text}\n\n--- Full Context JSON ---\n{json_str}"
except TypeError:
return f"{summary_text}\n\n--- Full Context ---\n{str(formatted_context)}"
try:
-         return json.dumps(context, indent=2)
+         return json.dumps(formatted_context, indent=2)
except TypeError:
-         return str(dict(context))
+         return str(formatted_context)
if isinstance(context, Iterable) and not isinstance(context, (bytes, bytearray)):
return "\n".join(str(item) for item in context)
return str(context)
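
To make the new formatting path concrete, a hedged example of the input _format_context now handles specially (the event values below are invented; the keys match the ones the code inspects):

from api.app.services.llm import _format_context  # assumes the api package is importable

context = {
    "eventbrite_events": [
        {
            "activity_name": "Ceilidh Night",
            "latitude": 55.9533,
            "longitude": -3.1883,
            "location_name": "Assembly Roxy",
            "url": "https://example.com/ceilidh",
        }
    ]
}

# Prints the coordinate summary first, then the full payload under
# "--- Full Context JSON ---"
print(_format_context(context))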
157 changes: 157 additions & 0 deletions api/app/utils/keyword_extractor.py
@@ -0,0 +1,157 @@
"""Extract activity keywords from events for tweet searching."""

import re
from typing import List
from collections import Counter

from ..schemas.events import Event

# Common stop words to exclude from keyword extraction
STOP_WORDS = {
"a", "an", "the", "and", "or", "but", "in", "on", "at", "to", "for",
"of", "with", "by", "from", "as", "is", "are", "was", "were", "be",
"been", "being", "have", "has", "had", "do", "does", "did", "will",
"would", "could", "should", "may", "might", "must", "can", "this",
"that", "these", "those", "i", "you", "he", "she", "it", "we", "they",
"what", "which", "who", "whom", "whose", "where", "when", "why", "how",
"all", "each", "every", "both", "few", "more", "most", "other", "some",
"such", "only", "own", "same", "so", "than", "too", "very", "just",
"now", "then", "here", "there", "where", "why", "how", "also", "about",
"into", "through", "during", "before", "after", "above", "below", "up",
"down", "out", "off", "over", "under", "again", "further", "once",
"edinburgh", "join", "local", "bring", "free", "get", "see", "visit"
}

# Activity-related patterns to prioritize
ACTIVITY_PATTERNS = [
r'\b(festival|fest|celebration)\w*\b',
    r'\b(concert|music|gig|performance|show)\w*\b',
r'\b(market|bazaar|fair|stall|vendor)\w*\b',
r'\b(food|cuisine|dining|restaurant|cafe|brunch|dinner|breakfast|lunch)\w*\b',
r'\b(exhibition|museum|gallery|art|display)\w*\b',
r'\b(workshop|class|lesson|course|training)\w*\b',
r'\b(meetup|meeting|gathering|social|event)\w*\b',
r'\b(hike|walk|trail|outdoor|park|nature)\w*\b',
r'\b(tour|guided|walking|exploration)\w*\b',
r'\b(yoga|fitness|exercise|sport|sports)\w*\b',
r'\b(comedy|theater|theatre|play|drama)\w*\b',
    r'\b(craft|artisan|handmade)\w*\b',
r'\b(popup|pop-up|pop up)\w*\b',
r'\b(live|entertainment|venue)\w*\b',
]


def extract_keywords_from_events(events: List[Event], max_keywords: int = 30) -> List[str]:
"""
Extract activity-related keywords from a list of events.

Prioritizes activity-related terms from event names and descriptions.
Returns a list of keywords suitable for X API query building.

Args:
events: List of Event objects to extract keywords from
max_keywords: Maximum number of keywords to return (default: 30)

Returns:
List of keyword strings, prioritized by relevance
"""
if not events:
return []

# Collect all text from events
all_text = []
for event in events:
if event.name:
all_text.append(event.name.lower())
if event.description:
all_text.append(event.description.lower())

combined_text = " ".join(all_text)

    # Collect matches of the activity patterns first; these terms (mostly single
    # words, plus forms like "pop up") receive extra weight during scoring
preserved_phrases = []
for pattern in ACTIVITY_PATTERNS:
matches = re.findall(pattern, combined_text, re.IGNORECASE)
preserved_phrases.extend([m.lower() if isinstance(m, str) else m[0].lower() for m in matches])

    # Tokenize the combined text into single-word tokens
words = re.findall(r'\b\w+\b', combined_text)

# Combine single words and preserved phrases
all_terms = words + preserved_phrases

# Count term frequencies
term_counts = Counter(all_terms)

# Filter and score terms
keywords = []
keyword_scores = {}

for term, count in term_counts.items():
# Skip stop words and very short terms
if term in STOP_WORDS or len(term) < 3:
continue

# Skip pure numbers
if term.isdigit():
continue

# Calculate score: frequency + bonus for activity patterns
score = count
if any(re.search(pattern, term, re.IGNORECASE) for pattern in ACTIVITY_PATTERNS):
score += 5 # Boost activity-related terms

keyword_scores[term] = score
keywords.append(term)

    # Deduplicate, then sort by score (highest first)
keywords_sorted = sorted(set(keywords), key=lambda k: keyword_scores.get(k, 0), reverse=True)

# Handle multi-word terms from preserved phrases separately
# Extract common 2-word phrases from text
two_word_phrases = []
for text in all_text:
words_list = text.split()
for i in range(len(words_list) - 1):
phrase = f"{words_list[i]} {words_list[i+1]}"
# Only keep phrases that seem activity-related
if any(re.search(pattern, phrase, re.IGNORECASE) for pattern in ACTIVITY_PATTERNS):
two_word_phrases.append(phrase.lower())

two_word_counts = Counter(two_word_phrases)
    # Counter.most_common already orders by count, and two-word phrases can
    # never collide with the single-word STOP_WORDS set
    two_word_keywords = [phrase for phrase, _ in two_word_counts.most_common(10)]

# Combine single keywords with multi-word phrases, prioritizing phrases
# Create a set of words from two-word phrases for efficient lookup
two_word_words = set()
for phrase in two_word_keywords:
two_word_words.update(phrase.split())

# Only add single keywords that don't already appear in the two-word phrases
final_keywords = two_word_keywords + [
k for k in keywords_sorted
if k not in two_word_keywords and k not in two_word_words
]

# Limit to max_keywords
return final_keywords[:max_keywords]


def extract_keywords_from_event(event: Event, max_keywords: int = 10) -> List[str]:
"""
Extract keywords from a single event for filtering tweets.

Args:
event: Event object to extract keywords from
max_keywords: Maximum number of keywords to return (default: 10)

Returns:
List of keyword strings relevant to the event
"""
return extract_keywords_from_events([event], max_keywords=max_keywords)
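
A quick usage sketch for the extractor (the Event values are invented, and Location(x=..., y=...) is an assumption based on the schema's x=latitude, y=longitude description):

from api.app.schemas.events import Event, Location
from api.app.utils.keyword_extractor import extract_keywords_from_events

event = Event(
    location=Location(x=55.9533, y=-3.1883),  # assumed field names
    name="Stockbridge Food Festival",
    emoji="🥐",
    event_score=8.5,
    description="A weekend food market with live music and craft stalls.",
)

# Phrases like "food festival" and "live music" should rank ahead of single words
print(extract_keywords_from_events([event], max_keywords=5))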
