diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml index 0ecee88..ccba373 100644 --- a/.github/workflows/pylint.yml +++ b/.github/workflows/pylint.yml @@ -20,18 +20,23 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 with: python-version: 3.11 + - name: Install dependencies run: | + sudo apt-get update + sudo apt-get install -y build-essential + pip3 install --upgrade pip - pip3 install pylint for f in $(find -type f -name "requirements.txt"); do pip3 install -r $f done + - name: Analyzing the python code run: | set -ex - export PYTHONPATH=$PWD/src/tia/ - find . -type f -name "*.py" | xargs pylint + export PYTHONPATH=$PWD/src/gentrade/ + make lint diff --git a/.pylintrc b/.pylintrc index ded06ab..365e076 100644 --- a/.pylintrc +++ b/.pylintrc @@ -104,10 +104,6 @@ recursive=no # source root. source-roots= -# When enabled, pylint would attempt to guess common misconfiguration and emit -# user-friendly hints instead of false-positive error messages. -suggestion-mode=yes - # Allow loading of arbitrary C extensions. Extensions are imported into the # active Python interpreter and may run arbitrary code. unsafe-load-any-extension=no @@ -457,7 +453,8 @@ disable=raw-checker-failed, import-error, duplicate-code, redefined-outer-name, - logging-fstring-interpolation + logging-fstring-interpolation, + abstract-class-instantiated # Enable the message, report, category or checker with the given id(s). You can # either give multiple identifier separated by comma (,) or put this option diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..de3b8e6 --- /dev/null +++ b/Makefile @@ -0,0 +1,56 @@ + +CURRENT_DIR := $(shell pwd) + +PYTHON_VERSION := "3.11" +PYTHON_VERSION_OK := $(shell python -c "import sys; print(sys.version_info >= (3,11) and sys.version_info < (3,12))") + +# Pylint configuration +PYLINT_ARGS = --rcfile=.pylintrc +PYLINT_VERSION = 4.0.4 +PYLINT_INSTALLED := $(shell python -c "import pkg_resources; print('pylint' in {pkg.key for pkg in pkg_resources.working_set})" 2>/dev/null || echo "False") +PYLINT_VERSION_OK := $(shell python -c "import pylint; print(pylint.__version__ == '$(PYLINT_VERSION)')" 2>/dev/null || echo "False") + +# Default target +all: lint + +ensure-pylint: + @if [ "$(PYLINT_INSTALLED)" = "False" ] || [ "$(PYLINT_VERSION_OK)" = "False" ]; then \ + echo "Installing/upgrading pylint to version $(PYLINT_VERSION)..."; \ + pip install pylint=="$(PYLINT_VERSION)"; \ + else \ + echo "✅ pylint $(PYLINT_VERSION) is already installed"; \ + fi + +# Lint all Python files +lint: check-python ensure-pylint + @echo "Running Pylint ..." + @export PYTHONPATH=$(CURRENT_DIR)/src + find . -type f -name "*.py" | xargs pylint $(PYLINT_ARGS) + +# Lint specific file(s) +lint-file: + @if [ -z "$(FILE)" ]; then \ + echo "Usage: make lint-file FILE=path/to/file.py"; \ + exit 1; \ + fi + pip install pylint=="$(PYLINT_VERSION)" + pylint $(PYLINT_ARGS) $(FILE) + +# Check python version +check-python: + @echo "Checking Python version..." + @if [ "$(PYTHON_VERSION_OK)" = "True" ]; then \ + echo "✅ Python version is $(PYTHON_VERSION) (compatible)"; \ + else \ + echo "❌ Error: Python $(PYTHON_VERSION) is required (found $(shell python --version | cut -d' ' -f2))"; \ + exit 1; \ + fi + +# Clean up +clean: + @echo "Cleaning in: $(CURRENT_DIR)..." 
+ find $(CURRENT_DIR) -type d -name "__pycache__" -exec rm -rf {} + + find $(CURRENT_DIR) -type f -name "*.pyc" -delete + find $(CURRENT_DIR) -type f -name ".pylint-cache" -exec rm -rf {} + + +.PHONY: all lint lint-file clean check-python ensure-pylint diff --git a/requirements.txt b/requirements.txt index cc5d1d3..65c6d97 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,9 +8,13 @@ backtrader ag2 mplfinance ntplib +loguru langchain_openai langchain_core langchain_tavily langchain langgraph + +lxml[html_clean] +newspaper3k \ No newline at end of file diff --git a/src/gentrade/llm/factory.py b/src/gentrade/llm/factory.py index dc86183..fd297c3 100644 --- a/src/gentrade/llm/factory.py +++ b/src/gentrade/llm/factory.py @@ -24,11 +24,11 @@ from pydantic import Field from langchain_openai import ChatOpenAI -from langchain.schema import ( +from langchain_core.messages import ( AIMessage, BaseMessage ) -from langchain.schema.runnable import RunnableConfig +from langchain_core.runnables import RunnableConfig LOG = logging.getLogger(__name__) diff --git a/src/gentrade/news/factory.py b/src/gentrade/news/factory.py index 54b75b5..233c62f 100644 --- a/src/gentrade/news/factory.py +++ b/src/gentrade/news/factory.py @@ -7,24 +7,18 @@ """ import os -import logging import time import threading -from typing import List, Optional, Set -from urllib.parse import urlparse # Add this to extract domain from URL +from typing import List, Optional +from loguru import logger -import requests -from newspaper import Article -from bs4 import BeautifulSoup +from gentrade.scraper.extractor import ArticleContentExtractor -from gentrade.news.meta import NewsInfo, NewsProviderBase, NewsDatabase -from gentrade.news.googlenews import GoogleNewsProvider +from gentrade.news.meta import NewsProviderBase, NewsDatabase from gentrade.news.newsapi import NewsApiProvider from gentrade.news.rss import RssProvider from gentrade.news.finnhub import FinnhubNewsProvider -LOG = logging.getLogger(__name__) - class NewsFactory: """Factory class for creating news provider instances based on provider type. @@ -38,8 +32,7 @@ def create_provider(provider_type: str, **kwargs) -> NewsProviderBase: """Create a news provider instance based on the specified provider type. Args: - provider_type: Type of news provider. Supported values: "newsapi", "finnhub", - "google", "rss". + provider_type: Type of news provider. Supported values: "newsapi", "finnhub", "rss". ** kwargs: Additional keyword arguments for provider initialization (e.g., feed_url for RSS providers). 
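For reference, a minimal usage sketch of the slimmed-down factory after the "google" provider removal, mirroring the __main__ example later in this file. It is illustrative only (not part of the patch) and assumes the environment variables used by create_provider (NEWSAPI_API_KEY, FINNHUB_API_KEY, optionally RSS_FEED_URL) are set; the feed URL below is a placeholder.

from gentrade.news.factory import NewsAggregator, NewsFactory
from gentrade.news.meta import NewsDatabase

db = NewsDatabase()
providers = [
    NewsFactory.create_provider("newsapi"),   # requires NEWSAPI_API_KEY
    NewsFactory.create_provider("finnhub"),   # requires FINNHUB_API_KEY
    # the feed_url kwarg overrides the RSS_FEED_URL environment variable
    NewsFactory.create_provider("rss", feed_url="https://example.com/feed.xml"),
]

aggregator = NewsAggregator(providers=providers, db=db)
# Each provider is fetched in its own thread; summaries are cleaned and article
# bodies extracted via ArticleContentExtractor before being stored in the DB.
aggregator.sync_news(category="business", max_hour_interval=24, max_count=10)
print(f"Articles stored: {len(db.get_all_news())}")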
@@ -54,7 +47,6 @@ def create_provider(provider_type: str, **kwargs) -> NewsProviderBase: providers = { "newsapi": NewsApiProvider, "finnhub": FinnhubNewsProvider, - "google": GoogleNewsProvider, "rss": RssProvider } @@ -74,15 +66,6 @@ def create_provider(provider_type: str, **kwargs) -> NewsProviderBase: raise ValueError("FINNHUB_API_KEY environment variable not set") return provider_class(api_key=api_key) - if provider_type_lower == "google": - api_key = os.getenv("GOOGLE_CLOUD_API_KEY") - cse_id = os.getenv("GOOGLE_CSE_ID") - if not api_key or not cse_id: - raise ValueError( - "GOOGLE_CLOUD_API_KEY or GOOGLE_CSE_ID environment variable not set" - ) - return provider_class(api_key=api_key, cse_id=cse_id) - if provider_type_lower == "rss": feed_url = kwargs.get("feed_url", os.getenv("RSS_FEED_URL")) return provider_class(feed_url=feed_url) @@ -108,29 +91,34 @@ def __init__(self, providers: List[NewsProviderBase], db: NewsDatabase): self.db = db self.db_lock = threading.Lock() - # 1. Add blocklist (stores blocked domain names, e.g., "example.com") - self.blocklist: Set[str] = set() - - # 2. Add dummy content keywords (expand this list based on your needs) - self.dummy_keywords = { - "we use cookies", "cookie policy", "analyze website traffic", - "accept cookies", "reject cookies", "by continuing to use", - "this website uses cookies", "improve user experience", - "ads by", "sponsored content", "subscribe to access" - } - #self.blocklist = self._load_blocklist() + def _fetch_thread(self, provider, aggregator, ticker, category, + max_hour_interval, max_count, is_process=True): + if ticker: + news = provider.fetch_stock_news( + ticker, category, max_hour_interval, max_count + ) + logger.info( + f"Fetched {len(news)} stock news articles for {ticker} from " + f"{provider.__class__.__name__}" + ) + else: + news = provider.fetch_latest_market_news( + category, max_hour_interval, max_count + ) + logger.info( + f"Fetched {len(news)} market news articles from " + f"{provider.__class__.__name__}" + ) - def _load_blocklist(self) -> Set[str]: - try: - with open("news_blocklist.txt", "r", encoding="utf-8") as f: - return set(line.strip() for line in f if line.strip()) - except FileNotFoundError: - return set() + ace = ArticleContentExtractor.inst() + for item in news: + item.summary = ace.clean_html(item.summary) + if is_process: + item.content = ace.extract_content(item.url) + logger.info(item.content) - def _save_blocklist(self) -> None: - with open("news_blocklist.txt", "w", encoding="utf-8") as f: - for domain in self.blocklist: - f.write(f"{domain}\n") + with aggregator.db_lock: + aggregator.db.add_news(news) def sync_news( self, @@ -152,37 +140,15 @@ def sync_news( """ current_time = time.time() if current_time < self.db.last_sync + 3600: - LOG.info("Skipping sync: Last sync was less than 1 hour ago.") + logger.info("Skipping sync: Last sync was less than 1 hour ago.") return - LOG.info("Starting news sync...") - - def fetch_and_process(provider, aggregator, ticker, category, max_hour_interval, max_count): - if ticker: - news = provider.fetch_stock_news( - ticker, category, max_hour_interval, max_count - ) - LOG.info( - f"Fetched {len(news)} stock news articles for {ticker} from " - f"{provider.__class__.__name__}" - ) - else: - news = provider.fetch_latest_market_news( - category, max_hour_interval, max_count - ) - LOG.info( - f"Fetched {len(news)} market news articles from " - f"{provider.__class__.__name__}" - ) - - aggregator.process_news(news) - with aggregator.db_lock: - 
aggregator.db.add_news(news) + logger.info("Starting news sync...") threads = [] for provider in self.providers: thread = threading.Thread( - target=fetch_and_process, + target=self._fetch_thread, args=(provider, self, ticker, category, max_hour_interval, max_count) ) threads.append(thread) @@ -192,163 +158,19 @@ def fetch_and_process(provider, aggregator, ticker, category, max_hour_interval, thread.join() self.db.last_sync = current_time - LOG.info("News sync completed.") - - def process_news(self, news: List[NewsInfo]) -> None: - """Process news: Skip blocked sites → Check for dummy content → Clean content""" - # Filter out news from blocked websites FIRST - filtered_news = [n for n in news if not self._is_blocked(n.url)] - - for article in filtered_news: - LOG.info(f"Processing news: {article.headline}") - - # Extract content and check for dummy messages - content = self._extract_news_text(article.url) - if self._contains_dummy_content(content): - # Add the website to blocklist if dummy content is found - domain = self._extract_domain(article.url) - self.blocklist.add(domain) - LOG.warning(f"Blocked website {domain} (contains dummy content)") - continue # Skip storing this article - - # Proceed with normal cleaning if no dummy content - article.summary = self._clean_html(article.summary) - article.content = content - time.sleep(1) - - def _is_blocked(self, url: str) -> bool: - """Check if the website of the URL is in the blocklist""" - domain = self._extract_domain(url) - if domain in self.blocklist: - LOG.info(f"Skipping blocked website: {domain} (URL: {url})") - return True - return False - - def _extract_domain(self, url: str) -> str: - """Extract the main domain from a URL - (e.g., "https://www.example.com/news" → "example.com") - """ - try: - parsed = urlparse(url) - # Split subdomains (e.g., "www.example.co.uk" → "example.co.uk" for common TLDs) - domain_parts = parsed.netloc.split(".") - # Handle cases like "co.uk" (adjust based on your target regions) - if len(domain_parts) >= 3 and domain_parts[-2] in ["co", "com", "org", "net"]: - return ".".join(domain_parts[-3:]) - return ".".join(domain_parts[-2:]) - except Exception as e: - LOG.error(f"Failed to extract domain from {url}: {e}") - return url # Fallback to full URL if parsing fails - - def _contains_dummy_content(self, content: str) -> bool: - """Check if content contains dummy messages (case-insensitive)""" - if not content: - return False - content_lower = content.lower() - # Count how many dummy keywords match - dummy_count = sum(1 for keyword in self.dummy_keywords if keyword in content_lower) - # Return True if ≥1 keyword matches (adjust threshold if needed) - return dummy_count >= 1 - - def _extract_news_text(self, url: str) -> str: - """Extract text content from a news article URL using newspaper3k. - - Falls back to HTML scraping with BeautifulSoup if newspaper3k fails. - - Args: - url: URL of the news article to extract text from. - - Returns: - Cleaned text content of the article, or empty string if extraction fails. 
- """ - try: - article = Article(url) - article.download() - article.parse() - if article.text: - return article.text - - # Fallback to HTML scraping if newspaper3k returns empty text - html = self._fetch_original_html(url) - return self._clean_html(html) - - except Exception as e: - LOG.error(f"Failed to extract text with newspaper3k ({url}): {e}") - html = self._fetch_original_html(url) - return self._clean_html(html) - - def _fetch_original_html(self, url: str, timeout: int = 10) -> Optional[str]: - """Fetch raw HTML content from a URL with retries. - - Args: - url: URL to fetch HTML from. - timeout: Request timeout in seconds (default: 10). - - Returns: - Raw HTML content as a string, or None if fetch fails. - """ - headers = { - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " - "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" - } - retries = 3 - - for attempt in range(retries): - try: - response = requests.get( - url, headers=headers, timeout=timeout, verify=False - ) - response.raise_for_status() - return response.text - except Exception as e: - if attempt < retries - 1: - time.sleep(1) - continue - LOG.error(f"Failed to fetch HTML after {retries} retries ({url}): {e}") - return None - - return None - - def _clean_html(self, html_content: Optional[str]) -> str: - """Clean HTML content to extract readable text. - - Removes scripts, styles, and other non-content elements, then normalizes whitespace. - - Args: - html_content: Raw HTML content to clean. - - Returns: - Cleaned text string, or empty string if input is None/empty. - """ - if not html_content: - return "" - - soup = BeautifulSoup(html_content, "html.parser") - - # Remove non-content elements - for element in soup(["script", "style", "iframe", "nav", "aside", "footer"]): - element.decompose() - - # Extract and normalize text - text = soup.get_text() - lines = (line.strip() for line in text.splitlines()) - chunks = (phrase.strip() for line in lines for phrase in line.split(" ")) - return "\n".join(chunk for chunk in chunks if chunk) - + logger.info("News sync completed.") if __name__ == "__main__": - logging.basicConfig(level=logging.INFO) db = NewsDatabase() try: # Initialize providers using the factory newsapi_provider = NewsFactory.create_provider("newsapi") finnhub_provider = NewsFactory.create_provider("finnhub") - google_provider = NewsFactory.create_provider("google") rss_provider = NewsFactory.create_provider("rss") # Create aggregator with selected providers - aggregator = NewsAggregator(providers=[newsapi_provider], db=db) + aggregator = NewsAggregator(providers=[rss_provider], db=db) # Sync market news and stock-specific news aggregator.sync_news(category="business", max_hour_interval=64, max_count=10) @@ -361,18 +183,18 @@ def _clean_html(self, html_content: Optional[str]) -> str: # Log results all_news = db.get_all_news() - LOG.info(f"Total articles in database: {len(all_news)}") + logger.info(f"Total articles in database: {len(all_news)}") if all_news: - LOG.info("Example article:") - LOG.info(all_news[0].to_dict()) + logger.info("Example article:") + logger.info(all_news[0].to_dict()) for news_item in all_news: - LOG.info("--------------------------------") + logger.info("--------------------------------") print(news_item.headline) print(news_item.url) print(news_item.content) - LOG.info("--------------------------------") + logger.info("--------------------------------") except ValueError as e: - LOG.error(f"Error during news aggregation: {e}") + logger.error(f"Error during 
news aggregation: {e}") diff --git a/src/gentrade/news/finnhub.py b/src/gentrade/news/finnhub.py index d51da79..beb1d53 100644 --- a/src/gentrade/news/finnhub.py +++ b/src/gentrade/news/finnhub.py @@ -5,17 +5,14 @@ and news specific to individual stock tickers, with filtering by time interval and article count. """ -import logging import time from typing import List from datetime import datetime, timedelta import requests +from loguru import logger from gentrade.news.meta import NewsInfo, NewsProviderBase -LOG = logging.getLogger(__name__) - - class FinnhubNewsProvider(NewsProviderBase): """News provider implementation for fetching news via the Finnhub.io API. @@ -33,6 +30,10 @@ def __init__(self, api_key: str): self.api_key = api_key self.base_url = "https://finnhub.io/api/v1" + @property + def market(self): + return 'us' + def fetch_latest_market_news( self, category: str = "business", @@ -87,7 +88,7 @@ def fetch_latest_market_news( return self._filter_news(news_list, max_hour_interval, max_count) except requests.RequestException as e: - LOG.debug(f"Error fetching market news from Finnhub: {e}") + logger.debug(f"Error fetching market news from Finnhub: {e}") return [] def fetch_stock_news( @@ -146,5 +147,5 @@ def fetch_stock_news( return self._filter_news(news_list, max_hour_interval, max_count) except requests.RequestException as e: - LOG.debug(f"Error fetching stock news from Finnhub: {e}") + logger.debug(f"Error fetching stock news from Finnhub: {e}") return [] diff --git a/src/gentrade/news/googlenews.py b/src/gentrade/news/googlenews.py deleted file mode 100644 index 87f1193..0000000 --- a/src/gentrade/news/googlenews.py +++ /dev/null @@ -1,158 +0,0 @@ -"""Google Custom Search (GCS) news provider for financial news retrieval. - -Implements the NewsProviderBase abstract class to fetch general market news and stock-specific -news via Google's Custom Search API. Supports filtering by time interval, article count, -region, and language, while formatting results into standardized NewsInfo objects. -""" - -import logging -import time -from typing import List - -import requests - -from gentrade.news.meta import NewsInfo, NewsProviderBase - -LOG = logging.getLogger(__name__) - - -class GoogleNewsProvider(NewsProviderBase): - """News provider using Google Custom Search API to retrieve financial news. - - Authenticates with Google Cloud API key and Custom Search Engine (CSE) ID. Fetches - market-wide or stock-specific news, with built-in filtering for recency and result count. - """ - - def __init__(self, api_key: str, cse_id: str): - """Initialize GoogleNewsProvider with required authentication credentials. - - Args: - api_key: Google Cloud API key for Custom Search request authentication. - cse_id: Google Custom Search Engine (CSE) ID configured for news retrieval. - """ - self.api_key = api_key - self.cse_id = cse_id - self.base_url = "https://www.googleapis.com/customsearch/v1" - - def fetch_latest_market_news( - self, - category: str = "business", - max_hour_interval: int = 24, - max_count: int = 10 - ) -> List[NewsInfo]: - """Fetch latest general market news via Google Custom Search. - - Retrieves financial market news from the last `max_hour_interval` hours, limited to - `max_count` articles, and assigns the specified category. - - Args: - category: Category label for fetched news (default: "business"). - max_hour_interval: Maximum age (in hours) of articles to retrieve (default: 24). - max_count: Maximum number of articles to return (default: 10). 
- - Returns: - List of NewsInfo objects with formatted market news; empty list if fetch fails - or no results exist. - """ - params = { - "key": self.api_key, - "cx": self.cse_id, - "q": "finance stock market", # Core query for market news - "num": max_count, - "dateRestrict": f"h{max_hour_interval}", # Filter by recent hours - "gl": "us", # Focus on US region results - "lr": "lang_en", # Restrict to English language - "siteSearch": "news.google.com", # Limit to Google News sources - "siteSearchFilter": "i" # Exclude duplicate results - } - - try: - response = requests.get(self.base_url, params=params, timeout=10) - response.raise_for_status() # Raise error for HTTP status codes ≥400 - items = response.json().get("items", []) # Extract articles from response - - # Convert API response to standardized NewsInfo objects - news_list = [ - NewsInfo( - category=category, - datetime=int(time.time()), # Google CSE lacks article timestamp - headline=item.get("title", ""), - id=self.url_to_hash_id(item.get("link", "")), - image=item.get("pagemap", {}).get("cse_image", [{}])[0].get("src", ""), - related="", # No stock ticker for general market news - source=item.get("displayLink", ""), # Source domain (e.g., "bloomberg.com") - summary=item.get("snippet", ""), # Short article preview - url=item.get("link", ""), # Direct article URL - content="", # Content extracted later by aggregator - provider='google', - market='us' - ) - for item in items - ] - - return self._filter_news(news_list, max_hour_interval, max_count) - - except requests.RequestException as e: - LOG.debug(f"Failed to fetch market news from Google Custom Search: {e}") - return [] - - def fetch_stock_news( - self, - ticker: str, - category: str = "business", - max_hour_interval: int = 24, - max_count: int = 10 - ) -> List[NewsInfo]: - """Fetch stock-specific news for a given ticker via Google Custom Search. - - Retrieves news related to the specified stock ticker from the last `max_hour_interval` - hours, limited to `max_count` articles, and assigns the specified category. - - Args: - ticker: Stock ticker symbol (e.g., "AAPL") to fetch news for. - category: Category label for fetched news (default: "business"). - max_hour_interval: Maximum age (in hours) of articles to retrieve (default: 24). - max_count: Maximum number of articles to return (default: 10). - - Returns: - List of NewsInfo objects with formatted stock news; empty list if fetch fails - or no results exist. 
- """ - params = { - "key": self.api_key, - "cx": self.cse_id, - "q": f"{ticker} stock news", # Ticker-specific query - "num": max_count, - "dateRestrict": f"h{max_hour_interval}", # Filter by recent hours - "sort": "date" # Sort results by most recent first - } - - try: - response = requests.get(self.base_url, params=params, timeout=10) - response.raise_for_status() - items = response.json().get("items", []) - - # Convert API response to standardized NewsInfo objects - news_list = [ - NewsInfo( - category=category, - datetime=int(time.time()), # Google CSE lacks article timestamp - headline=item.get("title", ""), - id=hash(item.get("link", "")), # Unique ID from URL - image=item.get("pagemap", {}).get("cse_image", [{}])[0].get("src", ""), - related=ticker, # Associate with target stock ticker - source=item.get("displayLink", ""), - summary=item.get("snippet", ""), - url=item.get("link", ""), - content="", # Content extracted later - provider='google', - market='us' - ) - for item in items - ] - - return self._filter_news(news_list, max_hour_interval, max_count) - - except requests.RequestException as e: - LOG.debug(f"Failed to fetch {ticker} stock news from Google Custom Search: {e}") - return [] diff --git a/src/gentrade/news/meta.py b/src/gentrade/news/meta.py index 4b59d1c..dd19ca3 100644 --- a/src/gentrade/news/meta.py +++ b/src/gentrade/news/meta.py @@ -9,17 +9,14 @@ """ import abc -import logging import time import hashlib from typing import Dict, List, Any, Optional from datetime import datetime from dataclasses import dataclass - +from loguru import logger import requests -LOG = logging.getLogger(__name__) - NEWS_MARKET = [ 'us', 'zh', 'hk', 'cypto', 'common' ] @@ -37,7 +34,7 @@ class NewsInfo: summary: str url: str content: str - provider: str # provder like newsapi, google, finnhub, rss + provider: str # provder like newsapi, finnhub, rss market: str # market type like us, chn, eur, hk, crypto def to_dict(self) -> Dict[str, Any]: @@ -79,7 +76,7 @@ def fetch_article_html(self) -> Optional[str]: response.raise_for_status() return response.text except requests.RequestException as e: - LOG.debug(f"Failed to fetch HTML for {self.url}: {e}") + logger.debug(f"Failed to fetch HTML for {self.url}: {e}") return None @@ -89,6 +86,10 @@ class NewsProviderBase(metaclass=abc.ABCMeta): All concrete news providers (e.g., NewsAPI, Finnhub) must implement these methods. """ + @property + def market(self): + return 'common' + @abc.abstractmethod def fetch_latest_market_news( self, diff --git a/src/gentrade/news/newsapi.py b/src/gentrade/news/newsapi.py index 67a8ca4..53d0a40 100644 --- a/src/gentrade/news/newsapi.py +++ b/src/gentrade/news/newsapi.py @@ -5,17 +5,13 @@ article count, and language, while formatting results into standardized NewsInfo objects. """ -import logging from typing import List from datetime import datetime, timedelta - import requests +from loguru import logger from gentrade.news.meta import NewsInfo, NewsProviderBase -LOG = logging.getLogger(__name__) - - class NewsApiProvider(NewsProviderBase): """News provider that uses NewsAPI.org to fetch financial and stock-specific news. 
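The meta.py hunk above adds a market property to NewsProviderBase that defaults to 'common'; FinnhubNewsProvider overrides it to return 'us', and NewsApiProvider does the same in the next hunk. Below is a minimal sketch of how another provider would specialize it, assuming the two abstract methods shown in meta.py are the whole interface; HkRssProvider is a hypothetical name used only for illustration and is not part of the patch.

from typing import List

from gentrade.news.meta import NewsInfo, NewsProviderBase


class HkRssProvider(NewsProviderBase):
    """Illustrative provider tagged for the 'hk' entry of NEWS_MARKET."""

    @property
    def market(self) -> str:
        return 'hk'  # overrides the 'common' default from NewsProviderBase

    def fetch_latest_market_news(self, category: str = "business",
                                 max_hour_interval: int = 24,
                                 max_count: int = 10) -> List[NewsInfo]:
        return []  # a real provider would build NewsInfo objects here

    def fetch_stock_news(self, ticker: str, category: str = "business",
                         max_hour_interval: int = 24,
                         max_count: int = 10) -> List[NewsInfo]:
        return []


assert HkRssProvider().market == 'hk'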
@@ -33,6 +29,10 @@ def __init__(self, api_key: str): self.api_key = api_key self.base_url = "https://newsapi.org/v2/everything" # Core endpoint for news retrieval + @property + def market(self): + return 'us' + def fetch_latest_market_news( self, category: str = "business", @@ -92,10 +92,10 @@ def fetch_latest_market_news( return self._filter_news(news_list, max_hour_interval, max_count) except requests.RequestException as e: - LOG.debug(f"Failed to fetch market news from NewsAPI.org: {e}") + logger.debug(f"Failed to fetch market news from NewsAPI.org: {e}") return [] except Exception as e: - LOG.debug(f"Unexpected error: {e}") + logger.debug(f"Unexpected error: {e}") return [] def fetch_stock_news( @@ -159,5 +159,5 @@ def fetch_stock_news( return self._filter_news(news_list, max_hour_interval, max_count) except requests.RequestException as e: - LOG.debug(f"Failed to fetch {ticker} stock news from NewsAPI.org: {e}") + logger.debug(f"Failed to fetch {ticker} stock news from NewsAPI.org: {e}") return [] diff --git a/src/gentrade/news/rss.py b/src/gentrade/news/rss.py index 9d0516f..5fcbb9d 100644 --- a/src/gentrade/news/rss.py +++ b/src/gentrade/news/rss.py @@ -7,16 +7,14 @@ """ import os -import logging from typing import List import requests import feedparser +from loguru import logger from gentrade.news.meta import NewsInfo, NewsProviderBase -LOG = logging.getLogger(__name__) - class RssProvider(NewsProviderBase): """News provider that fetches news from RSS/ATOM feeds. @@ -63,7 +61,7 @@ def fetch_latest_market_news( parsing fails, or no valid articles exist. """ if not self.feed_url: - LOG.error("RSS feed URL is missing (no explicit URL, env var, or default).") + logger.error("RSS feed URL is missing (no explicit URL, env var, or default).") return [] # Headers to mimic browser (avoid feed server blocking) and accept RSS/XML @@ -81,7 +79,7 @@ def fetch_latest_market_news( # Parse feed with feedparser feed = feedparser.parse(response.text) if not feed.entries: - LOG.warning(f"No articles found in RSS feed: {self.feed_url}") + logger.warning(f"No articles found in RSS feed: {self.feed_url}") return [] # Convert feed entries to standardized NewsInfo objects @@ -111,16 +109,16 @@ def fetch_latest_market_news( return self._filter_news(news_list, max_hour_interval, max_count) except requests.HTTPError as e: - LOG.error( + logger.error( f"HTTP error fetching RSS feed {self.feed_url}: " f"Status {e.response.status_code} - {str(e)}" ) return [] except requests.RequestException as e: - LOG.error(f"Network error fetching RSS feed {self.feed_url}: {str(e)}") + logger.error(f"Network error fetching RSS feed {self.feed_url}: {str(e)}") return [] except Exception as e: - LOG.error(f"Unexpected error parsing RSS feed {self.feed_url}: {str(e)}") + logger.error(f"Unexpected error parsing RSS feed {self.feed_url}: {str(e)}") return [] def fetch_stock_news( diff --git a/src/gentrade/scraper/extractor.py b/src/gentrade/scraper/extractor.py index 57a99bd..12a91b8 100644 --- a/src/gentrade/scraper/extractor.py +++ b/src/gentrade/scraper/extractor.py @@ -14,7 +14,7 @@ import re import time -from typing import Dict, List +from typing import Dict, List, Optional from urllib.parse import urlparse import requests @@ -93,7 +93,9 @@ def save_dummy_patterns(self, dummy_patterns: List[str]): class ArticleContentExtractor: """Handles article content extraction with dummy content filtering.""" - def __init__(self, storage: ScraperStorage): + _instance = None + + def __init__(self, storage: ScraperStorage=None): 
self.ignored_extensions = ( ".pdf", ".doc", ".docx", ".xls", ".xlsx", ".zip", ".rar", ".jpg", ".png", ".jpeg", ".gif" ) @@ -105,7 +107,10 @@ def __init__(self, storage: ScraperStorage): "ads by", "sponsored content", "subscribe to access" } + if storage is None: + storage = ScraperStorage() self.storage = storage + self.blocked_domains = self.storage.load_blocked_domains() self.dummy_patterns = self.storage.load_dummy_patterns() @@ -129,7 +134,7 @@ def _get_random_headers(self) -> Dict[str, str]: "Upgrade-Insecure-Requests": "1", } - def _clean_html(self, html: str) -> str: + def clean_html(self, html: str) -> str: """Clean raw HTML by removing non-content elements and ads.""" if not html: return "" @@ -221,31 +226,23 @@ def extract_content(self, url: str) -> str: return "Unsupported file type (non-HTML)" try: - article = Article(url, language="zh") + article = Article(url) article.download() article.parse() - content = article.text.strip() + if article.text: + content = article.text.strip() + else: + # Fall back to HTML scraping if newspaper3k returns empty text + html = self._fetch_original_html(url) + content = self.clean_html(html) except ArticleException as e: logger.warning( "newspaper3k extraction failed: %s - falling back to HTML cleaning", str(e) ) - try: - headers = self._get_random_headers() - response = requests.get( - url, headers=headers, timeout=10, allow_redirects=True - ) - response.encoding = response.apparent_encoding - - if response.status_code != 200: - logger.warning("Failed to retrieve article (status %s): %s", - response.status_code, url) - return "Failed to retrieve content (HTTP error)" - - content = self._clean_html(response.text) - except requests.exceptions.RequestException as e: - logger.error("Request error for %s: %s", url, str(e)) - return "Failed to retrieve content (network error)" + # Fall back to HTML scraping when newspaper3k extraction fails + html = self._fetch_original_html(url) + content = self.clean_html(html) if self._is_dummy_content(content): logger.warning("Dummy content detected at: %s", url) @@ -254,3 +251,42 @@ def extract_content(self, url: str) -> str: return "Content blocked: Contains cookie notices or irrelevant material" return content + + def _fetch_original_html(self, url: str, timeout: int = 10) -> Optional[str]: + """Fetch raw HTML content from a URL with retries. + + Args: + url: URL to fetch HTML from. + timeout: Request timeout in seconds (default: 10). + + Returns: + Raw HTML content as a string, or None if fetch fails.
+ """ + + retries = 3 + headers = self._get_random_headers() + + for attempt in range(retries): + try: + # Add random delay between retries (0.5-2 seconds) + if attempt > 0: + time.sleep(random.uniform(0.5, 2.0)) + + response = requests.get( + url, headers=headers, timeout=timeout, verify=True + ) + response.raise_for_status() + return response.text + except Exception as e: + if attempt < retries - 1: + continue + logger.error(f"Failed to fetch HTML after {retries} retries ({url}): {e}") + return None + + return None + + @staticmethod + def inst(storage: ScraperStorage=None): + if ArticleContentExtractor._instance is None: + ArticleContentExtractor._instance = ArticleContentExtractor(storage) + return ArticleContentExtractor._instance diff --git a/src/gentrade/scraper/search.py b/src/gentrade/scraper/search.py index 4a43e35..0509a4b 100644 --- a/src/gentrade/scraper/search.py +++ b/src/gentrade/scraper/search.py @@ -266,4 +266,5 @@ def search( fetch_content=True, ) + print(json.dumps(news, ensure_ascii=False, indent=2)) diff --git a/tests/test_api_google_news.py b/tests/test_api_google_news.py deleted file mode 100644 index 0596d23..0000000 --- a/tests/test_api_google_news.py +++ /dev/null @@ -1,182 +0,0 @@ -""" -Google News API Test Suite - -This module contains pytest tests to verify functionality of Google's Custom Search API -for retrieving financial and stock-related news. It includes tests for: -- API credential validation -- General financial news retrieval -- Specific stock symbol news retrieval - -Requirements: -- Valid Google Cloud API key with Custom Search API enabled: GOOGLE_API_KEY -- Custom Search Engine ID (CX) configured for news search: CX -- Environment variables stored in .env file -""" - -import os -import requests -import pytest -from dotenv import load_dotenv - -# Load environment variables from .env file -load_dotenv() - -@pytest.fixture(scope="module") -def api_credentials(): - """ - Fixture to validate and provide API credentials. - - Retrieves Google API key and Custom Search Engine ID from environment variables - and performs basic validation. Fails if any credential is missing. - - Returns: - dict: Contains valid API credentials with keys 'api_key' and 'cx' - """ - api_key = os.getenv("GOOGLE_API_KEY") - cx = os.getenv("GOOGLE_CX") - - error_messages = [] - if not api_key: - error_messages.append( - "GOOGLE_API_KEY not found in environment variables. " - "Please check your .env file." - ) - if not cx: - error_messages.append( - "GOOGLE_CX (Custom Search Engine ID) not found. " - "Please check your .env file." - ) - - assert not error_messages, "\n".join(error_messages) - - return { - "api_key": api_key, - "cx": cx - } - - -def test_api_credentials_work(api_credentials): - """ - Test if API credentials are valid and functional. - - Performs a basic test query to verify that the provided API key and - CX ID can successfully authenticate with the Google Custom Search API. - Provides detailed error messages for common authentication issues. - """ - url = "https://www.googleapis.com/customsearch/v1" - test_params = { - "key": api_credentials["api_key"], - "cx": api_credentials["cx"], - "q": "test query", - "num": 1 - } - - response = requests.get(url, params=test_params, timeout=30) - - # Handle common authentication errors with detailed guidance - if response.status_code == 403: - pytest.fail( - "403 Forbidden: Invalid credentials or insufficient permissions.\n" - "Possible fixes:\n" - "1. Verify your API key is correct in .env\n" - "2. 
Ensure Custom Search API is enabled in Google Cloud Console\n" - "3. Check if your API key has IP restrictions that block this request\n" - "4. Confirm your project has billing enabled (required for production use)\n" - f"API Response: {response.text}" - ) - elif response.status_code == 400: - pytest.fail( - f"400 Bad Request: Invalid parameters. Check your CX ID.\n" - f"API Response: {response.text}" - ) - - assert response.status_code == 200, \ - f"API request failed with status code {response.status_code}. Response: {response.text}" - - -def test_get_latest_financial_news(api_credentials): - """ - Test retrieval of latest financial news from Google News. - - Queries the Custom Search API for recent financial news (past 1 week) - and validates the structure and content of returned results. - """ - url = "https://www.googleapis.com/customsearch/v1" - params = { - "key": api_credentials["api_key"], - "cx": api_credentials["cx"], - "q": "finance stock market", - "num": 10, - "dateRestrict": "w1", # Restrict results to past 1 week - "gl": "us", # Focus on United States results - "lr": "lang_en", # Restrict to English language - "siteSearch": "news.google.com", # Search only Google News - "siteSearchFilter": "i" # Include only specified sites - } - - # Execute API request - response = requests.get(url, params=params, timeout=30) - - # Handle specific API errors - if response.status_code == 403: - pytest.fail(f"403 Forbidden: Check API key and permissions. Response: {response.text}") - if response.status_code == 429: - pytest.fail(f"429 Too Many Requests: API quota exceeded. Response: {response.text}") - - # Verify successful response - assert response.status_code == 200, \ - f"API request failed with status code {response.status_code}. Response: {response.text}" - - # Parse and validate response content - results = response.json() - - assert "items" in results, f"No news items found. API response: {results}" - assert len(results["items"]) > 0, "No articles returned from Google News" - - # Validate individual news articles - for item in results["items"][:3]: # Check first 3 articles - assert "title" in item, "News item missing title" - assert "link" in item, "News item missing URL" - assert "snippet" in item, "News item missing snippet" - - -def test_get_specific_stock_news(api_credentials): - """ - Test retrieval of news for specific stock symbols. - - Queries the Custom Search API for news related to major tech stocks - and verifies that returned articles mention the target stock symbol. 
- """ - stock_symbols = ["AAPL", "MSFT", "GOOGL"] - url = "https://www.googleapis.com/customsearch/v1" - - for symbol in stock_symbols: - params = { - "key": api_credentials["api_key"], - "cx": api_credentials["cx"], - "q": f"{symbol} stock news", - "num": 5, - "dateRestrict": "w1", # Restrict to past 1 week - "gl": "us", - "lr": "lang_en", - "siteSearch": "news.google.com", - "siteSearchFilter": "i" - } - - response = requests.get(url, params=params, timeout=30) - - if response.status_code == 403: - pytest.fail(f"403 Forbidden for {symbol}: Check API key and permissions") - - assert response.status_code == 200, \ - f"Failed to get news for {symbol} (status code {response.status_code})" - - results = response.json() - - # Validate stock symbol appears in results when available - if "items" in results and len(results["items"]) > 0: - symbol_in_results = any( - symbol in item["title"].upper() or symbol in item.get("snippet", "").upper() - for item in results["items"] - ) - assert symbol_in_results, f"No results mentioning {symbol} found" diff --git a/tests/test_gentrade_news.py b/tests/test_gentrade_news.py index 52e000b..0fd2859 100644 --- a/tests/test_gentrade_news.py +++ b/tests/test_gentrade_news.py @@ -7,7 +7,6 @@ from gentrade.news.meta import NewsProviderBase from gentrade.news.newsapi import NewsApiProvider from gentrade.news.finnhub import FinnhubNewsProvider -from gentrade.news.googlenews import GoogleNewsProvider from gentrade.news.rss import RssProvider @@ -43,24 +42,6 @@ def test_create_finnhub_missing_key(self): NewsFactory.create_provider("finnhub") assert "FINNHUB_API_KEY" in str(excinfo.value) - @patch.dict(os.environ, { - "GOOGLE_CLOUD_API_KEY": "test_google_key", - "GOOGLE_CSE_ID": "test_cse_id" - }) - def test_create_google_provider(self): - """Test Google News provider creation with valid env vars""" - provider = NewsFactory.create_provider("google") - assert isinstance(provider, GoogleNewsProvider) - assert provider.api_key == "test_google_key" - assert provider.cse_id == "test_cse_id" - - def test_create_google_missing_credentials(self): - """Test Google creation fails with missing credentials""" - with patch.dict(os.environ, {}, clear=True): - with pytest.raises(ValueError) as excinfo: - NewsFactory.create_provider("google") - assert "GOOGLE_CSE_ID" in str(excinfo.value) - def test_create_rss_provider_with_feed_url(self): """Test RSS provider creation with explicit feed URL""" feed_url = "https://test-feed.com/rss" @@ -94,10 +75,6 @@ class TestNewsProvidersCommon: @pytest.fixture(params=[ ("newsapi", NewsApiProvider, {"NEWSAPI_API_KEY": "test_key"}), ("finnhub", FinnhubNewsProvider, {"FINNHUB_API_KEY": "test_key"}), - ("google", GoogleNewsProvider, { - "GOOGLE_CLOUD_API_KEY": "test_key", - "GOOGLE_CSE_ID": "test_id" - }), ("rss", RssProvider, {}) ]) def provider_setup(self, request): @@ -128,8 +105,6 @@ def test_fetch_market_news_returns_list(self, provider_setup): mock_response.json.return_value = {"articles": []} elif provider_setup[0] == "finnhub": mock_response.json.return_value = [] - elif provider_setup[0] == "google": - mock_response.json.return_value = {"items": []} elif provider_setup[0] == "rss": pass # Handled in RSS specific tests @@ -152,9 +127,6 @@ def test_fetch_stock_news_returns_list(self, provider_setup): elif provider_type == "finnhub": # Finnhub returns list directly mock_response.json.return_value = [] - elif provider_type == "google": - # Google returns {"items": [...]} - mock_response.json.return_value = {"items": []} elif provider_type == 
"rss": # RSS uses feedparser, handled separately pass @@ -190,29 +162,6 @@ def test_fetch_market_news_params(self, mock_get, newsapi_provider): assert "from" in params -class TestGoogleNewsProvider: - """Google News-specific test cases""" - - @pytest.fixture - def google_provider(self): - with patch.dict(os.environ, { - "GOOGLE_CLOUD_API_KEY": "test_key", - "GOOGLE_CSE_ID": "test_id" - }): - return NewsFactory.create_provider("google") - - @patch("gentrade.news.googlenews.requests.get") - def test_fetch_stock_news_query(self, mock_get, google_provider): - """Test Google News uses correct stock query""" - mock_get.return_value = Mock(status_code=200, json=lambda: {"items": []}) - google_provider.fetch_stock_news(ticker="MSFT", max_count=3) - - _, kwargs = mock_get.call_args - params = kwargs["params"] - assert params["q"] == "MSFT stock news" - assert params["num"] == 3 - - class TestRssProvider: """RSS Provider-specific test cases""" @@ -268,17 +217,13 @@ def test_company_news_endpoint(self, mock_get, finnhub_provider): class TestProviderErrorHandling: """Tests for provider error handling""" - @pytest.fixture(params=["newsapi", "finnhub", "google"]) + @pytest.fixture(params=["newsapi", "finnhub"]) def api_provider(self, request): """Fixture for API-based providers (non-RSS)""" provider_type = request.param env_vars = { "newsapi": {"NEWSAPI_API_KEY": "test"}, - "finnhub": {"FINNHUB_API_KEY": "test"}, - "google": { - "GOOGLE_CLOUD_API_KEY": "test", - "GOOGLE_CSE_ID": "test" - } + "finnhub": {"FINNHUB_API_KEY": "test"} }[provider_type] with patch.dict(os.environ, env_vars):