diff --git a/src/gentrade/news/factory.py b/src/gentrade/news/factory.py index 1dfa8eb..9aa9fd5 100644 --- a/src/gentrade/news/factory.py +++ b/src/gentrade/news/factory.py @@ -12,13 +12,12 @@ from typing import List, Optional from loguru import logger -from gentrade.scraper.extractor import ArticleContentExtractor - from gentrade.news.meta import NewsProviderBase, NewsDatabase, NewsFileDatabase -from gentrade.news.newsapi import NewsApiProvider -from gentrade.news.rss import RssProvider -from gentrade.news.finnhub import FinnhubNewsProvider - +from gentrade.news.providers.newsapi import NewsApiProvider +from gentrade.news.providers.rss import RssProvider +from gentrade.news.providers.finnhub import FinnhubNewsProvider +from gentrade.news.providers.newsnow import NewsNowProvider +from gentrade.utils.download import ArticleDownloader class NewsFactory: """Factory class for creating news provider instances based on provider type. @@ -47,7 +46,8 @@ def create_provider(provider_type: str, **kwargs) -> NewsProviderBase: providers = { "newsapi": NewsApiProvider, "finnhub": FinnhubNewsProvider, - "rss": RssProvider + "rss": RssProvider, + "newsnow": NewsNowProvider } provider_class = providers.get(provider_type_lower) @@ -70,6 +70,10 @@ def create_provider(provider_type: str, **kwargs) -> NewsProviderBase: feed_url = kwargs.get("feed_url", os.getenv("RSS_FEED_URL")) return provider_class(feed_url=feed_url) + if provider_type_lower == "newsnow": + source = kwargs.get("source", "baidu") + return provider_class(source=source) + return provider_class(**kwargs) class NewsAggregator: @@ -79,7 +83,7 @@ class NewsAggregator: and stores results in a database. Includes logic to avoid frequent syncs. """ - def __init__(self, providers: List[NewsProviderBase], db: NewsDatabase): + def __init__(self, providers: List[NewsProviderBase], db: NewsDatabase = None): """Initialize the NewsAggregator with a list of providers and a database. Args: @@ -91,7 +95,7 @@ def __init__(self, providers: List[NewsProviderBase], db: NewsDatabase): self.db_lock = threading.Lock() def _fetch_thread(self, provider, aggregator, ticker, category, - max_hour_interval, max_count, is_process=False): + max_hour_interval, max_count, process_content=True): if ticker: news = provider.fetch_stock_news( ticker, category, max_hour_interval, max_count @@ -109,21 +113,26 @@ def _fetch_thread(self, provider, aggregator, ticker, category, f"{provider.__class__.__name__}" ) - ace = ArticleContentExtractor.inst() + downloader = ArticleDownloader.inst() for item in news: - item.summary = ace.clean_html(item.summary) - if is_process: - item.content = ace.extract_content(item.url) + item.summary = downloader.clean_html(item.summary) + if process_content: + logger.info(f"Process content ... {item.url}") + item.content = downloader.get_content(item.url) + if item.content: + logger.info(f"Content: {item.content[:20]}") - with aggregator.db_lock: - aggregator.db.add_news(news) + if self.db: + with aggregator.db_lock: + aggregator.db.add_news(news) def sync_news( self, ticker: Optional[str] = None, category: str = "business", max_hour_interval: int = 24, - max_count: int = 10 + max_count: int = 10, + process_content: bool = True ) -> None: """Synchronize news from providers, skipping if last sync was within 1 hour. @@ -136,21 +145,24 @@ def sync_news( max_hour_interval: Maximum age (in hours) of news articles to fetch (default: 24). max_count: Maximum number of articles to fetch per provider (default: 10). 
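# Usage sketch (illustrative, not part of the patch): how the extended factory and the
# now-optional database are expected to fit together, assuming the module layout above.
# The provider source and sync parameters are placeholder values.
from gentrade.news.factory import NewsAggregator, NewsFactory

# "newsnow" now resolves to NewsNowProvider; `source` falls back to "baidu" when omitted.
provider = NewsFactory.create_provider("newsnow", source="jin10")

# With db left as None the aggregator only fetches and processes articles: the
# one-hour last-sync check and the add_news()/save() persistence calls are skipped.
aggregator = NewsAggregator(providers=[provider])
aggregator.sync_news(category="business", max_hour_interval=24, max_count=5,
                     process_content=False)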
""" - current_time = time.time() - if current_time < self.db.last_sync + 3600: - logger.info("Skipping sync: Last sync was less than 1 hour ago.") - return + if self.db: + current_time = time.time() + if current_time < self.db.last_sync + 3600: + logger.info("Skipping sync: Last sync was less than 1 hour ago.") + return logger.info("Starting news sync...") threads = [] for provider in self.providers: if not provider.is_available: + logger.error(f"Provider {provider.__class__.__name__} is not available") continue thread = threading.Thread( target=self._fetch_thread, - args=(provider, self, ticker, category, max_hour_interval, max_count) + args=(provider, self, ticker, category, max_hour_interval, + max_count, process_content) ) threads.append(thread) thread.start() @@ -158,8 +170,10 @@ def sync_news( for thread in threads: thread.join() - self.db.last_sync = current_time - self.db.save() + if self.db: + self.db.last_sync = current_time + self.db.save() + logger.info("News sync completed.") if __name__ == "__main__": @@ -170,18 +184,24 @@ def sync_news( newsapi_provider = NewsFactory.create_provider("newsapi") finnhub_provider = NewsFactory.create_provider("finnhub") rss_provider = NewsFactory.create_provider("rss") + newsnow_provider = NewsFactory.create_provider("newsnow", source="jin10") # Create aggregator with selected providers aggregator = NewsAggregator( - providers=[rss_provider, newsapi_provider, finnhub_provider], db=db) + providers=[newsnow_provider, finnhub_provider, rss_provider, newsapi_provider], db=db) # Sync market news and stock-specific news - aggregator.sync_news(category="business", max_hour_interval=64, max_count=10) + aggregator.sync_news( + category="business", + max_hour_interval=64, + max_count=10, + process_content = True) aggregator.sync_news( ticker="AAPL", category="business", max_hour_interval=240, - max_count=10 + max_count=10, + process_content = True ) # Log results diff --git a/src/gentrade/news/meta.py b/src/gentrade/news/meta.py index 5393e0b..1802307 100644 --- a/src/gentrade/news/meta.py +++ b/src/gentrade/news/meta.py @@ -12,14 +12,14 @@ import abc import time import hashlib -from typing import Dict, List, Any, Optional + +from typing import Dict, List, Any from datetime import datetime from dataclasses import dataclass from loguru import logger -import requests NEWS_MARKET = [ - 'us', 'zh', 'hk', 'cypto', 'common' + 'us', 'cn', 'hk', 'cypto', 'common' ] @dataclass @@ -35,7 +35,7 @@ class NewsInfo: summary: str url: str content: str - provider: str # provder like newsapi, finnhub, rss + provider: str # provider like newsapi, finnhub, rss market: str # market type like us, chn, eur, hk, crypto def to_dict(self) -> Dict[str, Any]: @@ -59,27 +59,6 @@ def to_dict(self) -> Dict[str, Any]: "market": self.market, } - def fetch_article_html(self) -> Optional[str]: - """Fetch raw HTML content from the article's direct URL. - - Uses a browser-like user agent to avoid being blocked by servers. - - Returns: - Raw HTML string if successful; None if request fails. 
- """ - headers = { - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " - "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" - } - - try: - response = requests.get(self.url, headers=headers, timeout=15) - response.raise_for_status() - return response.text - except requests.RequestException as e: - logger.debug(f"Failed to fetch HTML for {self.url}: {e}") - return None - class NewsProviderBase(metaclass=abc.ABCMeta): """Abstract base class defining the interface for news providers. @@ -278,11 +257,11 @@ def save(self): "last_sync": self.last_sync, "news_list": news_dicts } - with open(self._filepath, 'w', encoding='utf-8') as f: - json.dump(content, f, indent=4) # indent for readability + with open(self._filepath, 'w', encoding="utf-8") as f: + json.dump(content, f, ensure_ascii=False, indent=4) # indent for readability def load(self): - with open(self._filepath, 'r', encoding='utf-8') as f: + with open(self._filepath, 'r', encoding="utf-8") as f: content = json.load(f) # Directly loads JSON content into a Python list/dict self.last_sync = content['last_sync'] self.news_list = [NewsInfo(**item_dict) for item_dict in content['news_list']] diff --git a/src/gentrade/news/providers/__init__.py b/src/gentrade/news/providers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/gentrade/news/finnhub.py b/src/gentrade/news/providers/finnhub.py similarity index 100% rename from src/gentrade/news/finnhub.py rename to src/gentrade/news/providers/finnhub.py diff --git a/src/gentrade/news/newsapi.py b/src/gentrade/news/providers/newsapi.py similarity index 100% rename from src/gentrade/news/newsapi.py rename to src/gentrade/news/providers/newsapi.py diff --git a/src/gentrade/news/providers/newsnow.py b/src/gentrade/news/providers/newsnow.py new file mode 100644 index 0000000..52e0d74 --- /dev/null +++ b/src/gentrade/news/providers/newsnow.py @@ -0,0 +1,179 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +gentrade - NewsNow News Provider Module + +Project: gentrade +Module: news.providers.newsnow +Description: + Implementation of the NewsNow news provider for fetching real-time market news + from the NewsNow API endpoint. This module inherits from NewsProviderBase and + implements core methods for news retrieval, parsing, and filtering across + multiple supported sources (e.g., baidu, zhihu, weibo). 
+ +Key Features: + - Source-specific news fetching from 38+ supported platforms + - Automatic news parsing into standardized NewsInfo objects + - Time-based and count-based news filtering + - Jittered exponential backoff for retry logic + - Robust error handling and logging + - Compatibility with China (cn) market news by default +""" + +import time +import random +from typing import List +from loguru import logger + +from gentrade.news.meta import NewsProviderBase, NewsInfo +from gentrade.utils.download import HttpDownloader + +# Supported news sources for NewsNow provider (38+ platforms) +AVAILABLE_SOURCE = [ + 'baidu', 'bilibili', 'cankaoxiaoxi', 'chongbuluo', 'douban', 'douyin', + 'fastbull', 'freebuf', 'gelonghui', 'ghxi', 'github', 'hackernews', + 'hupu', 'ifeng', 'ithome', 'jin10', 'juejin', 'kaopu', 'kuaishou', + 'linuxdo', 'mktnews', 'nowcoder', 'pcbeta', 'producthunt', 'smzdm', + 'solidot', 'sputniknewscn', 'sspai', 'steam', 'tencent', 'thepaper', + 'tieba', 'toutiao', 'v2ex', 'wallstreetcn', 'weibo', 'xueqiu', 'zaobao', + 'zhihu' +] + + +class NewsNowProvider(NewsProviderBase): + """News provider for fetching real-time market news from NewsNow service. + + Inherits from NewsProviderBase and implements abstract methods to fetch + categorized market news using the NewsNow API endpoint with source-specific + configurations. + """ + + def __init__(self, source: str = "baidu"): + """Initialize NewsNowProvider with specified news source. + + Args: + source: Platform identifier (from AVAILABLE_SOURCE) used in API request + """ + self.source = source + self.url = f"https://newsnow.busiyi.world/api/s?id={self.source}&latest" + + @property + def market(self) -> str: + """Override market property to specify target market (China).""" + return "cn" # Target market: China (adjustable for other regions) + + def fetch_latest_market_news( + self, + category: str = "business", + max_hour_interval: int = 24, + max_count: int = 10 + ) -> List[NewsInfo]: + """Fetch and filter latest market news from NewsNow service. + + Args: + category: News category filter (unused by NewsNow API, kept for compat) + max_hour_interval: Max age (hours) of articles to include + max_count: Maximum number of articles to return + + Returns: + List of NewsInfo objects filtered by time and count constraints + """ + # Fetch raw JSON data from NewsNow API endpoint + response = HttpDownloader.inst().get(self.url) + if not response: + logger.warning(f"Empty response from NewsNow API (source: {self.source})") + return [] + + # Parse raw response to NewsInfo objects and apply filters + news_list = self._parse_news(response.json()) + filtered_news = self.filter_news(news_list, max_hour_interval, max_count) + + logger.info(f"Fetched {len(filtered_news)} news items (source: {self.source})") + return filtered_news + + @staticmethod + def _calculate_retry_wait( + retry_number: int, + min_wait: int = 3, + max_wait: int = 5 + ) -> float: + """Calculate exponential backoff wait time for request retries. + + Implements jittered exponential backoff to avoid thundering herd effect. 
+ + Args: + retry_number: Current retry attempt number (starting at 1) + min_wait: Minimum base wait time in seconds + max_wait: Maximum base wait time in seconds + + Returns: + Calculated wait time in seconds (with random jitter) + """ + base_wait = random.uniform(min_wait, max_wait) + additional_wait = (retry_number - 1) * random.uniform(1, 2) + return base_wait + additional_wait + + def _parse_news(self, raw_data: dict) -> List[NewsInfo]: + """Parse raw NewsNow API JSON response into NewsInfo objects. + + Extracts and normalizes news fields with fallbacks for missing values, + handles time conversion, and generates unique IDs from URLs. + + Args: + raw_data: Parsed JSON dictionary from NewsNow API response + + Returns: + List of valid NewsInfo objects (skipped invalid/corrupted items) + """ + news_items = [] + for item in raw_data.get("items", []): + try: + # Extract URL with mobile fallback (critical field) + url = item.get("url", "") or item.get("mobileUrl", "") + if not url: + logger.warning("Skipping news item - no URL found") + continue + + # Convert publication time to epoch timestamp (fallback: current time) + pub_time = item.get("pubTime", "") + datetime_epoch = ( + self._timestamp_to_epoch(pub_time) if pub_time else int(time.time()) + ) + + # Create normalized NewsInfo object with default values + news_info = NewsInfo( + category=item.get("category", "general"), + datetime=datetime_epoch, + headline=item.get("title", "No headline"), + id=self.url_to_hash_id(url), # Unique ID from URL hash + image=item.get("image", ""), + related=item.get("related", []), + source=item.get("source", self.source), + summary=item.get("summary", ""), + url=url, + content=item.get("content", ""), + provider="newsnow", + market=self.market + ) + news_items.append(news_info) + + except Exception as e: + logger.error(f"Failed to parse news item (source: {self.source}): {str(e)}") + continue + + return news_items + + +if __name__ == "__main__": + logger.info("Starting NewsNowProvider test for all available sources...") + + for source in AVAILABLE_SOURCE: + try: + provider = NewsNowProvider(source) + news_items = provider.fetch_latest_market_news() + logger.info(f"Source {source}: Found {len(news_items)} news items") + time.sleep(1) # Rate limiting for test execution + except Exception as e: + logger.error(f"Test failed for source {source}: {str(e)}") + + logger.info("NewsNowProvider test completed") diff --git a/src/gentrade/news/rss.py b/src/gentrade/news/providers/rss.py similarity index 100% rename from src/gentrade/news/rss.py rename to src/gentrade/news/providers/rss.py diff --git a/src/gentrade/utils/__init__.py b/src/gentrade/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/gentrade/scraper/extractor.py b/src/gentrade/utils/download.py similarity index 54% rename from src/gentrade/scraper/extractor.py rename to src/gentrade/utils/download.py index 12a91b8..e803f30 100644 --- a/src/gentrade/scraper/extractor.py +++ b/src/gentrade/utils/download.py @@ -1,33 +1,215 @@ """ -Baidu Search Scraper with Article Extraction - -This module provides tools for: -1. Persistent storage of blocked domains and dummy content patterns. -2. Cleaning and extracting article content while filtering irrelevant material. -3. Scraping Baidu search results and enriching them with extracted article content. 
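# Illustrative sketch: the response shape that _parse_news() above appears to expect.
# The field names are inferred from the parsing code, not from official NewsNow API
# documentation, so treat this payload as an assumption.
from gentrade.news.providers.newsnow import NewsNowProvider

sample_payload = {
    "items": [
        {
            "title": "宁德时代发布三季度财报",
            "url": "https://example.com/news/123",  # empty "url" falls back to "mobileUrl"
            "pubTime": "",                          # empty -> provider stamps current time
            "source": "jin10",
        }
    ]
}
news_items = NewsNowProvider("jin10")._parse_news(sample_payload)  # -> List[NewsInfo]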
+HTTP Downloader Utility + +Description: + A robust HTTP downloader implementation with singleton pattern, automatic retry mechanism, + randomized User-Agent spoofing, and proxy support from environment variables. + + Key Features: + - Singleton pattern for consistent configuration across application + - Configurable retry attempts and request timeout + - Random User-Agent rotation to mimic different browsers + - Proxy configuration loaded from standard environment variables + - Comprehensive error logging with backoff timing + - SSL certificate verification for secure requests + +Usage Example: + >>> downloader = HttpDownloader.inst() + >>> content = downloader.get("https://example.com") + >>> if content: + >>> print("Content downloaded successfully") + +Dependencies: + - requests >= 2.25.1 + - loguru >= 0.7.0 + - Python >= 3.8 """ - -import json -import logging +import re import os import random -import re import time - -from typing import Dict, List, Optional +import json +from typing import Dict, List from urllib.parse import urlparse import requests +from loguru import logger + from bs4 import BeautifulSoup, Comment from newspaper import Article from newspaper.article import ArticleException -# Configure logging -logging.basicConfig( - level=logging.INFO, - format="%(asctime)s - %(levelname)s - %(message)s" -) -logger = logging.getLogger(__name__) +class HttpDownloader: + """HTTP Downloader with retry mechanism, random User-Agent, and proxy support + + Implements singleton pattern for consistent HTTP GET requests with: + - Automatic retry on failure (configurable max retries) + - Randomized User-Agent to mimic different browsers + - Proxy configuration loaded from environment variables + - Timeout control for request safety + """ + # Singleton instance storage + _INSTANCE = None + + def __init__(self, max_retries: int = 3, timeout: int = 5): + """Initialize downloader configuration + + Args: + max_retries: Maximum retry attempts on failure (default: 3) + timeout: Request timeout in seconds (default: 10) + """ + self.max_retries = max_retries # Max retry attempts for failed requests + self.timeout = timeout # Request timeout threshold (seconds) + + @property + def http_headers(self) -> Dict: + """Generate randomized HTTP request headers + + Returns: + Dictionary of HTTP headers with random User-Agent + """ + # List of common browser User-Agents for request spoofing + user_agents = [ + ( + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " + "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36" + ), + ( + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " + "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.5 Safari/605.1.15" + ), + ( + "Mozilla/5.0 (X11; Linux x86_64) " + "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36" + ), + ] + + # Construct headers with random User-Agent selection + return { + "User-Agent": random.choice(user_agents), + "Accept": ( + "text/html,application/xhtml+xml,application/xml;q=0.9," + "image/avif,image/webp,*/*;q=0.8" + ), + "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3", + "Connection": "keep-alive", + "Upgrade-Insecure-Requests": "1", + } + + @property + def proxies(self) -> Dict: + """Load proxy configuration from environment variables + + Supported environment variables (case-insensitive): + - http_proxy / HTTP_PROXY + - https_proxy / HTTPS_PROXY + - no_proxy / NO_PROXY + + Returns: + Dictionary of proxy configurations (empty if no proxies set) + """ + proxy_config = {} + # Check all standard proxy 
environment variables + proxy_env_keys = [ + 'http_proxy', 'https_proxy', 'no_proxy', + 'HTTP_PROXY', 'HTTPS_PROXY', 'NO_PROXY' + ] + + for key in proxy_env_keys: + env_value = os.environ.get(key) + if env_value: + proxy_config[key] = env_value + + return proxy_config + + def get(self, url: str, verify: bool = True, params: Dict = None) -> requests.Response: + """Send HTTP GET request with automatic retry mechanism + + Args: + url: Target URL to retrieve content from + + Returns: + Response text if successful, None if all retries fail + """ + retry_count = 0 # Current retry attempt counter + + logger.debug(f"Http download {url} {verify} {params} ") + # Retry loop until max retries or successful response + while retry_count <= self.max_retries: + try: + # Send GET request with configured headers/proxies/timeout + response = requests.get( + url, + proxies=self.proxies, + headers=self.http_headers, + timeout=self.timeout, + params=params, + verify=verify # Enable SSL certificate verification + ) + + # Raise exception for HTTP error status codes (4xx/5xx) + response.raise_for_status() + return response + except Exception as e: + logger.error(e) + retry_count += 1 # Increment retry counter on failure + + # Final retry failed - log error and return None + if retry_count > self.max_retries: + logger.error( + f"Failed to download URL after {self.max_retries} retries: {e} | URL: {url}" + ) + return None + + # Calculate random backoff time (0.5-2.0s) to avoid rate limiting + backoff_time = random.uniform(0.5, 2.0) + logger.error( + f"Request failed (attempt {retry_count}/{self.max_retries}): {e}. " + f"Retrying in {backoff_time:.2f} seconds..." + ) + time.sleep(backoff_time) # Wait before next retry attempt + + return None + + def clean_html(self, html: str) -> str: + """Clean raw HTML by removing non-content elements and ads.""" + if not html: + return "" + + soup = BeautifulSoup(html, "html.parser") + + for tag in soup( + ["script", "style", "noscript", "iframe", "aside", "nav", "footer"] + ): + tag.decompose() + + for comment in soup.find_all(text=lambda t: isinstance(t, Comment)): + comment.extract() + + ad_selectors = [ + "div[class*='ad']", "div[id*='ad']", + "div[class*='advert']", "div[id*='advert']", + "div[class*='推广']", "div[id*='推广']", + ] + for selector in ad_selectors: + for tag in soup.select(selector): + tag.decompose() + + text = soup.get_text() + return re.sub(r"\s+", " ", text).strip() + + @staticmethod + def inst() -> "HttpDownloader": + """Get singleton instance of HttpDownloader + + Implements lazy initialization - creates instance only on first call + + Returns: + Singleton HttpDownloader instance + """ + if HttpDownloader._INSTANCE is None: + HttpDownloader._INSTANCE = HttpDownloader() + return HttpDownloader._INSTANCE class ScraperStorage: @@ -90,12 +272,13 @@ def save_dummy_patterns(self, dummy_patterns: List[str]): logger.error("Failed to save dummy patterns: %s", str(e)) -class ArticleContentExtractor: +class ArticleDownloader(HttpDownloader): """Handles article content extraction with dummy content filtering.""" - _instance = None + _INSTANCE = None def __init__(self, storage: ScraperStorage=None): + super().__init__() self.ignored_extensions = ( ".pdf", ".doc", ".docx", ".xls", ".xlsx", ".zip", ".rar", ".jpg", ".png", ".jpeg", ".gif" @@ -114,53 +297,6 @@ def __init__(self, storage: ScraperStorage=None): self.blocked_domains = self.storage.load_blocked_domains() self.dummy_patterns = self.storage.load_dummy_patterns() - self.user_agents = [ - ("Mozilla/5.0 (Windows NT 
10.0; Win64; x64) " - "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"), - ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " - "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.5 Safari/605.1.15"), - ("Mozilla/5.0 (X11; Linux x86_64) " - "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36"), - ] - - def _get_random_headers(self) -> Dict[str, str]: - """Generate random browser-like headers.""" - return { - "User-Agent": random.choice(self.user_agents), - "Accept": ("text/html,application/xhtml+xml,application/xml;q=0.9," - "image/avif,image/webp,*/*;q=0.8"), - "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3", - "Connection": "keep-alive", - "Upgrade-Insecure-Requests": "1", - } - - def clean_html(self, html: str) -> str: - """Clean raw HTML by removing non-content elements and ads.""" - if not html: - return "" - - soup = BeautifulSoup(html, "html.parser") - - for tag in soup( - ["script", "style", "noscript", "iframe", "aside", "nav", "footer"] - ): - tag.decompose() - - for comment in soup.find_all(text=lambda t: isinstance(t, Comment)): - comment.extract() - - ad_selectors = [ - "div[class*='ad']", "div[id*='ad']", - "div[class*='advert']", "div[id*='advert']", - "div[class*='推广']", "div[id*='推广']", - ] - for selector in ad_selectors: - for tag in soup.select(selector): - tag.decompose() - - text = soup.get_text() - return re.sub(r"\s+", " ", text).strip() - def _is_dummy_content(self, content: str) -> bool: """Check if content contains dummy patterns or keywords.""" if not content: @@ -214,8 +350,8 @@ def _add_dummy_content_pattern(self, content: str): self.storage.save_dummy_patterns(self.dummy_patterns) - def extract_content(self, url: str) -> str: - """Extract article content with dummy filtering and blocklisting.""" + def get_content(self, url: str, verify: bool=True, params: Dict = None) -> str: + """Get article content with dummy filtering and blocklisting.""" if self._is_domain_blocked(url): logger.warning("Content source blocked: %s", url) return "Content source blocked: Previously detected irrelevant content" @@ -225,68 +361,32 @@ def extract_content(self, url: str) -> str: logger.warning("Skipping non-HTML file: %s", url) return "Unsupported file type (non-HTML)" + resp = super().get(url, verify, params) + if not resp: + return None + try: - article = Article(url) - article.download() + article = Article(url, language='zh') + article.set_html(resp.text) article.parse() - if article.text: - content = article.text.strip() - else: - # Fallback to HTML scrapping if newspaper3k returns empty text - html = self._fetch_original_html(url) - content = self.clean_html(html) + content = article.text except ArticleException as e: logger.warning( "newspaper3k extraction failed: %s - falling back to HTML cleaning", str(e) ) - # Fallback to HTML scrapping if newspaper3k returns empty text - html = self._fetch_original_html(url) - content = self.clean_html(html) + content = self.clean_html(resp.text) if self._is_dummy_content(content): logger.warning("Dummy content detected at: %s", url) self._block_domain(url) self._add_dummy_content_pattern(content) - return "Content blocked: Contains cookie notices or irrelevant material" + return None return content - def _fetch_original_html(self, url: str, timeout: int = 10) -> Optional[str]: - """Fetch raw HTML content from a URL with retries. - - Args: - url: URL to fetch HTML from. - timeout: Request timeout in seconds (default: 10). 
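# Illustrative sketch of the renamed API: get_content() now returns None when the
# download fails or dummy content is detected, so callers should guard for it.
# The URL below is a placeholder.
from gentrade.utils.download import ArticleDownloader

downloader = ArticleDownloader.inst()
content = downloader.get_content("https://example.com/some-article")
if content:
    print(f"Extracted {len(content)} characters")
else:
    print("No usable article content")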
- - Returns: - Raw HTML content as a string, or None if fetch fails. - """ - - retries = 3 - headers = self._get_random_headers() - - for attempt in range(retries): - try: - # Add random delay between retries (0.5-2 seconds) - if attempt > 0: - time.sleep(random.uniform(0.5, 2.0)) - - response = requests.get( - url, headers=headers, timeout=timeout, verify=True - ) - response.raise_for_status() - return response.text - except Exception as e: - if attempt < retries - 1: - continue - logger.error(f"Failed to fetch HTML after {retries} retries ({url}): {e}") - return None - - return None - @staticmethod def inst(storage: ScraperStorage=None): - if ArticleContentExtractor._instance is None: - ArticleContentExtractor._instance = ArticleContentExtractor(storage) - return ArticleContentExtractor._instance + if ArticleDownloader._INSTANCE is None: + ArticleDownloader._INSTANCE = ArticleDownloader(storage) + return ArticleDownloader._INSTANCE diff --git a/src/gentrade/scraper/search.py b/src/gentrade/utils/search.py similarity index 91% rename from src/gentrade/scraper/search.py rename to src/gentrade/utils/search.py index 0509a4b..12cb658 100644 --- a/src/gentrade/scraper/search.py +++ b/src/gentrade/utils/search.py @@ -7,7 +7,6 @@ """ import json -import logging import random import re import time @@ -16,19 +15,11 @@ import requests from bs4 import BeautifulSoup +from loguru import logger -from gentrade.scraper.extractor import ScraperStorage, ArticleContentExtractor - - -# pylint: disable=too-many-locals,too-many-statements,too-many-branches,possibly-used-before-assignment - -# Configure logging -logging.basicConfig( - level=logging.INFO, - format="%(asctime)s - %(levelname)s - %(message)s" -) -logger = logging.getLogger(__name__) +from gentrade.utils.download import ArticleDownloader +# pylint: disable=too-many-branches,too-many-locals,too-many-statements class BaiduSearchScraper: """Scrapes Baidu search results and extracts structured article data.""" @@ -36,19 +27,8 @@ class BaiduSearchScraper: def __init__(self) -> None: """Initialize scraper with user agents, storage, and regex patterns.""" self.base_url = "https://www.baidu.com/s" - self.user_agents = [ - ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) " - "AppleWebKit/537.36 (KHTML, like Gecko) " - "Chrome/114.0.0.0 Safari/537.36"), - ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " - "AppleWebKit/605.1.15 (KHTML, like Gecko) " - "Version/16.5 Safari/605.1.15"), - ("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 " - "(KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36") - ] - self.storage = ScraperStorage() - self.content_extractor = ArticleContentExtractor(self.storage) + self.content_downloader = ArticleDownloader() self.time_patterns = { "minute": re.compile(r"(\d+)\s*分钟前"), @@ -64,20 +44,16 @@ def __init__(self) -> None: ), } - def _get_random_headers(self) -> Dict[str, str]: - """Generate random HTTP headers for requests.""" - return { - "User-Agent": random.choice(self.user_agents), - "Accept": ("text/html,application/xhtml+xml,application/xml;" - "q=0.9,image/avif,image/webp,*/*;q=0.8"), - "Accept-Language": ( - "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5," - "en-US;q=0.3,en;q=0.2" - ), - "Connection": "keep-alive", - "Referer": "https://www.baidu.com/", - "Upgrade-Insecure-Requests": "1", - } + self.user_agents = [ + ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) " + "AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/114.0.0.0 Safari/537.36"), + ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " + "AppleWebKit/605.1.15 (KHTML, like 
Gecko) " + "Version/16.5 Safari/605.1.15"), + ("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36") + ] def _parse_time_to_timestamp(self, time_text: str) -> int: """Convert a time string into a Unix timestamp.""" @@ -92,6 +68,7 @@ def _parse_time_to_timestamp(self, time_text: str) -> int: if match: try: num = int(match.group(1)) + dt = None if unit == "minute": dt = now - timedelta(minutes=num) elif unit == "hour": @@ -104,7 +81,9 @@ def _parse_time_to_timestamp(self, time_text: str) -> int: dt = now - timedelta(days=num * 30) elif unit == "year": dt = now - timedelta(days=num * 365) - return int(dt.timestamp()) + if dt: + return int(dt.timestamp()) + continue except Exception: continue @@ -129,6 +108,21 @@ def _parse_time_to_timestamp(self, time_text: str) -> int: logger.warning("Unrecognized time format: %s", time_text) return int(time.time()) + def _get_random_headers(self) -> Dict[str, str]: + """Generate random HTTP headers for requests.""" + return { + "User-Agent": random.choice(self.user_agents), + "Accept": ("text/html,application/xhtml+xml,application/xml;" + "q=0.9,image/avif,image/webp,*/*;q=0.8"), + "Accept-Language": ( + "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5," + "en-US;q=0.3,en;q=0.2" + ), + "Connection": "keep-alive", + "Referer": "https://www.baidu.com/", + "Upgrade-Insecure-Requests": "1", + } + def search( self, query: str, @@ -219,7 +213,7 @@ def search( content = "" if fetch_content and url: - content = self.content_extractor.extract_content(url) + content = self.content_downloader.get_content(url) time.sleep(random.uniform(0.5, 1.5)) results.append({ @@ -235,8 +229,7 @@ def search( logger.error("Error parsing result: %s", str(e)) continue - logger.info("Fetched page %d - total results: %d", - current_page, len(results)) + logger.info(f"Fetched page {current_page} - total results: {len(results)}") next_page = soup.select_one("a.n") if not next_page: @@ -257,7 +250,6 @@ def search( if __name__ == "__main__": - logging.basicConfig(level=logging.DEBUG) scraper = BaiduSearchScraper() news = scraper.search( query="最近24小时关于TESLA的财经新闻", @@ -266,5 +258,4 @@ def search( fetch_content=True, ) - - print(json.dumps(news, ensure_ascii=False, indent=2)) + logger.info(json.dumps(news, ensure_ascii=False, indent=2)) diff --git a/tests/test_gentrade_news.py b/tests/test_gentrade_news.py index 0fd2859..94952cd 100644 --- a/tests/test_gentrade_news.py +++ b/tests/test_gentrade_news.py @@ -1,254 +1,48 @@ -import os -from datetime import datetime, timedelta -from unittest.mock import patch, Mock import pytest -import requests -from gentrade.news.factory import NewsFactory -from gentrade.news.meta import NewsProviderBase -from gentrade.news.newsapi import NewsApiProvider -from gentrade.news.finnhub import FinnhubNewsProvider -from gentrade.news.rss import RssProvider +from loguru import logger +from gentrade.news.factory import NewsAggregator, NewsFactory -# ------------------------------ NewsFactory Tests ------------------------------ -class TestNewsFactory: - """Tests for NewsFactory provider creation logic""" +from gentrade.news.meta import NewsFileDatabase +from gentrade.news.providers.newsnow import AVAILABLE_SOURCE - @patch.dict(os.environ, {"NEWSAPI_API_KEY": "test_newsapi_key"}) - def test_create_newsapi_provider(self): - """Test NewsAPI provider creation with valid env var""" - provider = NewsFactory.create_provider("newsapi") - assert isinstance(provider, NewsApiProvider) - assert provider.api_key == "test_newsapi_key" 
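# Standalone illustration of the relative-time parsing performed by
# _parse_time_to_timestamp() above ("N 分钟前" / "N 小时前" style strings), simplified
# to two units rather than the scraper's full pattern table.
import re
import time
from datetime import datetime, timedelta

def to_epoch(time_text: str) -> int:
    patterns = {
        "minute": re.compile(r"(\d+)\s*分钟前"),
        "hour": re.compile(r"(\d+)\s*小时前"),
    }
    now = datetime.now()
    for unit, pattern in patterns.items():
        match = pattern.search(time_text)
        if match:
            num = int(match.group(1))
            delta = timedelta(minutes=num) if unit == "minute" else timedelta(hours=num)
            return int((now - delta).timestamp())
    return int(time.time())  # unrecognised format: fall back to "now", as the scraper does

print(to_epoch("3 小时前"))  # roughly the current time minus three hours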
+@pytest.mark.parametrize("provider_name", + [ "newsapi", "finnhub", "rss", "newsnow"]) +def test_provider_basic(provider_name:str): + db = NewsFileDatabase("news_db.txt") - def test_create_newsapi_missing_key(self): - """Test NewsAPI creation fails with missing API key""" - with patch.dict(os.environ, {}, clear=True): - with pytest.raises(ValueError) as excinfo: - NewsFactory.create_provider("newsapi") - assert "NEWSAPI_API_KEY" in str(excinfo.value) + provider = NewsFactory.create_provider(provider_name) + aggregator = NewsAggregator([ provider], db) + aggregator.sync_news( + category="business", + max_hour_interval=64, + max_count=10, + process_content = True) - @patch.dict(os.environ, {"FINNHUB_API_KEY": "test_finnhub_key"}) - def test_create_finnhub_provider(self): - """Test Finnhub provider creation with valid env var""" - provider = NewsFactory.create_provider("finnhub") - assert isinstance(provider, FinnhubNewsProvider) - assert provider.api_key == "test_finnhub_key" + # Log results + all_news = db.get_all_news() + logger.info(f"Total articles in database: {len(all_news)}") - def test_create_finnhub_missing_key(self): - """Test Finnhub creation fails with missing API key""" - with patch.dict(os.environ, {}, clear=True): - with pytest.raises(ValueError) as excinfo: - NewsFactory.create_provider("finnhub") - assert "FINNHUB_API_KEY" in str(excinfo.value) + for news_item in all_news: + logger.info("[%s...]: %s..." % (str(news_item.id)[:10], news_item.headline[:15])) - def test_create_rss_provider_with_feed_url(self): - """Test RSS provider creation with explicit feed URL""" - feed_url = "https://test-feed.com/rss" - provider = NewsFactory.create_provider("rss", feed_url=feed_url) - assert isinstance(provider, RssProvider) - assert provider.feed_url == feed_url - @patch.dict(os.environ, {"RSS_FEED_URL": "https://env-feed.com/rss"}) - def test_create_rss_provider_from_env(self): - """Test RSS provider uses env var when no URL is provided""" - provider = NewsFactory.create_provider("rss") - assert provider.feed_url == "https://env-feed.com/rss" +@pytest.mark.parametrize("source", + AVAILABLE_SOURCE) +def test_provider_newsnow(source:str): + db = NewsFileDatabase("news_db.txt") - def test_create_rss_provider_default_url(self): - """Test RSS provider uses default URL when no env var/URL provided""" - with patch.dict(os.environ, {}, clear=True): - provider = NewsFactory.create_provider("rss") - assert provider.feed_url == "https://plink.anyfeeder.com/chinadaily/caijing" + provider = NewsFactory.create_provider("newsnow", source=source) + aggregator = NewsAggregator([ provider], db) + aggregator.sync_news( + category="business", + max_hour_interval=64, + max_count=10, + process_content = True) - def test_create_unknown_provider(self): - """Test factory raises error for unknown provider types""" - with pytest.raises(ValueError) as excinfo: - NewsFactory.create_provider("unknown") - assert "Unknown provider type: unknown" in str(excinfo.value) + # Log results + all_news = db.get_all_news() + logger.info(f"Total articles in database: {len(all_news)}") - -# ------------------------------ News Provider Common Tests ------------------------------ -class TestNewsProvidersCommon: - """Parametrized tests for common provider functionality""" - - @pytest.fixture(params=[ - ("newsapi", NewsApiProvider, {"NEWSAPI_API_KEY": "test_key"}), - ("finnhub", FinnhubNewsProvider, {"FINNHUB_API_KEY": "test_key"}), - ("rss", RssProvider, {}) - ]) - def provider_setup(self, request): - """Fixture providing provider 
type, class, and required env vars""" - provider_type, provider_class, env_vars = request.param - with patch.dict(os.environ, env_vars): - if provider_type == "rss": - provider = NewsFactory.create_provider(provider_type, - feed_url="https://test.com/rss") - else: - provider = NewsFactory.create_provider(provider_type) - return provider_type, provider_class, provider - - def test_provider_base_class(self, provider_setup): - """Test all providers inherit from NewsProviderBase""" - _, _, provider = provider_setup - assert isinstance(provider, NewsProviderBase) - - def test_fetch_market_news_returns_list(self, provider_setup): - """Test market news fetch returns list (empty or with items)""" - _, _, provider = provider_setup - with patch("requests.get") as mock_get: - mock_response = Mock() - mock_response.status_code = 200 - - # Provider-specific mock responses - if provider_setup[0] == "newsapi": - mock_response.json.return_value = {"articles": []} - elif provider_setup[0] == "finnhub": - mock_response.json.return_value = [] - elif provider_setup[0] == "rss": - pass # Handled in RSS specific tests - - mock_get.return_value = mock_response - - result = provider.fetch_stock_news(ticker="AAPL") - assert isinstance(result, list) - - def test_fetch_stock_news_returns_list(self, provider_setup): - """Test stock news fetch returns list (empty or with items)""" - provider_type, _, provider = provider_setup - with patch("requests.get") as mock_get: - mock_response = Mock() - mock_response.status_code = 200 - - # Match mock responses to actual provider return formats - if provider_type == "newsapi": - # NewsAPI returns {"articles": [...]} - mock_response.json.return_value = {"articles": []} - elif provider_type == "finnhub": - # Finnhub returns list directly - mock_response.json.return_value = [] - elif provider_type == "rss": - # RSS uses feedparser, handled separately - pass - - mock_get.return_value = mock_response - - result = provider.fetch_stock_news(ticker="AAPL") - assert isinstance(result, list) - -# ------------------------------ Provider-Specific Tests ------------------------------ -class TestNewsApiProvider: - """NewsAPI-specific test cases""" - - @pytest.fixture - def newsapi_provider(self): - with patch.dict(os.environ, {"NEWSAPI_API_KEY": "test_key"}): - return NewsFactory.create_provider("newsapi") - - @patch("gentrade.news.newsapi.requests.get") - def test_fetch_market_news_params(self, mock_get, newsapi_provider): - """Test NewsAPI market news uses correct parameters""" - mock_get.return_value = Mock(status_code=200, json=lambda: {"articles": []}) - newsapi_provider.fetch_latest_market_news( - category="finance", - max_hour_interval=12, - max_count=5 - ) - - _, kwargs = mock_get.call_args - params = kwargs["params"] - assert params["q"] == "financial market OR stock market" - assert params["language"] == "en" - assert "from" in params - - -class TestRssProvider: - """RSS Provider-specific test cases""" - - @pytest.fixture - def rss_provider(self): - return NewsFactory.create_provider("rss", feed_url="https://test.com/rss") - - @patch("feedparser.parse") - def test_rss_feed_parsing(self, mock_parse): - # Calculate a timestamp within the default 24-hour window - recent_time = (datetime.now() - timedelta(hours=1)).isoformat() + "Z" # 1 hour ago - - # Mock a valid RSS feed response with recent timestamp - mock_parse.return_value = { - "entries": [ - { - "title": "Test Article", - "link": "https://example.com/news", - "published": recent_time, # Use time within 24 hours - "summary": 
"Test summary content", - "media_content": [{"url": "https://example.com/image.jpg"}] - } - ], - "feed": {"title": "Test Feed"} - } - - provider = RssProvider() - news = provider.fetch_latest_market_news(max_count=1) - assert len(news) == 1 - assert news[0].headline == "Test Article" - assert news[0].url == "https://example.com/news" - - -class TestFinnhubProviderAdditional: - """Additional Finnhub-specific tests""" - - @pytest.fixture - def finnhub_provider(self): - with patch.dict(os.environ, {"FINNHUB_API_KEY": "test_key"}): - return NewsFactory.create_provider("finnhub") - - @patch("gentrade.news.finnhub.requests.get") - def test_company_news_endpoint(self, mock_get, finnhub_provider): - """Test Finnhub uses correct endpoint for company news""" - mock_get.return_value = Mock(status_code=200, json=lambda: []) - finnhub_provider.fetch_stock_news(ticker="AAPL") - - args, _ = mock_get.call_args - assert "company-news" in args[0] - - -# ------------------------------ Error Handling Tests ------------------------------ -class TestProviderErrorHandling: - """Tests for provider error handling""" - - @pytest.fixture(params=["newsapi", "finnhub"]) - def api_provider(self, request): - """Fixture for API-based providers (non-RSS)""" - provider_type = request.param - env_vars = { - "newsapi": {"NEWSAPI_API_KEY": "test"}, - "finnhub": {"FINNHUB_API_KEY": "test"} - }[provider_type] - - with patch.dict(os.environ, env_vars): - return NewsFactory.create_provider(provider_type) - - @patch("requests.get") - def test_provider_handles_http_errors(self, mock_get, api_provider): - """Test providers return empty list on HTTP errors""" - mock_response = Mock() - mock_response.raise_for_status.side_effect = requests.exceptions.HTTPError("Forbidden") - mock_get.return_value = mock_response - - market_news = api_provider.fetch_latest_market_news() - stock_news = api_provider.fetch_stock_news(ticker="AAPL") - - assert market_news == [] - assert stock_news == [] - - @patch("requests.get") - def test_provider_handles_connection_errors(self, mock_get, api_provider): - """Test providers return empty list on connection errors""" - mock_get.side_effect = requests.exceptions.ConnectionError("Connection failed") - - market_news = api_provider.fetch_latest_market_news() - stock_news = api_provider.fetch_stock_news(ticker="AAPL") - - assert market_news == [] - assert stock_news == [] + for news_item in all_news: + logger.info("[%s...]: %s..." 
% (str(news_item.id)[:10], news_item.headline[:15])) diff --git a/tests/test_gentrade_search.py b/tests/test_gentrade_search.py deleted file mode 100644 index 0bd52cc..0000000 --- a/tests/test_gentrade_search.py +++ /dev/null @@ -1,191 +0,0 @@ -from unittest.mock import patch, Mock -import pytest -from gentrade.scraper.search import BaiduSearchScraper - - -@pytest.fixture -def scraper(): - """Fixture to provide a BaiduSearchScraper instance""" - return BaiduSearchScraper() - - -def test_initialization(scraper): - """Test scraper initialization sets up required components""" - assert scraper.base_url == "https://www.baidu.com/s" - assert len(scraper.user_agents) > 0 - assert hasattr(scraper, "storage") - assert hasattr(scraper, "content_extractor") - assert len(scraper.time_patterns) == 8 # Check all time patterns are loaded - - -# def test_get_random_headers(scraper): -# """Test header generation contains required fields""" -# headers = scraper._get_random_headers() -# assert "User-Agent" in headers -# assert headers["User-Agent"] in scraper.user_agents -# assert "Accept" in headers -# assert "Referer" in headers -# assert headers["Referer"] == "https://www.baidu.com/" - - -@patch("gentrade.scraper.search.requests.get") -def test_search_basic(mock_get, scraper): - """Test basic search functionality returns expected structure""" - # Mock successful response with sample search results - mock_response = Mock() - mock_response.status_code = 200 - mock_response.text = """ - -
- - 下一页 - - """ - mock_get.return_value = mock_response - - # Execute search - results = scraper.search(query="test", limit=2, fetch_content=False) - - # Verify results structure - assert len(results) == 2 - assert results[0]["title"] == "Test Title 1" - assert results[0]["url"] == "https://example.com/news1" - assert results[0]["summary"] == "Test summary 1" - assert results[0]["source"] == "Example Source" - assert results[0]["content"] == "" # fetch_content=False - - # Verify request parameters - mock_get.assert_called_once() - _, kwargs = mock_get.call_args - assert kwargs["params"]["wd"] == "test" - assert kwargs["params"]["pn"] == 0 # First page - - -@patch("gentrade.scraper.search.requests.get") -def test_search_with_limit(mock_get, scraper): - """Test search respects result limit""" - # Create mock response with 5 results - result_html = """ - - """ - mock_response = Mock() - mock_response.status_code = 200 - mock_response.text = f""" - - {''.join([result_html.replace('{{i}}', str(i)) for i in range(5)])} - 下一页 - - """ - mock_get.return_value = mock_response - - # Request limit=3 - results = scraper.search(query="limit test", limit=3) - assert len(results) == 3 - - -@patch("gentrade.scraper.search.requests.get") -def test_search_pagination(mock_get, scraper): - """Test search handles pagination correctly""" - # Create two page responses - page1_html = """ - -