From a99db241659f570b3f6a335cc0617153d581b073 Mon Sep 17 00:00:00 2001 From: Lu Ken Date: Tue, 9 Dec 2025 21:53:20 +0800 Subject: [PATCH 1/3] add newsnow provider Signed-off-by: Lu Ken --- src/gentrade/news/meta.py | 43 ++++++- src/gentrade/news/newsnow.py | 231 +++++++++++++++++++++++++++++++++++ 2 files changed, 273 insertions(+), 1 deletion(-) create mode 100644 src/gentrade/news/newsnow.py diff --git a/src/gentrade/news/meta.py b/src/gentrade/news/meta.py index 5393e0b..f6cb0bc 100644 --- a/src/gentrade/news/meta.py +++ b/src/gentrade/news/meta.py @@ -12,6 +12,8 @@ import abc import time import hashlib +import random + from typing import Dict, List, Any, Optional from datetime import datetime from dataclasses import dataclass @@ -19,7 +21,7 @@ import requests NEWS_MARKET = [ - 'us', 'zh', 'hk', 'cypto', 'common' + 'us', 'cn', 'hk', 'cypto', 'common' ] @dataclass @@ -129,6 +131,45 @@ def fetch_latest_market_news( """ raise NotImplementedError + @property + def proxies(self) -> Dict: + """Get proxies. The default implementation is retrieving from os.environ + + Returns: + Dict of proxies configurations + """ + ret_dict = {} + for key in ['http_proxy', 'https_proxy', 'no_proxy', \ + 'HTTP_PROXY', 'HTTPS_PROXY', 'NO_PROXY']: + if os.environ.get(key): + ret_dict[key] = os.environ[key] + return ret_dict + + @property + def http_headers(self) -> Dict: + """Get http headers. + + Returns: + Dict of http headers + """ + user_agents = [ + ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) " + "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"), + ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " + "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.5 Safari/605.1.15"), + ("Mozilla/5.0 (X11; Linux x86_64) " + "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36"), + ] + + return { + "User-Agent": random.choice(user_agents), + "Accept": ("text/html,application/xhtml+xml,application/xml;q=0.9," + "image/avif,image/webp,*/*;q=0.8"), + "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3", + "Connection": "keep-alive", + "Upgrade-Insecure-Requests": "1", + } + def fetch_stock_news( self, ticker: str, diff --git a/src/gentrade/news/newsnow.py b/src/gentrade/news/newsnow.py new file mode 100644 index 0000000..5fe4ec6 --- /dev/null +++ b/src/gentrade/news/newsnow.py @@ -0,0 +1,231 @@ +import json +import time +import random +from typing import List, Optional +from loguru import logger +import requests + +from gentrade.news.meta import NewsProviderBase, NewsInfo + +AVAILABLE_SOURCE_ID = [ + 'baidu', + 'bilibili', + 'cankaoxiaoxi', + 'chongbuluo', + 'douban', + 'douyin', + 'fastbull', + 'freebuf', + 'gelonghui', + 'ghxi', + 'github', + 'hackernews', + 'hupu', + 'ifeng', + 'ithome', + 'jin10', + 'juejin', + 'kaopu', + 'kuaishou', + 'linuxdo', + 'mktnews', + 'nowcoder', + 'pcbeta', + 'producthunt', + 'smzdm', + 'solidot', + 'sputniknewscn', + 'sspai', + 'steam', + 'tencent', + 'thepaper', + 'tieba', + 'toutiao', + 'v2ex', + 'wallstreetcn', + 'weibo', + 'xueqiu', + 'zaobao', + 'zhihu' +] + +class NewsNowProvider(NewsProviderBase): + """News provider for fetching news from NewsNow service. + + Inherits from NewsProviderBase and implements the required abstract methods + to fetch market news using the NewsNow API endpoint. + """ + + def __init__(self, source_id: str = "baidu"): + """Initialize NewsNowProvider with optional proxy and platform ID. + + Args: + proxy_url: Optional proxy URL for making requests. + source_id: Platform identifier used in the API request. 
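+
+        Example (illustrative sketch using only names defined in this module):
+
+            provider = NewsNowProvider(source_id="jin10")
+            news = provider.fetch_latest_market_news(max_count=5)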
+ """ + self.source_id = source_id + + @property + def market(self) -> str: + """Override market to specify the target market for this provider.""" + return "cn" # Default to US market, can be adjusted as needed + + def fetch_latest_market_news( + self, + category: str = "business", + max_hour_interval: int = 24, + max_count: int = 10 + ) -> List[NewsInfo]: + """Fetch latest market news from NewsNow service. + + Args: + category: News category filter (not used by NewsNow API). + max_hour_interval: Maximum age (in hours) of articles to include. + max_count: Maximum number of articles to return. + + Returns: + List of NewsInfo objects filtered by time and count constraints. + """ + # Fetch raw data from NewsNow API + raw_data = self._fetch_news_data() + if not raw_data: + return [] + + # Convert raw data to NewsInfo objects + news_list = self._parse_news(raw_data) + + # Filter news by time interval and count + return self.filter_news(news_list, max_hour_interval, max_count) + + def _fetch_news_data(self, max_retries: int = 2) -> Optional[dict]: + """Fetch raw news data from NewsNow API with retry mechanism. + + Args: + max_retries: Maximum number of retry attempts on failure. + + Returns: + Parsed JSON data if successful, None otherwise. + """ + url = f"https://newsnow.busiyi.world/api/s?id={self.source_id}&latest" + + retries = 0 + while retries <= max_retries: + try: + response = requests.get( + url, + proxies=self.proxies, + headers=self.http_headers, + timeout=10 + ) + response.raise_for_status() + return self._validate_response(response.text) + + except Exception as e: + retries += 1 + if retries <= max_retries: + wait_time = self._calculate_retry_wait(retries) + logger.error( + f"[{self.source_id}] Request failed (attempt {retries}/{max_retries}): {e}." + f"[{self.source_id}] Retrying in {wait_time:.2f}s..." + ) + time.sleep(wait_time) + else: + logger.error(f"Failed after {max_retries} retries: {e}") + return None + + return None + + @staticmethod + def _validate_response(response_text: str) -> Optional[dict]: + """Validate and parse API response. + + Args: + response_text: Raw text response from the API. + + Returns: + Parsed JSON data if valid, None otherwise. + """ + try: + data = json.loads(response_text) + status = data.get("status", "unknown") + if status not in ["success", "cache"]: + logger.error(f"Invalid response status: {status}") + return None + + status_info = "latest data" if status == "success" else "cached data" + logger.info(f"Successfully fetched {status_info}") + return data + + except json.JSONDecodeError as e: + logger.error(f"Failed to parse JSON response: {e}") + return None + + @staticmethod + def _calculate_retry_wait(retry_number: int, min_wait: int = 3, max_wait: int = 5) -> float: + """Calculate exponential backoff wait time for retries. + + Args: + retry_number: Current retry attempt number (starting at 1). + min_wait: Minimum base wait time in seconds. + max_wait: Maximum base wait time in seconds. + + Returns: + Calculated wait time in seconds. + """ + base_wait = random.uniform(min_wait, max_wait) + additional_wait = (retry_number - 1) * random.uniform(1, 2) + return base_wait + additional_wait + + def _parse_news(self, raw_data: dict) -> List[NewsInfo]: + """Parse raw API response into list of NewsInfo objects. + + Args: + raw_data: Parsed JSON data from the API. + + Returns: + List of NewsInfo objects created from the raw data. 
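+
+        Expected item shape (illustrative; only the keys read below matter):
+
+            {"title": "...", "url": "https://...", "mobileUrl": "https://...",
+             "pubTime": "...", "source": "...", "summary": "...", "content": "..."}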
+ """ + news_items = [] + for item in raw_data.get("items", []): + try: + # Extract required fields with fallbacks + url = item.get("url", "") or item.get("mobileUrl", "") + if not url: + logger.warning("Skipping item - no URL found") + continue + + # Convert publication time to epoch timestamp + pub_time = item.get("pubTime", "") + datetime_epoch = self._timestamp_to_epoch(pub_time) \ + if pub_time else int(time.time()) + + # Create NewsInfo object + news_info = NewsInfo( + category=item.get("category", "general"), + datetime=datetime_epoch, + headline=item.get("title", "No headline"), + id=self.url_to_hash_id(url), # Use URL hash as unique ID + image=item.get("image", ""), + related=item.get("related", []), + source=item.get("source", self.source_id), + summary=item.get("summary", ""), + url=url, + content=item.get("content", ""), + provider="newsnow", + market=self.market + ) + news_items.append(news_info) + + except Exception as e: + logger.error(f"Failed to parse news item: {e}") + continue + + return news_items + +if __name__ == "__main__": + logger.info("hello") + for source_id in AVAILABLE_SOURCE_ID: + inst = NewsNowProvider(source_id) + ret = inst.fetch_latest_market_news() + logger.info(ret) + time.sleep(5) From 0c8037dd47950a4d74abbd37fee5b3bbb5afa3f7 Mon Sep 17 00:00:00 2001 From: Lu Ken Date: Thu, 11 Dec 2025 15:40:51 +0800 Subject: [PATCH 2/3] clean up unused function Signed-off-by: Lu Ken --- src/gentrade/news/factory.py | 53 ++++-- src/gentrade/news/meta.py | 70 +------- src/gentrade/news/newsnow.py | 90 ++-------- src/gentrade/scraper/extractor.py | 2 +- src/gentrade/utils/__init__.py | 0 src/gentrade/utils/download.py | 177 +++++++++++++++++++ tests/test_gentrade_news.py | 282 +++--------------------------- 7 files changed, 260 insertions(+), 414 deletions(-) create mode 100644 src/gentrade/utils/__init__.py create mode 100644 src/gentrade/utils/download.py diff --git a/src/gentrade/news/factory.py b/src/gentrade/news/factory.py index 1dfa8eb..be10dd4 100644 --- a/src/gentrade/news/factory.py +++ b/src/gentrade/news/factory.py @@ -18,7 +18,7 @@ from gentrade.news.newsapi import NewsApiProvider from gentrade.news.rss import RssProvider from gentrade.news.finnhub import FinnhubNewsProvider - +from gentrade.news.newsnow import NewsNowProvider class NewsFactory: """Factory class for creating news provider instances based on provider type. @@ -47,7 +47,8 @@ def create_provider(provider_type: str, **kwargs) -> NewsProviderBase: providers = { "newsapi": NewsApiProvider, "finnhub": FinnhubNewsProvider, - "rss": RssProvider + "rss": RssProvider, + "newsnow": NewsNowProvider } provider_class = providers.get(provider_type_lower) @@ -70,6 +71,10 @@ def create_provider(provider_type: str, **kwargs) -> NewsProviderBase: feed_url = kwargs.get("feed_url", os.getenv("RSS_FEED_URL")) return provider_class(feed_url=feed_url) + if provider_type_lower == "newsnow": + source = kwargs.get("source", "baidu") + return provider_class(source=source) + return provider_class(**kwargs) class NewsAggregator: @@ -79,7 +84,7 @@ class NewsAggregator: and stores results in a database. Includes logic to avoid frequent syncs. """ - def __init__(self, providers: List[NewsProviderBase], db: NewsDatabase): + def __init__(self, providers: List[NewsProviderBase], db: NewsDatabase = None): """Initialize the NewsAggregator with a list of providers and a database. 
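+        The database is optional: when db is None, fetched articles are only
+        processed in memory and are not persisted.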
Args: @@ -91,7 +96,7 @@ def __init__(self, providers: List[NewsProviderBase], db: NewsDatabase): self.db_lock = threading.Lock() def _fetch_thread(self, provider, aggregator, ticker, category, - max_hour_interval, max_count, is_process=False): + max_hour_interval, max_count, process_content=True): if ticker: news = provider.fetch_stock_news( ticker, category, max_hour_interval, max_count @@ -112,18 +117,20 @@ def _fetch_thread(self, provider, aggregator, ticker, category, ace = ArticleContentExtractor.inst() for item in news: item.summary = ace.clean_html(item.summary) - if is_process: + if process_content: item.content = ace.extract_content(item.url) - with aggregator.db_lock: - aggregator.db.add_news(news) + if self.db: + with aggregator.db_lock: + aggregator.db.add_news(news) def sync_news( self, ticker: Optional[str] = None, category: str = "business", max_hour_interval: int = 24, - max_count: int = 10 + max_count: int = 10, + process_content: bool = True ) -> None: """Synchronize news from providers, skipping if last sync was within 1 hour. @@ -136,10 +143,11 @@ def sync_news( max_hour_interval: Maximum age (in hours) of news articles to fetch (default: 24). max_count: Maximum number of articles to fetch per provider (default: 10). """ - current_time = time.time() - if current_time < self.db.last_sync + 3600: - logger.info("Skipping sync: Last sync was less than 1 hour ago.") - return + if self.db: + current_time = time.time() + if current_time < self.db.last_sync + 3600: + logger.info("Skipping sync: Last sync was less than 1 hour ago.") + return logger.info("Starting news sync...") @@ -150,7 +158,8 @@ def sync_news( thread = threading.Thread( target=self._fetch_thread, - args=(provider, self, ticker, category, max_hour_interval, max_count) + args=(provider, self, ticker, category, max_hour_interval, + max_count, process_content) ) threads.append(thread) thread.start() @@ -158,8 +167,10 @@ def sync_news( for thread in threads: thread.join() - self.db.last_sync = current_time - self.db.save() + if self.db: + self.db.last_sync = current_time + self.db.save() + logger.info("News sync completed.") if __name__ == "__main__": @@ -170,18 +181,24 @@ def sync_news( newsapi_provider = NewsFactory.create_provider("newsapi") finnhub_provider = NewsFactory.create_provider("finnhub") rss_provider = NewsFactory.create_provider("rss") + newsnow_provider = NewsFactory.create_provider("newsnow", source="jin10") # Create aggregator with selected providers aggregator = NewsAggregator( - providers=[rss_provider, newsapi_provider, finnhub_provider], db=db) + providers=[newsnow_provider, ], db=db) # Sync market news and stock-specific news - aggregator.sync_news(category="business", max_hour_interval=64, max_count=10) + aggregator.sync_news( + category="business", + max_hour_interval=64, + max_count=10, + process_content = True) aggregator.sync_news( ticker="AAPL", category="business", max_hour_interval=240, - max_count=10 + max_count=10, + process_content = True ) # Log results diff --git a/src/gentrade/news/meta.py b/src/gentrade/news/meta.py index f6cb0bc..a4f238f 100644 --- a/src/gentrade/news/meta.py +++ b/src/gentrade/news/meta.py @@ -12,13 +12,11 @@ import abc import time import hashlib -import random -from typing import Dict, List, Any, Optional +from typing import Dict, List, Any from datetime import datetime from dataclasses import dataclass from loguru import logger -import requests NEWS_MARKET = [ 'us', 'cn', 'hk', 'cypto', 'common' @@ -61,27 +59,6 @@ def to_dict(self) -> Dict[str, Any]: 
"market": self.market, } - def fetch_article_html(self) -> Optional[str]: - """Fetch raw HTML content from the article's direct URL. - - Uses a browser-like user agent to avoid being blocked by servers. - - Returns: - Raw HTML string if successful; None if request fails. - """ - headers = { - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " - "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" - } - - try: - response = requests.get(self.url, headers=headers, timeout=15) - response.raise_for_status() - return response.text - except requests.RequestException as e: - logger.debug(f"Failed to fetch HTML for {self.url}: {e}") - return None - class NewsProviderBase(metaclass=abc.ABCMeta): """Abstract base class defining the interface for news providers. @@ -131,45 +108,6 @@ def fetch_latest_market_news( """ raise NotImplementedError - @property - def proxies(self) -> Dict: - """Get proxies. The default implementation is retrieving from os.environ - - Returns: - Dict of proxies configurations - """ - ret_dict = {} - for key in ['http_proxy', 'https_proxy', 'no_proxy', \ - 'HTTP_PROXY', 'HTTPS_PROXY', 'NO_PROXY']: - if os.environ.get(key): - ret_dict[key] = os.environ[key] - return ret_dict - - @property - def http_headers(self) -> Dict: - """Get http headers. - - Returns: - Dict of http headers - """ - user_agents = [ - ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) " - "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"), - ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " - "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.5 Safari/605.1.15"), - ("Mozilla/5.0 (X11; Linux x86_64) " - "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36"), - ] - - return { - "User-Agent": random.choice(user_agents), - "Accept": ("text/html,application/xhtml+xml,application/xml;q=0.9," - "image/avif,image/webp,*/*;q=0.8"), - "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3", - "Connection": "keep-alive", - "Upgrade-Insecure-Requests": "1", - } - def fetch_stock_news( self, ticker: str, @@ -319,11 +257,11 @@ def save(self): "last_sync": self.last_sync, "news_list": news_dicts } - with open(self._filepath, 'w', encoding='utf-8') as f: - json.dump(content, f, indent=4) # indent for readability + with open(self._filepath, 'w', encoding="utf-8") as f: + json.dump(content, f, ensure_ascii=False, indent=4) # indent for readability def load(self): - with open(self._filepath, 'r', encoding='utf-8') as f: + with open(self._filepath, 'r', encoding="utf-8") as f: content = json.load(f) # Directly loads JSON content into a Python list/dict self.last_sync = content['last_sync'] self.news_list = [NewsInfo(**item_dict) for item_dict in content['news_list']] diff --git a/src/gentrade/news/newsnow.py b/src/gentrade/news/newsnow.py index 5fe4ec6..91aba49 100644 --- a/src/gentrade/news/newsnow.py +++ b/src/gentrade/news/newsnow.py @@ -1,13 +1,12 @@ -import json import time import random -from typing import List, Optional +from typing import List from loguru import logger -import requests from gentrade.news.meta import NewsProviderBase, NewsInfo +from gentrade.utils.download import HttpDownloader -AVAILABLE_SOURCE_ID = [ +AVAILABLE_SOURCE = [ 'baidu', 'bilibili', 'cankaoxiaoxi', @@ -56,14 +55,15 @@ class NewsNowProvider(NewsProviderBase): to fetch market news using the NewsNow API endpoint. """ - def __init__(self, source_id: str = "baidu"): + def __init__(self, source: str = "baidu"): """Initialize NewsNowProvider with optional proxy and platform ID. 
Args: proxy_url: Optional proxy URL for making requests. - source_id: Platform identifier used in the API request. + source: Platform identifier used in the API request. """ - self.source_id = source_id + self.source = source + self.url = f"https://newsnow.busiyi.world/api/s?id={self.source}&latest" @property def market(self) -> str: @@ -87,78 +87,16 @@ def fetch_latest_market_news( List of NewsInfo objects filtered by time and count constraints. """ # Fetch raw data from NewsNow API - raw_data = self._fetch_news_data() - if not raw_data: + response = HttpDownloader.inst().get(self.url) + if not response: return [] # Convert raw data to NewsInfo objects - news_list = self._parse_news(raw_data) + news_list = self._parse_news(response) # Filter news by time interval and count return self.filter_news(news_list, max_hour_interval, max_count) - def _fetch_news_data(self, max_retries: int = 2) -> Optional[dict]: - """Fetch raw news data from NewsNow API with retry mechanism. - - Args: - max_retries: Maximum number of retry attempts on failure. - - Returns: - Parsed JSON data if successful, None otherwise. - """ - url = f"https://newsnow.busiyi.world/api/s?id={self.source_id}&latest" - - retries = 0 - while retries <= max_retries: - try: - response = requests.get( - url, - proxies=self.proxies, - headers=self.http_headers, - timeout=10 - ) - response.raise_for_status() - return self._validate_response(response.text) - - except Exception as e: - retries += 1 - if retries <= max_retries: - wait_time = self._calculate_retry_wait(retries) - logger.error( - f"[{self.source_id}] Request failed (attempt {retries}/{max_retries}): {e}." - f"[{self.source_id}] Retrying in {wait_time:.2f}s..." - ) - time.sleep(wait_time) - else: - logger.error(f"Failed after {max_retries} retries: {e}") - return None - - return None - - @staticmethod - def _validate_response(response_text: str) -> Optional[dict]: - """Validate and parse API response. - - Args: - response_text: Raw text response from the API. - - Returns: - Parsed JSON data if valid, None otherwise. 
- """ - try: - data = json.loads(response_text) - status = data.get("status", "unknown") - if status not in ["success", "cache"]: - logger.error(f"Invalid response status: {status}") - return None - - status_info = "latest data" if status == "success" else "cached data" - logger.info(f"Successfully fetched {status_info}") - return data - - except json.JSONDecodeError as e: - logger.error(f"Failed to parse JSON response: {e}") - return None @staticmethod def _calculate_retry_wait(retry_number: int, min_wait: int = 3, max_wait: int = 5) -> float: @@ -207,7 +145,7 @@ def _parse_news(self, raw_data: dict) -> List[NewsInfo]: id=self.url_to_hash_id(url), # Use URL hash as unique ID image=item.get("image", ""), related=item.get("related", []), - source=item.get("source", self.source_id), + source=item.get("source", self.source), summary=item.get("summary", ""), url=url, content=item.get("content", ""), @@ -224,8 +162,8 @@ def _parse_news(self, raw_data: dict) -> List[NewsInfo]: if __name__ == "__main__": logger.info("hello") - for source_id in AVAILABLE_SOURCE_ID: - inst = NewsNowProvider(source_id) + for source in AVAILABLE_SOURCE: + inst = NewsNowProvider(source) ret = inst.fetch_latest_market_news() logger.info(ret) - time.sleep(5) + time.sleep(1) diff --git a/src/gentrade/scraper/extractor.py b/src/gentrade/scraper/extractor.py index 12a91b8..d77898f 100644 --- a/src/gentrade/scraper/extractor.py +++ b/src/gentrade/scraper/extractor.py @@ -226,7 +226,7 @@ def extract_content(self, url: str) -> str: return "Unsupported file type (non-HTML)" try: - article = Article(url) + article = Article(url, language='zh') article.download() article.parse() if article.text: diff --git a/src/gentrade/utils/__init__.py b/src/gentrade/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/gentrade/utils/download.py b/src/gentrade/utils/download.py new file mode 100644 index 0000000..6867177 --- /dev/null +++ b/src/gentrade/utils/download.py @@ -0,0 +1,177 @@ +""" +HTTP Downloader Utility + +Description: + A robust HTTP downloader implementation with singleton pattern, automatic retry mechanism, + randomized User-Agent spoofing, and proxy support from environment variables. 
+ + Key Features: + - Singleton pattern for consistent configuration across application + - Configurable retry attempts and request timeout + - Random User-Agent rotation to mimic different browsers + - Proxy configuration loaded from standard environment variables + - Comprehensive error logging with backoff timing + - SSL certificate verification for secure requests + +Usage Example: + >>> downloader = HttpDownloader.inst() + >>> content = downloader.get("https://example.com") + >>> if content: + >>> print("Content downloaded successfully") + +Dependencies: + - requests >= 2.25.1 + - loguru >= 0.7.0 + - Python >= 3.8 +""" + +import os +import random +import time +from typing import Dict +import requests +from loguru import logger + +class HttpDownloader: + """HTTP Downloader with retry mechanism, random User-Agent, and proxy support + + Implements singleton pattern for consistent HTTP GET requests with: + - Automatic retry on failure (configurable max retries) + - Randomized User-Agent to mimic different browsers + - Proxy configuration loaded from environment variables + - Timeout control for request safety + """ + # Singleton instance storage + _INSTANCE = None + + def __init__(self, max_retries: int = 3, timeout: int = 10): + """Initialize downloader configuration + + Args: + max_retries: Maximum retry attempts on failure (default: 3) + timeout: Request timeout in seconds (default: 10) + """ + self.max_retries = max_retries # Max retry attempts for failed requests + self.timeout = timeout # Request timeout threshold (seconds) + + @property + def http_headers(self) -> Dict: + """Generate randomized HTTP request headers + + Returns: + Dictionary of HTTP headers with random User-Agent + """ + # List of common browser User-Agents for request spoofing + user_agents = [ + ( + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " + "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36" + ), + ( + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " + "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.5 Safari/605.1.15" + ), + ( + "Mozilla/5.0 (X11; Linux x86_64) " + "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36" + ), + ] + + # Construct headers with random User-Agent selection + return { + "User-Agent": random.choice(user_agents), + "Accept": ( + "text/html,application/xhtml+xml,application/xml;q=0.9," + "image/avif,image/webp,*/*;q=0.8" + ), + "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3", + "Connection": "keep-alive", + "Upgrade-Insecure-Requests": "1", + } + + @property + def proxies(self) -> Dict: + """Load proxy configuration from environment variables + + Supported environment variables (case-insensitive): + - http_proxy / HTTP_PROXY + - https_proxy / HTTPS_PROXY + - no_proxy / NO_PROXY + + Returns: + Dictionary of proxy configurations (empty if no proxies set) + """ + proxy_config = {} + # Check all standard proxy environment variables + proxy_env_keys = [ + 'http_proxy', 'https_proxy', 'no_proxy', + 'HTTP_PROXY', 'HTTPS_PROXY', 'NO_PROXY' + ] + + for key in proxy_env_keys: + env_value = os.environ.get(key) + if env_value: + proxy_config[key] = env_value + + return proxy_config + + def get(self, url: str, params: Dict = None) -> Dict: + """Send HTTP GET request with automatic retry mechanism + + Args: + url: Target URL to retrieve content from + + Returns: + Response text if successful, None if all retries fail + """ + retry_count = 0 # Current retry attempt counter + + logger.debug(f"Http download {url} ...") + # Retry loop until max 
retries or successful response + while retry_count <= self.max_retries: + try: + # Send GET request with configured headers/proxies/timeout + response = requests.get( + url, + proxies=self.proxies, + headers=self.http_headers, + timeout=self.timeout, + params=params, + verify=True # Enable SSL certificate verification + ) + # Raise exception for HTTP error status codes (4xx/5xx) + response.raise_for_status() + return response.json() # Return successful response content + + except Exception as e: + retry_count += 1 # Increment retry counter on failure + + # Final retry failed - log error and return None + if retry_count > self.max_retries: + logger.error( + f"Failed to download URL after {self.max_retries} retries: {e} | URL: {url}" + ) + return None + + # Calculate random backoff time (0.5-2.0s) to avoid rate limiting + backoff_time = random.uniform(0.5, 2.0) + logger.error( + f"Request failed (attempt {retry_count}/{self.max_retries}): {e}. " + f"Retrying in {backoff_time:.2f} seconds..." + ) + time.sleep(backoff_time) # Wait before next retry attempt + + return None + + @staticmethod + def inst() -> "HttpDownloader": + """Get singleton instance of HttpDownloader + + Implements lazy initialization - creates instance only on first call + + Returns: + Singleton HttpDownloader instance + """ + if HttpDownloader._INSTANCE is None: + HttpDownloader._INSTANCE = HttpDownloader() + return HttpDownloader._INSTANCE diff --git a/tests/test_gentrade_news.py b/tests/test_gentrade_news.py index 0fd2859..ae777dd 100644 --- a/tests/test_gentrade_news.py +++ b/tests/test_gentrade_news.py @@ -1,254 +1,30 @@ -import os -from datetime import datetime, timedelta -from unittest.mock import patch, Mock import pytest -import requests -from gentrade.news.factory import NewsFactory -from gentrade.news.meta import NewsProviderBase -from gentrade.news.newsapi import NewsApiProvider -from gentrade.news.finnhub import FinnhubNewsProvider -from gentrade.news.rss import RssProvider - - -# ------------------------------ NewsFactory Tests ------------------------------ -class TestNewsFactory: - """Tests for NewsFactory provider creation logic""" - - @patch.dict(os.environ, {"NEWSAPI_API_KEY": "test_newsapi_key"}) - def test_create_newsapi_provider(self): - """Test NewsAPI provider creation with valid env var""" - provider = NewsFactory.create_provider("newsapi") - assert isinstance(provider, NewsApiProvider) - assert provider.api_key == "test_newsapi_key" - - def test_create_newsapi_missing_key(self): - """Test NewsAPI creation fails with missing API key""" - with patch.dict(os.environ, {}, clear=True): - with pytest.raises(ValueError) as excinfo: - NewsFactory.create_provider("newsapi") - assert "NEWSAPI_API_KEY" in str(excinfo.value) - - @patch.dict(os.environ, {"FINNHUB_API_KEY": "test_finnhub_key"}) - def test_create_finnhub_provider(self): - """Test Finnhub provider creation with valid env var""" - provider = NewsFactory.create_provider("finnhub") - assert isinstance(provider, FinnhubNewsProvider) - assert provider.api_key == "test_finnhub_key" - - def test_create_finnhub_missing_key(self): - """Test Finnhub creation fails with missing API key""" - with patch.dict(os.environ, {}, clear=True): - with pytest.raises(ValueError) as excinfo: - NewsFactory.create_provider("finnhub") - assert "FINNHUB_API_KEY" in str(excinfo.value) - - def test_create_rss_provider_with_feed_url(self): - """Test RSS provider creation with explicit feed URL""" - feed_url = "https://test-feed.com/rss" - provider = 
NewsFactory.create_provider("rss", feed_url=feed_url) - assert isinstance(provider, RssProvider) - assert provider.feed_url == feed_url - - @patch.dict(os.environ, {"RSS_FEED_URL": "https://env-feed.com/rss"}) - def test_create_rss_provider_from_env(self): - """Test RSS provider uses env var when no URL is provided""" - provider = NewsFactory.create_provider("rss") - assert provider.feed_url == "https://env-feed.com/rss" - - def test_create_rss_provider_default_url(self): - """Test RSS provider uses default URL when no env var/URL provided""" - with patch.dict(os.environ, {}, clear=True): - provider = NewsFactory.create_provider("rss") - assert provider.feed_url == "https://plink.anyfeeder.com/chinadaily/caijing" - - def test_create_unknown_provider(self): - """Test factory raises error for unknown provider types""" - with pytest.raises(ValueError) as excinfo: - NewsFactory.create_provider("unknown") - assert "Unknown provider type: unknown" in str(excinfo.value) - - -# ------------------------------ News Provider Common Tests ------------------------------ -class TestNewsProvidersCommon: - """Parametrized tests for common provider functionality""" - - @pytest.fixture(params=[ - ("newsapi", NewsApiProvider, {"NEWSAPI_API_KEY": "test_key"}), - ("finnhub", FinnhubNewsProvider, {"FINNHUB_API_KEY": "test_key"}), - ("rss", RssProvider, {}) - ]) - def provider_setup(self, request): - """Fixture providing provider type, class, and required env vars""" - provider_type, provider_class, env_vars = request.param - with patch.dict(os.environ, env_vars): - if provider_type == "rss": - provider = NewsFactory.create_provider(provider_type, - feed_url="https://test.com/rss") - else: - provider = NewsFactory.create_provider(provider_type) - return provider_type, provider_class, provider - - def test_provider_base_class(self, provider_setup): - """Test all providers inherit from NewsProviderBase""" - _, _, provider = provider_setup - assert isinstance(provider, NewsProviderBase) - - def test_fetch_market_news_returns_list(self, provider_setup): - """Test market news fetch returns list (empty or with items)""" - _, _, provider = provider_setup - with patch("requests.get") as mock_get: - mock_response = Mock() - mock_response.status_code = 200 - - # Provider-specific mock responses - if provider_setup[0] == "newsapi": - mock_response.json.return_value = {"articles": []} - elif provider_setup[0] == "finnhub": - mock_response.json.return_value = [] - elif provider_setup[0] == "rss": - pass # Handled in RSS specific tests - - mock_get.return_value = mock_response - - result = provider.fetch_stock_news(ticker="AAPL") - assert isinstance(result, list) - - def test_fetch_stock_news_returns_list(self, provider_setup): - """Test stock news fetch returns list (empty or with items)""" - provider_type, _, provider = provider_setup - with patch("requests.get") as mock_get: - mock_response = Mock() - mock_response.status_code = 200 - - # Match mock responses to actual provider return formats - if provider_type == "newsapi": - # NewsAPI returns {"articles": [...]} - mock_response.json.return_value = {"articles": []} - elif provider_type == "finnhub": - # Finnhub returns list directly - mock_response.json.return_value = [] - elif provider_type == "rss": - # RSS uses feedparser, handled separately - pass - - mock_get.return_value = mock_response - - result = provider.fetch_stock_news(ticker="AAPL") - assert isinstance(result, list) - -# ------------------------------ Provider-Specific Tests ------------------------------ 
-class TestNewsApiProvider: - """NewsAPI-specific test cases""" - - @pytest.fixture - def newsapi_provider(self): - with patch.dict(os.environ, {"NEWSAPI_API_KEY": "test_key"}): - return NewsFactory.create_provider("newsapi") - - @patch("gentrade.news.newsapi.requests.get") - def test_fetch_market_news_params(self, mock_get, newsapi_provider): - """Test NewsAPI market news uses correct parameters""" - mock_get.return_value = Mock(status_code=200, json=lambda: {"articles": []}) - newsapi_provider.fetch_latest_market_news( - category="finance", - max_hour_interval=12, - max_count=5 - ) - - _, kwargs = mock_get.call_args - params = kwargs["params"] - assert params["q"] == "financial market OR stock market" - assert params["language"] == "en" - assert "from" in params - - -class TestRssProvider: - """RSS Provider-specific test cases""" - - @pytest.fixture - def rss_provider(self): - return NewsFactory.create_provider("rss", feed_url="https://test.com/rss") - - @patch("feedparser.parse") - def test_rss_feed_parsing(self, mock_parse): - # Calculate a timestamp within the default 24-hour window - recent_time = (datetime.now() - timedelta(hours=1)).isoformat() + "Z" # 1 hour ago - - # Mock a valid RSS feed response with recent timestamp - mock_parse.return_value = { - "entries": [ - { - "title": "Test Article", - "link": "https://example.com/news", - "published": recent_time, # Use time within 24 hours - "summary": "Test summary content", - "media_content": [{"url": "https://example.com/image.jpg"}] - } - ], - "feed": {"title": "Test Feed"} - } - - provider = RssProvider() - news = provider.fetch_latest_market_news(max_count=1) - assert len(news) == 1 - assert news[0].headline == "Test Article" - assert news[0].url == "https://example.com/news" - - -class TestFinnhubProviderAdditional: - """Additional Finnhub-specific tests""" - - @pytest.fixture - def finnhub_provider(self): - with patch.dict(os.environ, {"FINNHUB_API_KEY": "test_key"}): - return NewsFactory.create_provider("finnhub") - - @patch("gentrade.news.finnhub.requests.get") - def test_company_news_endpoint(self, mock_get, finnhub_provider): - """Test Finnhub uses correct endpoint for company news""" - mock_get.return_value = Mock(status_code=200, json=lambda: []) - finnhub_provider.fetch_stock_news(ticker="AAPL") - - args, _ = mock_get.call_args - assert "company-news" in args[0] - - -# ------------------------------ Error Handling Tests ------------------------------ -class TestProviderErrorHandling: - """Tests for provider error handling""" - - @pytest.fixture(params=["newsapi", "finnhub"]) - def api_provider(self, request): - """Fixture for API-based providers (non-RSS)""" - provider_type = request.param - env_vars = { - "newsapi": {"NEWSAPI_API_KEY": "test"}, - "finnhub": {"FINNHUB_API_KEY": "test"} - }[provider_type] - - with patch.dict(os.environ, env_vars): - return NewsFactory.create_provider(provider_type) - - @patch("requests.get") - def test_provider_handles_http_errors(self, mock_get, api_provider): - """Test providers return empty list on HTTP errors""" - mock_response = Mock() - mock_response.raise_for_status.side_effect = requests.exceptions.HTTPError("Forbidden") - mock_get.return_value = mock_response - - market_news = api_provider.fetch_latest_market_news() - stock_news = api_provider.fetch_stock_news(ticker="AAPL") - - assert market_news == [] - assert stock_news == [] - - @patch("requests.get") - def test_provider_handles_connection_errors(self, mock_get, api_provider): - """Test providers return empty list on 
connection errors""" - mock_get.side_effect = requests.exceptions.ConnectionError("Connection failed") - - market_news = api_provider.fetch_latest_market_news() - stock_news = api_provider.fetch_stock_news(ticker="AAPL") - - assert market_news == [] - assert stock_news == [] +from loguru import logger + +from gentrade.news.factory import NewsAggregator, NewsFactory + +from gentrade.news.meta import NewsFileDatabase +# from gentrade.news.newsapi import NewsApiProvider +# from gentrade.news.rss import RssProvider +# from gentrade.news.finnhub import FinnhubNewsProvider +# from gentrade.news.newsnow import NewsNowProvider + +@pytest.mark.parametrize("provider_name", + [ "newsapi", "finnhub", "rss", "newsnow"]) +def test_provider_basic(provider_name:str): + db = NewsFileDatabase("news_db.txt") + + provider = NewsFactory.create_provider(provider_name) + aggregator = NewsAggregator([ provider], db) + aggregator.sync_news( + category="business", + max_hour_interval=64, + max_count=10, + process_content = True) + + # Log results + all_news = db.get_all_news() + logger.info(f"Total articles in database: {len(all_news)}") + + for news_item in all_news: + logger.info("[%s...]: %s..." % (str(news_item.id)[:10], news_item.headline[:15])) From c5e4bac5f2ab05f730d3168718a34abcc560440b Mon Sep 17 00:00:00 2001 From: Lu Ken Date: Wed, 17 Dec 2025 15:50:46 +0800 Subject: [PATCH 3/3] refactory Signed-off-by: Lu Ken --- src/gentrade/news/factory.py | 23 +- src/gentrade/news/meta.py | 2 +- src/gentrade/news/newsnow.py | 169 ----------- src/gentrade/news/providers/__init__.py | 0 src/gentrade/news/{ => providers}/finnhub.py | 0 src/gentrade/news/{ => providers}/newsapi.py | 0 src/gentrade/news/providers/newsnow.py | 179 ++++++++++++ src/gentrade/news/{ => providers}/rss.py | 0 src/gentrade/scraper/extractor.py | 292 ------------------- src/gentrade/utils/download.py | 231 ++++++++++++++- src/gentrade/{scraper => utils}/search.py | 81 +++-- tests/test_gentrade_news.py | 26 +- tests/test_gentrade_search.py | 191 ------------ 13 files changed, 474 insertions(+), 720 deletions(-) delete mode 100644 src/gentrade/news/newsnow.py create mode 100644 src/gentrade/news/providers/__init__.py rename src/gentrade/news/{ => providers}/finnhub.py (100%) rename src/gentrade/news/{ => providers}/newsapi.py (100%) create mode 100644 src/gentrade/news/providers/newsnow.py rename src/gentrade/news/{ => providers}/rss.py (100%) delete mode 100644 src/gentrade/scraper/extractor.py rename src/gentrade/{scraper => utils}/search.py (91%) delete mode 100644 tests/test_gentrade_search.py diff --git a/src/gentrade/news/factory.py b/src/gentrade/news/factory.py index be10dd4..9aa9fd5 100644 --- a/src/gentrade/news/factory.py +++ b/src/gentrade/news/factory.py @@ -12,13 +12,12 @@ from typing import List, Optional from loguru import logger -from gentrade.scraper.extractor import ArticleContentExtractor - from gentrade.news.meta import NewsProviderBase, NewsDatabase, NewsFileDatabase -from gentrade.news.newsapi import NewsApiProvider -from gentrade.news.rss import RssProvider -from gentrade.news.finnhub import FinnhubNewsProvider -from gentrade.news.newsnow import NewsNowProvider +from gentrade.news.providers.newsapi import NewsApiProvider +from gentrade.news.providers.rss import RssProvider +from gentrade.news.providers.finnhub import FinnhubNewsProvider +from gentrade.news.providers.newsnow import NewsNowProvider +from gentrade.utils.download import ArticleDownloader class NewsFactory: """Factory class for creating news provider instances 
based on provider type. @@ -114,11 +113,14 @@ def _fetch_thread(self, provider, aggregator, ticker, category, f"{provider.__class__.__name__}" ) - ace = ArticleContentExtractor.inst() + downloader = ArticleDownloader.inst() for item in news: - item.summary = ace.clean_html(item.summary) + item.summary = downloader.clean_html(item.summary) if process_content: - item.content = ace.extract_content(item.url) + logger.info(f"Process content ... {item.url}") + item.content = downloader.get_content(item.url) + if item.content: + logger.info(f"Content: {item.content[:20]}") if self.db: with aggregator.db_lock: @@ -154,6 +156,7 @@ def sync_news( threads = [] for provider in self.providers: if not provider.is_available: + logger.error(f"Provider {provider.__class__.__name__} is not available") continue thread = threading.Thread( @@ -185,7 +188,7 @@ def sync_news( # Create aggregator with selected providers aggregator = NewsAggregator( - providers=[newsnow_provider, ], db=db) + providers=[newsnow_provider, finnhub_provider, rss_provider, newsapi_provider], db=db) # Sync market news and stock-specific news aggregator.sync_news( diff --git a/src/gentrade/news/meta.py b/src/gentrade/news/meta.py index a4f238f..1802307 100644 --- a/src/gentrade/news/meta.py +++ b/src/gentrade/news/meta.py @@ -35,7 +35,7 @@ class NewsInfo: summary: str url: str content: str - provider: str # provder like newsapi, finnhub, rss + provider: str # provider like newsapi, finnhub, rss market: str # market type like us, chn, eur, hk, crypto def to_dict(self) -> Dict[str, Any]: diff --git a/src/gentrade/news/newsnow.py b/src/gentrade/news/newsnow.py deleted file mode 100644 index 91aba49..0000000 --- a/src/gentrade/news/newsnow.py +++ /dev/null @@ -1,169 +0,0 @@ -import time -import random -from typing import List -from loguru import logger - -from gentrade.news.meta import NewsProviderBase, NewsInfo -from gentrade.utils.download import HttpDownloader - -AVAILABLE_SOURCE = [ - 'baidu', - 'bilibili', - 'cankaoxiaoxi', - 'chongbuluo', - 'douban', - 'douyin', - 'fastbull', - 'freebuf', - 'gelonghui', - 'ghxi', - 'github', - 'hackernews', - 'hupu', - 'ifeng', - 'ithome', - 'jin10', - 'juejin', - 'kaopu', - 'kuaishou', - 'linuxdo', - 'mktnews', - 'nowcoder', - 'pcbeta', - 'producthunt', - 'smzdm', - 'solidot', - 'sputniknewscn', - 'sspai', - 'steam', - 'tencent', - 'thepaper', - 'tieba', - 'toutiao', - 'v2ex', - 'wallstreetcn', - 'weibo', - 'xueqiu', - 'zaobao', - 'zhihu' -] - -class NewsNowProvider(NewsProviderBase): - """News provider for fetching news from NewsNow service. - - Inherits from NewsProviderBase and implements the required abstract methods - to fetch market news using the NewsNow API endpoint. - """ - - def __init__(self, source: str = "baidu"): - """Initialize NewsNowProvider with optional proxy and platform ID. - - Args: - proxy_url: Optional proxy URL for making requests. - source: Platform identifier used in the API request. - """ - self.source = source - self.url = f"https://newsnow.busiyi.world/api/s?id={self.source}&latest" - - @property - def market(self) -> str: - """Override market to specify the target market for this provider.""" - return "cn" # Default to US market, can be adjusted as needed - - def fetch_latest_market_news( - self, - category: str = "business", - max_hour_interval: int = 24, - max_count: int = 10 - ) -> List[NewsInfo]: - """Fetch latest market news from NewsNow service. - - Args: - category: News category filter (not used by NewsNow API). 
- max_hour_interval: Maximum age (in hours) of articles to include. - max_count: Maximum number of articles to return. - - Returns: - List of NewsInfo objects filtered by time and count constraints. - """ - # Fetch raw data from NewsNow API - response = HttpDownloader.inst().get(self.url) - if not response: - return [] - - # Convert raw data to NewsInfo objects - news_list = self._parse_news(response) - - # Filter news by time interval and count - return self.filter_news(news_list, max_hour_interval, max_count) - - - @staticmethod - def _calculate_retry_wait(retry_number: int, min_wait: int = 3, max_wait: int = 5) -> float: - """Calculate exponential backoff wait time for retries. - - Args: - retry_number: Current retry attempt number (starting at 1). - min_wait: Minimum base wait time in seconds. - max_wait: Maximum base wait time in seconds. - - Returns: - Calculated wait time in seconds. - """ - base_wait = random.uniform(min_wait, max_wait) - additional_wait = (retry_number - 1) * random.uniform(1, 2) - return base_wait + additional_wait - - def _parse_news(self, raw_data: dict) -> List[NewsInfo]: - """Parse raw API response into list of NewsInfo objects. - - Args: - raw_data: Parsed JSON data from the API. - - Returns: - List of NewsInfo objects created from the raw data. - """ - news_items = [] - for item in raw_data.get("items", []): - try: - # Extract required fields with fallbacks - url = item.get("url", "") or item.get("mobileUrl", "") - if not url: - logger.warning("Skipping item - no URL found") - continue - - # Convert publication time to epoch timestamp - pub_time = item.get("pubTime", "") - datetime_epoch = self._timestamp_to_epoch(pub_time) \ - if pub_time else int(time.time()) - - # Create NewsInfo object - news_info = NewsInfo( - category=item.get("category", "general"), - datetime=datetime_epoch, - headline=item.get("title", "No headline"), - id=self.url_to_hash_id(url), # Use URL hash as unique ID - image=item.get("image", ""), - related=item.get("related", []), - source=item.get("source", self.source), - summary=item.get("summary", ""), - url=url, - content=item.get("content", ""), - provider="newsnow", - market=self.market - ) - news_items.append(news_info) - - except Exception as e: - logger.error(f"Failed to parse news item: {e}") - continue - - return news_items - -if __name__ == "__main__": - logger.info("hello") - for source in AVAILABLE_SOURCE: - inst = NewsNowProvider(source) - ret = inst.fetch_latest_market_news() - logger.info(ret) - time.sleep(1) diff --git a/src/gentrade/news/providers/__init__.py b/src/gentrade/news/providers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/gentrade/news/finnhub.py b/src/gentrade/news/providers/finnhub.py similarity index 100% rename from src/gentrade/news/finnhub.py rename to src/gentrade/news/providers/finnhub.py diff --git a/src/gentrade/news/newsapi.py b/src/gentrade/news/providers/newsapi.py similarity index 100% rename from src/gentrade/news/newsapi.py rename to src/gentrade/news/providers/newsapi.py diff --git a/src/gentrade/news/providers/newsnow.py b/src/gentrade/news/providers/newsnow.py new file mode 100644 index 0000000..52e0d74 --- /dev/null +++ b/src/gentrade/news/providers/newsnow.py @@ -0,0 +1,179 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +gentrade - NewsNow News Provider Module + +Project: gentrade +Module: news.providers.newsnow +Description: + Implementation of the NewsNow news provider for fetching real-time market news + from the NewsNow API endpoint. 
This module inherits from NewsProviderBase and + implements core methods for news retrieval, parsing, and filtering across + multiple supported sources (e.g., baidu, zhihu, weibo). + +Key Features: + - Source-specific news fetching from 38+ supported platforms + - Automatic news parsing into standardized NewsInfo objects + - Time-based and count-based news filtering + - Jittered exponential backoff for retry logic + - Robust error handling and logging + - Compatibility with China (cn) market news by default +""" + +import time +import random +from typing import List +from loguru import logger + +from gentrade.news.meta import NewsProviderBase, NewsInfo +from gentrade.utils.download import HttpDownloader + +# Supported news sources for NewsNow provider (38+ platforms) +AVAILABLE_SOURCE = [ + 'baidu', 'bilibili', 'cankaoxiaoxi', 'chongbuluo', 'douban', 'douyin', + 'fastbull', 'freebuf', 'gelonghui', 'ghxi', 'github', 'hackernews', + 'hupu', 'ifeng', 'ithome', 'jin10', 'juejin', 'kaopu', 'kuaishou', + 'linuxdo', 'mktnews', 'nowcoder', 'pcbeta', 'producthunt', 'smzdm', + 'solidot', 'sputniknewscn', 'sspai', 'steam', 'tencent', 'thepaper', + 'tieba', 'toutiao', 'v2ex', 'wallstreetcn', 'weibo', 'xueqiu', 'zaobao', + 'zhihu' +] + + +class NewsNowProvider(NewsProviderBase): + """News provider for fetching real-time market news from NewsNow service. + + Inherits from NewsProviderBase and implements abstract methods to fetch + categorized market news using the NewsNow API endpoint with source-specific + configurations. + """ + + def __init__(self, source: str = "baidu"): + """Initialize NewsNowProvider with specified news source. + + Args: + source: Platform identifier (from AVAILABLE_SOURCE) used in API request + """ + self.source = source + self.url = f"https://newsnow.busiyi.world/api/s?id={self.source}&latest" + + @property + def market(self) -> str: + """Override market property to specify target market (China).""" + return "cn" # Target market: China (adjustable for other regions) + + def fetch_latest_market_news( + self, + category: str = "business", + max_hour_interval: int = 24, + max_count: int = 10 + ) -> List[NewsInfo]: + """Fetch and filter latest market news from NewsNow service. + + Args: + category: News category filter (unused by NewsNow API, kept for compat) + max_hour_interval: Max age (hours) of articles to include + max_count: Maximum number of articles to return + + Returns: + List of NewsInfo objects filtered by time and count constraints + """ + # Fetch raw JSON data from NewsNow API endpoint + response = HttpDownloader.inst().get(self.url) + if not response: + logger.warning(f"Empty response from NewsNow API (source: {self.source})") + return [] + + # Parse raw response to NewsInfo objects and apply filters + news_list = self._parse_news(response.json()) + filtered_news = self.filter_news(news_list, max_hour_interval, max_count) + + logger.info(f"Fetched {len(filtered_news)} news items (source: {self.source})") + return filtered_news + + @staticmethod + def _calculate_retry_wait( + retry_number: int, + min_wait: int = 3, + max_wait: int = 5 + ) -> float: + """Calculate exponential backoff wait time for request retries. + + Implements jittered exponential backoff to avoid thundering herd effect. 
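+        For example (illustrative): the third retry waits roughly 5 to 9 seconds,
+        i.e. a uniform(3, 5) base plus 2 * uniform(1, 2) of jitter.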
+ + Args: + retry_number: Current retry attempt number (starting at 1) + min_wait: Minimum base wait time in seconds + max_wait: Maximum base wait time in seconds + + Returns: + Calculated wait time in seconds (with random jitter) + """ + base_wait = random.uniform(min_wait, max_wait) + additional_wait = (retry_number - 1) * random.uniform(1, 2) + return base_wait + additional_wait + + def _parse_news(self, raw_data: dict) -> List[NewsInfo]: + """Parse raw NewsNow API JSON response into NewsInfo objects. + + Extracts and normalizes news fields with fallbacks for missing values, + handles time conversion, and generates unique IDs from URLs. + + Args: + raw_data: Parsed JSON dictionary from NewsNow API response + + Returns: + List of valid NewsInfo objects (skipped invalid/corrupted items) + """ + news_items = [] + for item in raw_data.get("items", []): + try: + # Extract URL with mobile fallback (critical field) + url = item.get("url", "") or item.get("mobileUrl", "") + if not url: + logger.warning("Skipping news item - no URL found") + continue + + # Convert publication time to epoch timestamp (fallback: current time) + pub_time = item.get("pubTime", "") + datetime_epoch = ( + self._timestamp_to_epoch(pub_time) if pub_time else int(time.time()) + ) + + # Create normalized NewsInfo object with default values + news_info = NewsInfo( + category=item.get("category", "general"), + datetime=datetime_epoch, + headline=item.get("title", "No headline"), + id=self.url_to_hash_id(url), # Unique ID from URL hash + image=item.get("image", ""), + related=item.get("related", []), + source=item.get("source", self.source), + summary=item.get("summary", ""), + url=url, + content=item.get("content", ""), + provider="newsnow", + market=self.market + ) + news_items.append(news_info) + + except Exception as e: + logger.error(f"Failed to parse news item (source: {self.source}): {str(e)}") + continue + + return news_items + + +if __name__ == "__main__": + logger.info("Starting NewsNowProvider test for all available sources...") + + for source in AVAILABLE_SOURCE: + try: + provider = NewsNowProvider(source) + news_items = provider.fetch_latest_market_news() + logger.info(f"Source {source}: Found {len(news_items)} news items") + time.sleep(1) # Rate limiting for test execution + except Exception as e: + logger.error(f"Test failed for source {source}: {str(e)}") + + logger.info("NewsNowProvider test completed") diff --git a/src/gentrade/news/rss.py b/src/gentrade/news/providers/rss.py similarity index 100% rename from src/gentrade/news/rss.py rename to src/gentrade/news/providers/rss.py diff --git a/src/gentrade/scraper/extractor.py b/src/gentrade/scraper/extractor.py deleted file mode 100644 index d77898f..0000000 --- a/src/gentrade/scraper/extractor.py +++ /dev/null @@ -1,292 +0,0 @@ -""" -Baidu Search Scraper with Article Extraction - -This module provides tools for: -1. Persistent storage of blocked domains and dummy content patterns. -2. Cleaning and extracting article content while filtering irrelevant material. -3. Scraping Baidu search results and enriching them with extracted article content. 
-""" - -import json -import logging -import os -import random -import re -import time - -from typing import Dict, List, Optional -from urllib.parse import urlparse - -import requests -from bs4 import BeautifulSoup, Comment -from newspaper import Article -from newspaper.article import ArticleException - -# Configure logging -logging.basicConfig( - level=logging.INFO, - format="%(asctime)s - %(levelname)s - %(message)s" -) -logger = logging.getLogger(__name__) - - -class ScraperStorage: - """Manages persistent storage for scraper data such as blocklists and dummy patterns.""" - - def __init__(self, storage_dir: str = "scraper_data"): - self.storage_dir = storage_dir - self.blocklist_path = os.path.join(storage_dir, "blocked_domains.json") - self.dummy_patterns_path = os.path.join( - storage_dir, "dummy_content_patterns.json" - ) - - os.makedirs(storage_dir, exist_ok=True) - self._initialize_file(self.blocklist_path, {}) - self._initialize_file(self.dummy_patterns_path, []) - - def _initialize_file(self, file_path: str, default_content): - """Create a new storage file with default content if it doesn't exist.""" - if not os.path.exists(file_path): - with open(file_path, "w", encoding="utf-8") as f: - json.dump(default_content, f, ensure_ascii=False, indent=2) - - def load_blocked_domains(self) -> Dict[str, float]: - """Load list of blocked domains with their block timestamps.""" - try: - with open(self.blocklist_path, "r", encoding="utf-8") as f: - return json.load(f) - except Exception as e: - logger.error("Failed to load blocked domains: %s", str(e)) - return {} - - def save_blocked_domains(self, blocked_domains: Dict[str, float]): - """Save updated blocked domains list to storage.""" - try: - with open(self.blocklist_path, "w", encoding="utf-8") as f: - json.dump(blocked_domains, f, ensure_ascii=False, indent=2) - except Exception as e: - logger.error("Failed to save blocked domains: %s", str(e)) - - def load_dummy_patterns(self) -> List[str]: - """Load previously identified dummy content patterns.""" - try: - with open(self.dummy_patterns_path, "r", encoding="utf-8") as f: - return json.load(f) - except Exception as e: - logger.error("Failed to load dummy patterns: %s", str(e)) - return [] - - def save_dummy_patterns(self, dummy_patterns: List[str]): - """Save new dummy content patterns to storage, ensuring uniqueness.""" - try: - unique_patterns = [] - for pattern in dummy_patterns: - if pattern not in unique_patterns: - unique_patterns.append(pattern) - - with open(self.dummy_patterns_path, "w", encoding="utf-8") as f: - json.dump(unique_patterns, f, ensure_ascii=False, indent=2) - except Exception as e: - logger.error("Failed to save dummy patterns: %s", str(e)) - - -class ArticleContentExtractor: - """Handles article content extraction with dummy content filtering.""" - - _instance = None - - def __init__(self, storage: ScraperStorage=None): - self.ignored_extensions = ( - ".pdf", ".doc", ".docx", ".xls", ".xlsx", - ".zip", ".rar", ".jpg", ".png", ".jpeg", ".gif" - ) - self.dummy_keywords = { - "we use cookies", "cookie policy", "analyze website traffic", - "accept cookies", "reject cookies", "by continuing to use", - "this website uses cookies", "improve user experience", - "ads by", "sponsored content", "subscribe to access" - } - - if storage is None: - storage = ScraperStorage() - self.storage = storage - - self.blocked_domains = self.storage.load_blocked_domains() - self.dummy_patterns = self.storage.load_dummy_patterns() - - self.user_agents = [ - ("Mozilla/5.0 (Windows NT 10.0; 
Win64; x64) " - "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"), - ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " - "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.5 Safari/605.1.15"), - ("Mozilla/5.0 (X11; Linux x86_64) " - "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36"), - ] - - def _get_random_headers(self) -> Dict[str, str]: - """Generate random browser-like headers.""" - return { - "User-Agent": random.choice(self.user_agents), - "Accept": ("text/html,application/xhtml+xml,application/xml;q=0.9," - "image/avif,image/webp,*/*;q=0.8"), - "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3", - "Connection": "keep-alive", - "Upgrade-Insecure-Requests": "1", - } - - def clean_html(self, html: str) -> str: - """Clean raw HTML by removing non-content elements and ads.""" - if not html: - return "" - - soup = BeautifulSoup(html, "html.parser") - - for tag in soup( - ["script", "style", "noscript", "iframe", "aside", "nav", "footer"] - ): - tag.decompose() - - for comment in soup.find_all(text=lambda t: isinstance(t, Comment)): - comment.extract() - - ad_selectors = [ - "div[class*='ad']", "div[id*='ad']", - "div[class*='advert']", "div[id*='advert']", - "div[class*='推广']", "div[id*='推广']", - ] - for selector in ad_selectors: - for tag in soup.select(selector): - tag.decompose() - - text = soup.get_text() - return re.sub(r"\s+", " ", text).strip() - - def _is_dummy_content(self, content: str) -> bool: - """Check if content contains dummy patterns or keywords.""" - if not content: - return False - - content_lower = content.lower() - - if any(keyword in content_lower for keyword in self.dummy_keywords): - return True - - for pattern in self.dummy_patterns: - if pattern.lower() in content_lower and len(pattern) > 10: - return True - - return False - - def _get_domain(self, url: str) -> str: - """Extract domain from URL (without port).""" - try: - parsed = urlparse(url) - return parsed.netloc.split(":")[0] - except Exception: - return url - - def _is_domain_blocked(self, url: str) -> bool: - """Check if domain is in blocked list (7-day expiration).""" - domain = self._get_domain(url) - if domain in self.blocked_domains: - if time.time() - self.blocked_domains[domain] < 604800: - logger.info("Domain %s is blocked - skipping extraction", domain) - return True - del self.blocked_domains[domain] - self.storage.save_blocked_domains(self.blocked_domains) - return False - - def _block_domain(self, url: str): - """Add domain to blocked list with current timestamp.""" - domain = self._get_domain(url) - if domain not in self.blocked_domains: - self.blocked_domains[domain] = time.time() - self.storage.save_blocked_domains(self.blocked_domains) - logger.info("Added domain %s to blocked list", domain) - - def _add_dummy_content_pattern(self, content: str): - """Extract and save new dummy content patterns from detected content.""" - fragments = re.split(r"[.!?;]", content) - for fragment in fragments: - fragment = fragment.strip() - if 20 < len(fragment) < 200: - self.dummy_patterns.append(fragment) - - self.storage.save_dummy_patterns(self.dummy_patterns) - - def extract_content(self, url: str) -> str: - """Extract article content with dummy filtering and blocklisting.""" - if self._is_domain_blocked(url): - logger.warning("Content source blocked: %s", url) - return "Content source blocked: Previously detected irrelevant content" - - parsed_url = urlparse(url) - if parsed_url.path.lower().endswith(self.ignored_extensions): - 
logger.warning("Skipping non-HTML file: %s", url) - return "Unsupported file type (non-HTML)" - - try: - article = Article(url, language='zh') - article.download() - article.parse() - if article.text: - content = article.text.strip() - else: - # Fallback to HTML scrapping if newspaper3k returns empty text - html = self._fetch_original_html(url) - content = self.clean_html(html) - except ArticleException as e: - logger.warning( - "newspaper3k extraction failed: %s - falling back to HTML cleaning", - str(e) - ) - # Fallback to HTML scrapping if newspaper3k returns empty text - html = self._fetch_original_html(url) - content = self.clean_html(html) - - if self._is_dummy_content(content): - logger.warning("Dummy content detected at: %s", url) - self._block_domain(url) - self._add_dummy_content_pattern(content) - return "Content blocked: Contains cookie notices or irrelevant material" - - return content - - def _fetch_original_html(self, url: str, timeout: int = 10) -> Optional[str]: - """Fetch raw HTML content from a URL with retries. - - Args: - url: URL to fetch HTML from. - timeout: Request timeout in seconds (default: 10). - - Returns: - Raw HTML content as a string, or None if fetch fails. - """ - - retries = 3 - headers = self._get_random_headers() - - for attempt in range(retries): - try: - # Add random delay between retries (0.5-2 seconds) - if attempt > 0: - time.sleep(random.uniform(0.5, 2.0)) - - response = requests.get( - url, headers=headers, timeout=timeout, verify=True - ) - response.raise_for_status() - return response.text - except Exception as e: - if attempt < retries - 1: - continue - logger.error(f"Failed to fetch HTML after {retries} retries ({url}): {e}") - return None - - return None - - @staticmethod - def inst(storage: ScraperStorage=None): - if ArticleContentExtractor._instance is None: - ArticleContentExtractor._instance = ArticleContentExtractor(storage) - return ArticleContentExtractor._instance diff --git a/src/gentrade/utils/download.py b/src/gentrade/utils/download.py index 6867177..e803f30 100644 --- a/src/gentrade/utils/download.py +++ b/src/gentrade/utils/download.py @@ -24,14 +24,21 @@ - loguru >= 0.7.0 - Python >= 3.8 """ - +import re import os import random import time -from typing import Dict +import json +from typing import Dict, List +from urllib.parse import urlparse + import requests from loguru import logger +from bs4 import BeautifulSoup, Comment +from newspaper import Article +from newspaper.article import ArticleException + class HttpDownloader: """HTTP Downloader with retry mechanism, random User-Agent, and proxy support @@ -44,7 +51,7 @@ class HttpDownloader: # Singleton instance storage _INSTANCE = None - def __init__(self, max_retries: int = 3, timeout: int = 10): + def __init__(self, max_retries: int = 3, timeout: int = 5): """Initialize downloader configuration Args: @@ -115,7 +122,7 @@ def proxies(self) -> Dict: return proxy_config - def get(self, url: str, params: Dict = None) -> Dict: + def get(self, url: str, verify: bool = True, params: Dict = None) -> requests.Response: """Send HTTP GET request with automatic retry mechanism Args: @@ -126,7 +133,7 @@ def get(self, url: str, params: Dict = None) -> Dict: """ retry_count = 0 # Current retry attempt counter - logger.debug(f"Http download {url} ...") + logger.debug(f"Http download {url} {verify} {params} ") # Retry loop until max retries or successful response while retry_count <= self.max_retries: try: @@ -137,13 +144,14 @@ def get(self, url: str, params: Dict = None) -> Dict: 
headers=self.http_headers, timeout=self.timeout, params=params, - verify=True # Enable SSL certificate verification + verify=verify # Enable SSL certificate verification ) + # Raise exception for HTTP error status codes (4xx/5xx) response.raise_for_status() - return response.json() # Return successful response content - + return response except Exception as e: + logger.error(e) retry_count += 1 # Increment retry counter on failure # Final retry failed - log error and return None @@ -163,6 +171,33 @@ def get(self, url: str, params: Dict = None) -> Dict: return None + def clean_html(self, html: str) -> str: + """Clean raw HTML by removing non-content elements and ads.""" + if not html: + return "" + + soup = BeautifulSoup(html, "html.parser") + + for tag in soup( + ["script", "style", "noscript", "iframe", "aside", "nav", "footer"] + ): + tag.decompose() + + for comment in soup.find_all(text=lambda t: isinstance(t, Comment)): + comment.extract() + + ad_selectors = [ + "div[class*='ad']", "div[id*='ad']", + "div[class*='advert']", "div[id*='advert']", + "div[class*='推广']", "div[id*='推广']", + ] + for selector in ad_selectors: + for tag in soup.select(selector): + tag.decompose() + + text = soup.get_text() + return re.sub(r"\s+", " ", text).strip() + @staticmethod def inst() -> "HttpDownloader": """Get singleton instance of HttpDownloader @@ -175,3 +210,183 @@ def inst() -> "HttpDownloader": if HttpDownloader._INSTANCE is None: HttpDownloader._INSTANCE = HttpDownloader() return HttpDownloader._INSTANCE + + +class ScraperStorage: + """Manages persistent storage for scraper data such as blocklists and dummy patterns.""" + + def __init__(self, storage_dir: str = "scraper_data"): + self.storage_dir = storage_dir + self.blocklist_path = os.path.join(storage_dir, "blocked_domains.json") + self.dummy_patterns_path = os.path.join( + storage_dir, "dummy_content_patterns.json" + ) + + os.makedirs(storage_dir, exist_ok=True) + self._initialize_file(self.blocklist_path, {}) + self._initialize_file(self.dummy_patterns_path, []) + + def _initialize_file(self, file_path: str, default_content): + """Create a new storage file with default content if it doesn't exist.""" + if not os.path.exists(file_path): + with open(file_path, "w", encoding="utf-8") as f: + json.dump(default_content, f, ensure_ascii=False, indent=2) + + def load_blocked_domains(self) -> Dict[str, float]: + """Load list of blocked domains with their block timestamps.""" + try: + with open(self.blocklist_path, "r", encoding="utf-8") as f: + return json.load(f) + except Exception as e: + logger.error("Failed to load blocked domains: %s", str(e)) + return {} + + def save_blocked_domains(self, blocked_domains: Dict[str, float]): + """Save updated blocked domains list to storage.""" + try: + with open(self.blocklist_path, "w", encoding="utf-8") as f: + json.dump(blocked_domains, f, ensure_ascii=False, indent=2) + except Exception as e: + logger.error("Failed to save blocked domains: %s", str(e)) + + def load_dummy_patterns(self) -> List[str]: + """Load previously identified dummy content patterns.""" + try: + with open(self.dummy_patterns_path, "r", encoding="utf-8") as f: + return json.load(f) + except Exception as e: + logger.error("Failed to load dummy patterns: %s", str(e)) + return [] + + def save_dummy_patterns(self, dummy_patterns: List[str]): + """Save new dummy content patterns to storage, ensuring uniqueness.""" + try: + unique_patterns = [] + for pattern in dummy_patterns: + if pattern not in unique_patterns: + 
unique_patterns.append(pattern) + + with open(self.dummy_patterns_path, "w", encoding="utf-8") as f: + json.dump(unique_patterns, f, ensure_ascii=False, indent=2) + except Exception as e: + logger.error("Failed to save dummy patterns: %s", str(e)) + + +class ArticleDownloader(HttpDownloader): + """Handles article content extraction with dummy content filtering.""" + + _INSTANCE = None + + def __init__(self, storage: ScraperStorage=None): + super().__init__() + self.ignored_extensions = ( + ".pdf", ".doc", ".docx", ".xls", ".xlsx", + ".zip", ".rar", ".jpg", ".png", ".jpeg", ".gif" + ) + self.dummy_keywords = { + "we use cookies", "cookie policy", "analyze website traffic", + "accept cookies", "reject cookies", "by continuing to use", + "this website uses cookies", "improve user experience", + "ads by", "sponsored content", "subscribe to access" + } + + if storage is None: + storage = ScraperStorage() + self.storage = storage + + self.blocked_domains = self.storage.load_blocked_domains() + self.dummy_patterns = self.storage.load_dummy_patterns() + + def _is_dummy_content(self, content: str) -> bool: + """Check if content contains dummy patterns or keywords.""" + if not content: + return False + + content_lower = content.lower() + + if any(keyword in content_lower for keyword in self.dummy_keywords): + return True + + for pattern in self.dummy_patterns: + if pattern.lower() in content_lower and len(pattern) > 10: + return True + + return False + + def _get_domain(self, url: str) -> str: + """Extract domain from URL (without port).""" + try: + parsed = urlparse(url) + return parsed.netloc.split(":")[0] + except Exception: + return url + + def _is_domain_blocked(self, url: str) -> bool: + """Check if domain is in blocked list (7-day expiration).""" + domain = self._get_domain(url) + if domain in self.blocked_domains: + if time.time() - self.blocked_domains[domain] < 604800: + logger.info("Domain %s is blocked - skipping extraction", domain) + return True + del self.blocked_domains[domain] + self.storage.save_blocked_domains(self.blocked_domains) + return False + + def _block_domain(self, url: str): + """Add domain to blocked list with current timestamp.""" + domain = self._get_domain(url) + if domain not in self.blocked_domains: + self.blocked_domains[domain] = time.time() + self.storage.save_blocked_domains(self.blocked_domains) + logger.info("Added domain %s to blocked list", domain) + + def _add_dummy_content_pattern(self, content: str): + """Extract and save new dummy content patterns from detected content.""" + fragments = re.split(r"[.!?;]", content) + for fragment in fragments: + fragment = fragment.strip() + if 20 < len(fragment) < 200: + self.dummy_patterns.append(fragment) + + self.storage.save_dummy_patterns(self.dummy_patterns) + + def get_content(self, url: str, verify: bool=True, params: Dict = None) -> str: + """Get article content with dummy filtering and blocklisting.""" + if self._is_domain_blocked(url): + logger.warning("Content source blocked: %s", url) + return "Content source blocked: Previously detected irrelevant content" + + parsed_url = urlparse(url) + if parsed_url.path.lower().endswith(self.ignored_extensions): + logger.warning("Skipping non-HTML file: %s", url) + return "Unsupported file type (non-HTML)" + + resp = super().get(url, verify, params) + if not resp: + return None + + try: + article = Article(url, language='zh') + article.set_html(resp.text) + article.parse() + content = article.text + except ArticleException as e: + logger.warning( + "newspaper3k 
extraction failed: %s - falling back to HTML cleaning", + str(e) + ) + content = self.clean_html(resp.text) + + if self._is_dummy_content(content): + logger.warning("Dummy content detected at: %s", url) + self._block_domain(url) + self._add_dummy_content_pattern(content) + return None + + return content + + @staticmethod + def inst(storage: ScraperStorage=None): + if ArticleDownloader._INSTANCE is None: + ArticleDownloader._INSTANCE = ArticleDownloader(storage) + return ArticleDownloader._INSTANCE diff --git a/src/gentrade/scraper/search.py b/src/gentrade/utils/search.py similarity index 91% rename from src/gentrade/scraper/search.py rename to src/gentrade/utils/search.py index 0509a4b..12cb658 100644 --- a/src/gentrade/scraper/search.py +++ b/src/gentrade/utils/search.py @@ -7,7 +7,6 @@ """ import json -import logging import random import re import time @@ -16,19 +15,11 @@ import requests from bs4 import BeautifulSoup +from loguru import logger -from gentrade.scraper.extractor import ScraperStorage, ArticleContentExtractor - - -# pylint: disable=too-many-locals,too-many-statements,too-many-branches,possibly-used-before-assignment - -# Configure logging -logging.basicConfig( - level=logging.INFO, - format="%(asctime)s - %(levelname)s - %(message)s" -) -logger = logging.getLogger(__name__) +from gentrade.utils.download import ArticleDownloader +# pylint: disable=too-many-branches,too-many-locals,too-many-statements class BaiduSearchScraper: """Scrapes Baidu search results and extracts structured article data.""" @@ -36,19 +27,8 @@ class BaiduSearchScraper: def __init__(self) -> None: """Initialize scraper with user agents, storage, and regex patterns.""" self.base_url = "https://www.baidu.com/s" - self.user_agents = [ - ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) " - "AppleWebKit/537.36 (KHTML, like Gecko) " - "Chrome/114.0.0.0 Safari/537.36"), - ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " - "AppleWebKit/605.1.15 (KHTML, like Gecko) " - "Version/16.5 Safari/605.1.15"), - ("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 " - "(KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36") - ] - self.storage = ScraperStorage() - self.content_extractor = ArticleContentExtractor(self.storage) + self.content_downloader = ArticleDownloader() self.time_patterns = { "minute": re.compile(r"(\d+)\s*分钟前"), @@ -64,20 +44,16 @@ def __init__(self) -> None: ), } - def _get_random_headers(self) -> Dict[str, str]: - """Generate random HTTP headers for requests.""" - return { - "User-Agent": random.choice(self.user_agents), - "Accept": ("text/html,application/xhtml+xml,application/xml;" - "q=0.9,image/avif,image/webp,*/*;q=0.8"), - "Accept-Language": ( - "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5," - "en-US;q=0.3,en;q=0.2" - ), - "Connection": "keep-alive", - "Referer": "https://www.baidu.com/", - "Upgrade-Insecure-Requests": "1", - } + self.user_agents = [ + ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) " + "AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/114.0.0.0 Safari/537.36"), + ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " + "AppleWebKit/605.1.15 (KHTML, like Gecko) " + "Version/16.5 Safari/605.1.15"), + ("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36") + ] def _parse_time_to_timestamp(self, time_text: str) -> int: """Convert a time string into a Unix timestamp.""" @@ -92,6 +68,7 @@ def _parse_time_to_timestamp(self, time_text: str) -> int: if match: try: num = int(match.group(1)) + dt = None if unit == "minute": dt = now - 
timedelta(minutes=num) elif unit == "hour": @@ -104,7 +81,9 @@ def _parse_time_to_timestamp(self, time_text: str) -> int: dt = now - timedelta(days=num * 30) elif unit == "year": dt = now - timedelta(days=num * 365) - return int(dt.timestamp()) + if dt: + return int(dt.timestamp()) + continue except Exception: continue @@ -129,6 +108,21 @@ def _parse_time_to_timestamp(self, time_text: str) -> int: logger.warning("Unrecognized time format: %s", time_text) return int(time.time()) + def _get_random_headers(self) -> Dict[str, str]: + """Generate random HTTP headers for requests.""" + return { + "User-Agent": random.choice(self.user_agents), + "Accept": ("text/html,application/xhtml+xml,application/xml;" + "q=0.9,image/avif,image/webp,*/*;q=0.8"), + "Accept-Language": ( + "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5," + "en-US;q=0.3,en;q=0.2" + ), + "Connection": "keep-alive", + "Referer": "https://www.baidu.com/", + "Upgrade-Insecure-Requests": "1", + } + def search( self, query: str, @@ -219,7 +213,7 @@ def search( content = "" if fetch_content and url: - content = self.content_extractor.extract_content(url) + content = self.content_downloader.get_content(url) time.sleep(random.uniform(0.5, 1.5)) results.append({ @@ -235,8 +229,7 @@ def search( logger.error("Error parsing result: %s", str(e)) continue - logger.info("Fetched page %d - total results: %d", - current_page, len(results)) + logger.info(f"Fetched page {current_page} - total results: {len(results)}") next_page = soup.select_one("a.n") if not next_page: @@ -257,7 +250,6 @@ def search( if __name__ == "__main__": - logging.basicConfig(level=logging.DEBUG) scraper = BaiduSearchScraper() news = scraper.search( query="最近24小时关于TESLA的财经新闻", @@ -266,5 +258,4 @@ def search( fetch_content=True, ) - - print(json.dumps(news, ensure_ascii=False, indent=2)) + logger.info(json.dumps(news, ensure_ascii=False, indent=2)) diff --git a/tests/test_gentrade_news.py b/tests/test_gentrade_news.py index ae777dd..94952cd 100644 --- a/tests/test_gentrade_news.py +++ b/tests/test_gentrade_news.py @@ -4,10 +4,7 @@ from gentrade.news.factory import NewsAggregator, NewsFactory from gentrade.news.meta import NewsFileDatabase -# from gentrade.news.newsapi import NewsApiProvider -# from gentrade.news.rss import RssProvider -# from gentrade.news.finnhub import FinnhubNewsProvider -# from gentrade.news.newsnow import NewsNowProvider +from gentrade.news.providers.newsnow import AVAILABLE_SOURCE @pytest.mark.parametrize("provider_name", [ "newsapi", "finnhub", "rss", "newsnow"]) @@ -28,3 +25,24 @@ def test_provider_basic(provider_name:str): for news_item in all_news: logger.info("[%s...]: %s..." % (str(news_item.id)[:10], news_item.headline[:15])) + + +@pytest.mark.parametrize("source", + AVAILABLE_SOURCE) +def test_provider_newsnow(source:str): + db = NewsFileDatabase("news_db.txt") + + provider = NewsFactory.create_provider("newsnow", source=source) + aggregator = NewsAggregator([ provider], db) + aggregator.sync_news( + category="business", + max_hour_interval=64, + max_count=10, + process_content = True) + + # Log results + all_news = db.get_all_news() + logger.info(f"Total articles in database: {len(all_news)}") + + for news_item in all_news: + logger.info("[%s...]: %s..." 
% (str(news_item.id)[:10], news_item.headline[:15])) diff --git a/tests/test_gentrade_search.py b/tests/test_gentrade_search.py deleted file mode 100644 index 0bd52cc..0000000 --- a/tests/test_gentrade_search.py +++ /dev/null @@ -1,191 +0,0 @@ -from unittest.mock import patch, Mock -import pytest -from gentrade.scraper.search import BaiduSearchScraper - - -@pytest.fixture -def scraper(): - """Fixture to provide a BaiduSearchScraper instance""" - return BaiduSearchScraper() - - -def test_initialization(scraper): - """Test scraper initialization sets up required components""" - assert scraper.base_url == "https://www.baidu.com/s" - assert len(scraper.user_agents) > 0 - assert hasattr(scraper, "storage") - assert hasattr(scraper, "content_extractor") - assert len(scraper.time_patterns) == 8 # Check all time patterns are loaded - - -# def test_get_random_headers(scraper): -# """Test header generation contains required fields""" -# headers = scraper._get_random_headers() -# assert "User-Agent" in headers -# assert headers["User-Agent"] in scraper.user_agents -# assert "Accept" in headers -# assert "Referer" in headers -# assert headers["Referer"] == "https://www.baidu.com/" - - -@patch("gentrade.scraper.search.requests.get") -def test_search_basic(mock_get, scraper): - """Test basic search functionality returns expected structure""" - # Mock successful response with sample search results - mock_response = Mock() - mock_response.status_code = 200 - mock_response.text = """ - -
-    [mock Baidu result markup lost in extraction; it contained two result blocks:
-     "Test Title 1" (link https://example.com/news1) / "Test summary 1" / "Example Source 2小时前",
-     and "Test Title 2" / "Test summary 2" / "Another Source 1天前"]
- 下一页 - - """ - mock_get.return_value = mock_response - - # Execute search - results = scraper.search(query="test", limit=2, fetch_content=False) - - # Verify results structure - assert len(results) == 2 - assert results[0]["title"] == "Test Title 1" - assert results[0]["url"] == "https://example.com/news1" - assert results[0]["summary"] == "Test summary 1" - assert results[0]["source"] == "Example Source" - assert results[0]["content"] == "" # fetch_content=False - - # Verify request parameters - mock_get.assert_called_once() - _, kwargs = mock_get.call_args - assert kwargs["params"]["wd"] == "test" - assert kwargs["params"]["pn"] == 0 # First page - - -@patch("gentrade.scraper.search.requests.get") -def test_search_with_limit(mock_get, scraper): - """Test search respects result limit""" - # Create mock response with 5 results - result_html = """ -
-    [mock Baidu result template lost in extraction; it contained one result block
-     per index: "Title {{i}}" / "Summary {{i}}" / "Source {{i}} 1小时前"]
- """ - mock_response = Mock() - mock_response.status_code = 200 - mock_response.text = f""" - - {''.join([result_html.replace('{{i}}', str(i)) for i in range(5)])} - 下一页 - - """ - mock_get.return_value = mock_response - - # Request limit=3 - results = scraper.search(query="limit test", limit=3) - assert len(results) == 3 - - -@patch("gentrade.scraper.search.requests.get") -def test_search_pagination(mock_get, scraper): - """Test search handles pagination correctly""" - # Create two page responses - page1_html = """ - - - 下一页 - - """ - page2_html = """ - - - - """ - - # Configure mock to return different pages - mock_get.side_effect = [ - Mock(status_code=200, text=page1_html), - Mock(status_code=200, text=page2_html) - ] - - # Search with limit=2 (needs 2 pages) - results = scraper.search(query="pagination test", limit=2) - assert len(results) == 2 - assert mock_get.call_count == 2 # Should call twice for two pages - - -@patch("gentrade.scraper.search.requests.get") -def test_search_failed_request(mock_get, scraper): - """Test search handles HTTP errors gracefully""" - mock_response = Mock() - mock_response.status_code = 403 # Forbidden - mock_get.return_value = mock_response - - results = scraper.search(query="failed request", limit=5) - assert len(results) == 0 # No results on failure - - -@patch("gentrade.scraper.search.ArticleContentExtractor.extract_content") -@patch("gentrade.scraper.search.requests.get") -def test_fetch_content_flag(mock_get, mock_extract, scraper): - """Test fetch_content flag controls content extraction""" - # Basic result HTML - mock_response = Mock(status_code=200, text=""" -
-    [mock Baidu result markup lost in extraction; it contained a single result block:
-     "Content Test" / "Source 10分钟前"]
- """) - mock_get.return_value = mock_response - mock_extract.return_value = "Full article content" - - # With content fetching - results_with_content = scraper.search(query="content test", fetch_content=True) - assert results_with_content[0]["content"] == "Full article content" - mock_extract.assert_called_once() - - # Without content fetching - mock_extract.reset_mock() - results_no_content = scraper.search(query="content test", fetch_content=False) - assert results_no_content[0]["content"] == "" - mock_extract.assert_not_called() - - -# def test_time_pattern_robustness(scraper): -# """Test time parsing handles messy real-world formats""" -# messy_formats = [ -# " 3小时 前 ", # Extra spaces -# "2023/ 08 / 15", # Inconsistent spacing -# "2023-05-06 09:45", # Extra spaces -# "5 天前", # Space between number and unit -# "2022年12月31日18:30", # No space between date and time -# ] - -# for time_str in messy_formats: -# # Should not raise exceptions and return valid timestamp -# timestamp = scraper._parse_time_to_timestamp(time_str) -# assert isinstance(timestamp, int) -# assert timestamp > 0 # Valid timestamp - - -@patch("gentrade.scraper.search.requests.get") -def test_search_no_results(mock_get, scraper): - """Test search handles empty results gracefully""" - mock_response = Mock(status_code=200, text="") # No results - mock_get.return_value = mock_response - - results = scraper.search(query="no results possible", limit=5) - assert len(results) == 0
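
A minimal caller sketch (illustrative, not part of the diff above) for the reworked HttpDownloader.get(), which after this patch returns the raw requests.Response (or None once retries are exhausted) rather than parsed JSON and accepts a verify flag; the endpoint URL and query parameters below are assumptions for illustration only.

    from gentrade.utils.download import HttpDownloader

    # Singleton accessor defined in download.py; get() retries with randomized
    # User-Agent headers and optional proxies before giving up and returning None.
    resp = HttpDownloader.inst().get(
        "https://api.example.com/search",  # illustrative endpoint, not from the patch
        verify=True,
        params={"q": "TSLA"},
    )
    # Callers now decode JSON themselves instead of relying on get().
    data = resp.json() if resp is not None else None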
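
Similarly, a hedged usage sketch for ArticleDownloader, which this patch merges from the old scraper extractor into gentrade.utils.download; the article URL is illustrative and not taken from the patch.

    from gentrade.utils.download import ArticleDownloader, ScraperStorage

    # ScraperStorage persists the domain blocklist and dummy-content patterns
    # as JSON files under scraper_data/ by default.
    downloader = ArticleDownloader(ScraperStorage())

    content = downloader.get_content("https://example.com/news/article.html")
    if content:
        print(content[:200])
    else:
        # None means the HTTP fetch failed or the page looked like dummy
        # content (cookie banners, ads); such domains are blocklisted for
        # seven days before being retried.
        print("no usable article content")

The module-level singleton ArticleDownloader.inst() gives the same behaviour when a shared instance is preferred.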