From 7c70f911df6ff1643253e2c6e76f31af69e87d41 Mon Sep 17 00:00:00 2001 From: Lu Ken Date: Mon, 1 Dec 2025 14:00:12 +0800 Subject: [PATCH 01/15] add missing python dependency Signed-off-by: Lu Ken --- requirements.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/requirements.txt b/requirements.txt index cc5d1d3..ad49e6b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,3 +14,5 @@ langchain_core langchain_tavily langchain langgraph + +lxml[html_clean] From fed62d1e1e278d34256c64d6a776eb63d5c171f8 Mon Sep 17 00:00:00 2001 From: Lu Ken Date: Mon, 1 Dec 2025 14:29:30 +0800 Subject: [PATCH 02/15] improve the HTML download by adding random sleep and multiple browser's headers Signed-off-by: Lu Ken --- src/gentrade/news/factory.py | 43 +++++++++++++++++++++++++++++++----- 1 file changed, 37 insertions(+), 6 deletions(-) diff --git a/src/gentrade/news/factory.py b/src/gentrade/news/factory.py index 54b75b5..81f01e6 100644 --- a/src/gentrade/news/factory.py +++ b/src/gentrade/news/factory.py @@ -10,6 +10,7 @@ import logging import time import threading +import random from typing import List, Optional, Set from urllib.parse import urlparse # Add this to extract domain from URL @@ -261,6 +262,9 @@ def _extract_news_text(self, url: str) -> str: Returns: Cleaned text content of the article, or empty string if extraction fails. """ + # Add random delay before request to avoid rate limiting + time.sleep(random.uniform(1, 3)) + try: article = Article(url) article.download() @@ -287,22 +291,49 @@ def _fetch_original_html(self, url: str, timeout: int = 10) -> Optional[str]: Returns: Raw HTML content as a string, or None if fetch fails. """ - headers = { - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " - "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" - } + # More realistic headers that mimic popular browsers + headers_list = [ + { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36", + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif," + "image/webp,*/*;q=0.8", + "Accept-Language": "en-US,en;q=0.5", + "Accept-Encoding": "gzip, deflate, br", + "Connection": "keep-alive", + "Upgrade-Insecure-Requests": "1", + "Sec-Fetch-Dest": "document", + "Sec-Fetch-Mode": "navigate", + "Sec-Fetch-Site": "none", + "Sec-Fetch-User": "?1" + }, + { + "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " + "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.5 Safari/605.1.15", + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", + "Accept-Language": "en-US,en;q=0.5", + "Accept-Encoding": "gzip, deflate, br", + "Connection": "keep-alive", + "Upgrade-Insecure-Requests": "1" + } + ] retries = 3 + # Use a random header from the list for each request + headers = random.choice(headers_list) for attempt in range(retries): try: + # Add random delay between retries (0.5-2 seconds) + if attempt > 0: + time.sleep(random.uniform(0.5, 2.0)) + response = requests.get( - url, headers=headers, timeout=timeout, verify=False + url, headers=headers, timeout=timeout, verify=True ) response.raise_for_status() return response.text except Exception as e: if attempt < retries - 1: - time.sleep(1) continue LOG.error(f"Failed to fetch HTML after {retries} retries ({url}): {e}") return None From c75724b0a9caa4f8fc19567920143ad642197d78 Mon Sep 17 00:00:00 2001 From: Lu Ken Date: Mon, 1 Dec 2025 17:33:51 +0800 Subject: [PATCH 03/15] add Makefile 
Signed-off-by: Lu Ken --- Makefile | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 Makefile diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..8a518a1 --- /dev/null +++ b/Makefile @@ -0,0 +1,33 @@ +# Makefile for Pylint with suggestion-mode support + +# Default target +all: lint + +# Pylint configuration +PYLINT = pylint +PYLINT_ARGS = --rcfile=.pylintrc +PYLINT_VERSION = pylint==4.0.4 + + +# Lint all Python files with suggestion-mode support +lint: + pip install $(PYLINT_VERSION) + @echo "Running Pylint with suggestion-mode support..." + find . -type f -name "*.py" | xargs $(PYLINT) $(PYLINT_ARGS) + +# Lint specific file(s) +lint-file: + @if [ -z "$(FILE)" ]; then \ + echo "Usage: make lint-file FILE=path/to/file.py"; \ + exit 1; \ + fi + pip install $(PYLINT_VERSION) + $(PYLINT) $(PYLINT_ARGS) $(FILE) + + +# Clean up (optional) +clean: + find . -type d -name "__pycache__" -exec rm -rf {} + + find . -type f -name "*.pyc" -delete + +.PHONY: all lint lint-file clean From 088d654a564866c624f594b7451926a59036307a Mon Sep 17 00:00:00 2001 From: Lu Ken Date: Mon, 1 Dec 2025 17:34:17 +0800 Subject: [PATCH 04/15] remove obsoleted option Signed-off-by: Lu Ken --- .pylintrc | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/.pylintrc b/.pylintrc index ded06ab..365e076 100644 --- a/.pylintrc +++ b/.pylintrc @@ -104,10 +104,6 @@ recursive=no # source root. source-roots= -# When enabled, pylint would attempt to guess common misconfiguration and emit -# user-friendly hints instead of false-positive error messages. -suggestion-mode=yes - # Allow loading of arbitrary C extensions. Extensions are imported into the # active Python interpreter and may run arbitrary code. unsafe-load-any-extension=no @@ -457,7 +453,8 @@ disable=raw-checker-failed, import-error, duplicate-code, redefined-outer-name, - logging-fstring-interpolation + logging-fstring-interpolation, + abstract-class-instantiated # Enable the message, report, category or checker with the given id(s). You can # either give multiple identifier separated by comma (,) or put this option From 44fac0b0b686a60d87f5d817f9b6ddf276da06b4 Mon Sep 17 00:00:00 2001 From: Lu Ken Date: Mon, 1 Dec 2025 17:34:46 +0800 Subject: [PATCH 05/15] update langchain's package name Signed-off-by: Lu Ken --- src/gentrade/llm/factory.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gentrade/llm/factory.py b/src/gentrade/llm/factory.py index dc86183..fd297c3 100644 --- a/src/gentrade/llm/factory.py +++ b/src/gentrade/llm/factory.py @@ -24,11 +24,11 @@ from pydantic import Field from langchain_openai import ChatOpenAI -from langchain.schema import ( +from langchain_core.messages import ( AIMessage, BaseMessage ) -from langchain.schema.runnable import RunnableConfig +from langchain_core.runnables import RunnableConfig LOG = logging.getLogger(__name__) From 3d2c3b4953ef1210056b364c61fe8cacfc779bcc Mon Sep 17 00:00:00 2001 From: Lu Ken Date: Mon, 1 Dec 2025 17:35:14 +0800 Subject: [PATCH 06/15] update the directory name Signed-off-by: Lu Ken --- .github/workflows/pylint.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml index 0ecee88..504ab9f 100644 --- a/.github/workflows/pylint.yml +++ b/.github/workflows/pylint.yml @@ -33,5 +33,5 @@ jobs: - name: Analyzing the python code run: | set -ex - export PYTHONPATH=$PWD/src/tia/ + export PYTHONPATH=$PWD/src/gentrade/ find . 
-type f -name "*.py" | xargs pylint From fd7619fbeef3a9e637ee5683b9b26629259dc26e Mon Sep 17 00:00:00 2001 From: Lu Ken Date: Mon, 1 Dec 2025 18:05:24 +0800 Subject: [PATCH 07/15] add more action in Makefile Signed-off-by: Lu Ken --- .github/workflows/pylint.yml | 9 ++++++-- Makefile | 39 ++++++++++++++++++++++------------ src/gentrade/scraper/search.py | 1 + 3 files changed, 34 insertions(+), 15 deletions(-) diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml index 504ab9f..ccba373 100644 --- a/.github/workflows/pylint.yml +++ b/.github/workflows/pylint.yml @@ -20,18 +20,23 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 with: python-version: 3.11 + - name: Install dependencies run: | + sudo apt-get update + sudo apt-get install -y build-essential + pip3 install --upgrade pip - pip3 install pylint for f in $(find -type f -name "requirements.txt"); do pip3 install -r $f done + - name: Analyzing the python code run: | set -ex export PYTHONPATH=$PWD/src/gentrade/ - find . -type f -name "*.py" | xargs pylint + make lint diff --git a/Makefile b/Makefile index 8a518a1..cf24f20 100644 --- a/Makefile +++ b/Makefile @@ -1,19 +1,21 @@ -# Makefile for Pylint with suggestion-mode support -# Default target -all: lint +CURRENT_DIR := $(shell pwd) # Pylint configuration -PYLINT = pylint PYLINT_ARGS = --rcfile=.pylintrc PYLINT_VERSION = pylint==4.0.4 +REQUIRED_PYTHON_VERSION := "3.11" +PYTHON_VERSION_OK := $(shell python -c "import sys; print(sys.version_info >= (3,11) and sys.version_info < (3,12))") + +# Default target +all: lint -# Lint all Python files with suggestion-mode support +# Lint all Python files lint: pip install $(PYLINT_VERSION) - @echo "Running Pylint with suggestion-mode support..." - find . -type f -name "*.py" | xargs $(PYLINT) $(PYLINT_ARGS) + @echo "Running Pylint ..." + find . -type f -name "*.py" | xargs pylint $(PYLINT_ARGS) # Lint specific file(s) lint-file: @@ -22,12 +24,23 @@ lint-file: exit 1; \ fi pip install $(PYLINT_VERSION) - $(PYLINT) $(PYLINT_ARGS) $(FILE) - + pylint $(PYLINT_ARGS) $(FILE) + +# Check python version +check-python: + @echo "Checking Python version..." + @if [ "$(PYTHON_VERSION_OK)" = "True" ]; then \ + echo "✅ Python version is $(REQUIRED_PYTHON_VERSION) (compatible)"; \ + else \ + echo "❌ Error: Python $(REQUIRED_PYTHON_VERSION) is required (found $(shell python --version | cut -d' ' -f2))"; \ + exit 1; \ + fi -# Clean up (optional) +# Clean up clean: - find . -type d -name "__pycache__" -exec rm -rf {} + - find . -type f -name "*.pyc" -delete + @echo "Cleaning in: $(CURRENT_DIR)..." 
+ find $(CURRENT_DIR) -type d -name "__pycache__" -exec rm -rf {} + + find $(CURRENT_DIR) -type f -name "*.pyc" -delete + find $(CURRENT_DIR) -type f -name ".pylint-cache" -exec rm -rf {} + -.PHONY: all lint lint-file clean +.PHONY: all lint lint-file clean check-python diff --git a/src/gentrade/scraper/search.py b/src/gentrade/scraper/search.py index 4a43e35..0509a4b 100644 --- a/src/gentrade/scraper/search.py +++ b/src/gentrade/scraper/search.py @@ -266,4 +266,5 @@ def search( fetch_content=True, ) + print(json.dumps(news, ensure_ascii=False, indent=2)) From 4f5cd029d86127c38b9f7205c435b4d17d07a8ac Mon Sep 17 00:00:00 2001 From: Lu Ken Date: Tue, 2 Dec 2025 10:40:38 +0800 Subject: [PATCH 08/15] add missing python dependency: newspaper3k Signed-off-by: Lu Ken --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index ad49e6b..ed82156 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,3 +16,4 @@ langchain langgraph lxml[html_clean] +newspaper3k \ No newline at end of file From 3ac2a9b4ed4cad5dd1d4df19971bf342ac4eced1 Mon Sep 17 00:00:00 2001 From: Lu Ken Date: Tue, 2 Dec 2025 10:50:04 +0800 Subject: [PATCH 09/15] clean up and refine the code in factory Signed-off-by: Lu Ken --- src/gentrade/news/factory.py | 201 ++++++----------------------------- 1 file changed, 31 insertions(+), 170 deletions(-) diff --git a/src/gentrade/news/factory.py b/src/gentrade/news/factory.py index 81f01e6..81f4917 100644 --- a/src/gentrade/news/factory.py +++ b/src/gentrade/news/factory.py @@ -15,6 +15,7 @@ from urllib.parse import urlparse # Add this to extract domain from URL import requests +from gentrade.scraper.extractor import ArticleContentExtractor from newspaper import Article from bs4 import BeautifulSoup @@ -133,6 +134,35 @@ def _save_blocklist(self) -> None: for domain in self.blocklist: f.write(f"{domain}\n") + def _fetch_thread(self, provider, aggregator, ticker, category, + max_hour_interval, max_count, is_process=False): + if ticker: + news = provider.fetch_stock_news( + ticker, category, max_hour_interval, max_count + ) + LOG.info( + f"Fetched {len(news)} stock news articles for {ticker} from " + f"{provider.__class__.__name__}" + ) + else: + news = provider.fetch_latest_market_news( + category, max_hour_interval, max_count + ) + LOG.info( + f"Fetched {len(news)} market news articles from " + f"{provider.__class__.__name__}" + ) + + ace = ArticleContentExtractor.inst() + for item in news: + item.summary = ace.clean_html(item.summary) + if is_process: + item.content = ace.extract_content(item.url) + LOG.info(item.content) + + with aggregator.db_lock: + aggregator.db.add_news(news) + def sync_news( self, ticker: Optional[str] = None, @@ -158,32 +188,10 @@ def sync_news( LOG.info("Starting news sync...") - def fetch_and_process(provider, aggregator, ticker, category, max_hour_interval, max_count): - if ticker: - news = provider.fetch_stock_news( - ticker, category, max_hour_interval, max_count - ) - LOG.info( - f"Fetched {len(news)} stock news articles for {ticker} from " - f"{provider.__class__.__name__}" - ) - else: - news = provider.fetch_latest_market_news( - category, max_hour_interval, max_count - ) - LOG.info( - f"Fetched {len(news)} market news articles from " - f"{provider.__class__.__name__}" - ) - - aggregator.process_news(news) - with aggregator.db_lock: - aggregator.db.add_news(news) - threads = [] for provider in self.providers: thread = threading.Thread( - target=fetch_and_process, + target=self._fetch_thread, 
args=(provider, self, ticker, category, max_hour_interval, max_count) ) threads.append(thread) @@ -195,28 +203,6 @@ def fetch_and_process(provider, aggregator, ticker, category, max_hour_interval, self.db.last_sync = current_time LOG.info("News sync completed.") - def process_news(self, news: List[NewsInfo]) -> None: - """Process news: Skip blocked sites → Check for dummy content → Clean content""" - # Filter out news from blocked websites FIRST - filtered_news = [n for n in news if not self._is_blocked(n.url)] - - for article in filtered_news: - LOG.info(f"Processing news: {article.headline}") - - # Extract content and check for dummy messages - content = self._extract_news_text(article.url) - if self._contains_dummy_content(content): - # Add the website to blocklist if dummy content is found - domain = self._extract_domain(article.url) - self.blocklist.add(domain) - LOG.warning(f"Blocked website {domain} (contains dummy content)") - continue # Skip storing this article - - # Proceed with normal cleaning if no dummy content - article.summary = self._clean_html(article.summary) - article.content = content - time.sleep(1) - def _is_blocked(self, url: str) -> bool: """Check if the website of the URL is in the blocklist""" domain = self._extract_domain(url) @@ -241,131 +227,6 @@ def _extract_domain(self, url: str) -> str: LOG.error(f"Failed to extract domain from {url}: {e}") return url # Fallback to full URL if parsing fails - def _contains_dummy_content(self, content: str) -> bool: - """Check if content contains dummy messages (case-insensitive)""" - if not content: - return False - content_lower = content.lower() - # Count how many dummy keywords match - dummy_count = sum(1 for keyword in self.dummy_keywords if keyword in content_lower) - # Return True if ≥1 keyword matches (adjust threshold if needed) - return dummy_count >= 1 - - def _extract_news_text(self, url: str) -> str: - """Extract text content from a news article URL using newspaper3k. - - Falls back to HTML scraping with BeautifulSoup if newspaper3k fails. - - Args: - url: URL of the news article to extract text from. - - Returns: - Cleaned text content of the article, or empty string if extraction fails. - """ - # Add random delay before request to avoid rate limiting - time.sleep(random.uniform(1, 3)) - - try: - article = Article(url) - article.download() - article.parse() - if article.text: - return article.text - - # Fallback to HTML scraping if newspaper3k returns empty text - html = self._fetch_original_html(url) - return self._clean_html(html) - - except Exception as e: - LOG.error(f"Failed to extract text with newspaper3k ({url}): {e}") - html = self._fetch_original_html(url) - return self._clean_html(html) - - def _fetch_original_html(self, url: str, timeout: int = 10) -> Optional[str]: - """Fetch raw HTML content from a URL with retries. - - Args: - url: URL to fetch HTML from. - timeout: Request timeout in seconds (default: 10). - - Returns: - Raw HTML content as a string, or None if fetch fails. 
- """ - # More realistic headers that mimic popular browsers - headers_list = [ - { - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " - "(KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36", - "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif," - "image/webp,*/*;q=0.8", - "Accept-Language": "en-US,en;q=0.5", - "Accept-Encoding": "gzip, deflate, br", - "Connection": "keep-alive", - "Upgrade-Insecure-Requests": "1", - "Sec-Fetch-Dest": "document", - "Sec-Fetch-Mode": "navigate", - "Sec-Fetch-Site": "none", - "Sec-Fetch-User": "?1" - }, - { - "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " - "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.5 Safari/605.1.15", - "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", - "Accept-Language": "en-US,en;q=0.5", - "Accept-Encoding": "gzip, deflate, br", - "Connection": "keep-alive", - "Upgrade-Insecure-Requests": "1" - } - ] - retries = 3 - # Use a random header from the list for each request - headers = random.choice(headers_list) - - for attempt in range(retries): - try: - # Add random delay between retries (0.5-2 seconds) - if attempt > 0: - time.sleep(random.uniform(0.5, 2.0)) - - response = requests.get( - url, headers=headers, timeout=timeout, verify=True - ) - response.raise_for_status() - return response.text - except Exception as e: - if attempt < retries - 1: - continue - LOG.error(f"Failed to fetch HTML after {retries} retries ({url}): {e}") - return None - - return None - - def _clean_html(self, html_content: Optional[str]) -> str: - """Clean HTML content to extract readable text. - - Removes scripts, styles, and other non-content elements, then normalizes whitespace. - - Args: - html_content: Raw HTML content to clean. - - Returns: - Cleaned text string, or empty string if input is None/empty. 
- """ - if not html_content: - return "" - - soup = BeautifulSoup(html_content, "html.parser") - - # Remove non-content elements - for element in soup(["script", "style", "iframe", "nav", "aside", "footer"]): - element.decompose() - - # Extract and normalize text - text = soup.get_text() - lines = (line.strip() for line in text.splitlines()) - chunks = (phrase.strip() for line in lines for phrase in line.split(" ")) - return "\n".join(chunk for chunk in chunks if chunk) - if __name__ == "__main__": logging.basicConfig(level=logging.INFO) From feee11239cb2e293a332b671f203c441375100eb Mon Sep 17 00:00:00 2001 From: Lu Ken Date: Tue, 2 Dec 2025 10:51:13 +0800 Subject: [PATCH 10/15] centralize the html download and process action into ArticleContentExtractor class Signed-off-by: Lu Ken --- src/gentrade/scraper/extractor.py | 79 +++++++++++++++++++++++-------- 1 file changed, 58 insertions(+), 21 deletions(-) diff --git a/src/gentrade/scraper/extractor.py b/src/gentrade/scraper/extractor.py index 57a99bd..2fc59a1 100644 --- a/src/gentrade/scraper/extractor.py +++ b/src/gentrade/scraper/extractor.py @@ -10,11 +10,12 @@ import json import logging import os +from pickle import NONE import random import re import time -from typing import Dict, List +from typing import Dict, List, Optional from urllib.parse import urlparse import requests @@ -93,7 +94,9 @@ def save_dummy_patterns(self, dummy_patterns: List[str]): class ArticleContentExtractor: """Handles article content extraction with dummy content filtering.""" - def __init__(self, storage: ScraperStorage): + _instance = None + + def __init__(self, storage: ScraperStorage=None): self.ignored_extensions = ( ".pdf", ".doc", ".docx", ".xls", ".xlsx", ".zip", ".rar", ".jpg", ".png", ".jpeg", ".gif" @@ -105,7 +108,10 @@ def __init__(self, storage: ScraperStorage): "ads by", "sponsored content", "subscribe to access" } + if storage is None: + storage = ScraperStorage() self.storage = storage + self.blocked_domains = self.storage.load_blocked_domains() self.dummy_patterns = self.storage.load_dummy_patterns() @@ -129,7 +135,7 @@ def _get_random_headers(self) -> Dict[str, str]: "Upgrade-Insecure-Requests": "1", } - def _clean_html(self, html: str) -> str: + def clean_html(self, html: str) -> str: """Clean raw HTML by removing non-content elements and ads.""" if not html: return "" @@ -221,31 +227,23 @@ def extract_content(self, url: str) -> str: return "Unsupported file type (non-HTML)" try: - article = Article(url, language="zh") + article = Article(url) article.download() article.parse() - content = article.text.strip() + if article.text: + content = article.text.strip() + else: + # Fallback to HTML scrapping if newspaper3k returns empty text + html = self._fetch_original_html(url) + content = self.clean_html(html) except ArticleException as e: logger.warning( "newspaper3k extraction failed: %s - falling back to HTML cleaning", str(e) ) - try: - headers = self._get_random_headers() - response = requests.get( - url, headers=headers, timeout=10, allow_redirects=True - ) - response.encoding = response.apparent_encoding - - if response.status_code != 200: - logger.warning("Failed to retrieve article (status %s): %s", - response.status_code, url) - return "Failed to retrieve content (HTTP error)" - - content = self._clean_html(response.text) - except requests.exceptions.RequestException as e: - logger.error("Request error for %s: %s", url, str(e)) - return "Failed to retrieve content (network error)" + # Fallback to HTML scrapping if newspaper3k returns empty 
text + html = self._fetch_original_html(url) + content = self.clean_html(html) if self._is_dummy_content(content): logger.warning("Dummy content detected at: %s", url) @@ -254,3 +252,42 @@ def extract_content(self, url: str) -> str: return "Content blocked: Contains cookie notices or irrelevant material" return content + + def _fetch_original_html(self, url: str, timeout: int = 10) -> Optional[str]: + """Fetch raw HTML content from a URL with retries. + + Args: + url: URL to fetch HTML from. + timeout: Request timeout in seconds (default: 10). + + Returns: + Raw HTML content as a string, or None if fetch fails. + """ + + retries = 3 + headers = self._get_random_headers() + + for attempt in range(retries): + try: + # Add random delay between retries (0.5-2 seconds) + if attempt > 0: + time.sleep(random.uniform(0.5, 2.0)) + + response = requests.get( + url, headers=headers, timeout=timeout, verify=True + ) + response.raise_for_status() + return response.text + except Exception as e: + if attempt < retries - 1: + continue + logger.error(f"Failed to fetch HTML after {retries} retries ({url}): {e}") + return None + + return None + + @staticmethod + def inst(storage: ScraperStorage=None): + if ArticleContentExtractor._instance is None: + ArticleContentExtractor._instance = ArticleContentExtractor(storage) + return ArticleContentExtractor._instance \ No newline at end of file From 62bfa9d603b6db9b5641536bfe59d8b0c64c18ce Mon Sep 17 00:00:00 2001 From: Lu Ken Date: Tue, 2 Dec 2025 15:27:13 +0800 Subject: [PATCH 11/15] Makefile: check pylint version before run make line instead of installing it everytime Signed-off-by: Lu Ken --- Makefile | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/Makefile b/Makefile index cf24f20..2fe8813 100644 --- a/Makefile +++ b/Makefile @@ -1,19 +1,29 @@ CURRENT_DIR := $(shell pwd) +PYTHON_VERSION := "3.11" +PYTHON_VERSION_OK := $(shell python -c "import sys; print(sys.version_info >= (3,11) and sys.version_info < (3,12))") + # Pylint configuration PYLINT_ARGS = --rcfile=.pylintrc -PYLINT_VERSION = pylint==4.0.4 +PYLINT_VERSION = 4.0.4 +PYLINT_INSTALLED := $(shell python -c "import pkg_resources; print('pylint' in {pkg.key for pkg in pkg_resources.working_set})" 2>/dev/null || echo "False") +PYLINT_VERSION_OK := $(shell python -c "import pylint; print(pylint.__version__ == '$(PYLINT_VERSION)')" 2>/dev/null || echo "False") -REQUIRED_PYTHON_VERSION := "3.11" -PYTHON_VERSION_OK := $(shell python -c "import sys; print(sys.version_info >= (3,11) and sys.version_info < (3,12))") # Default target all: lint +ensure-pylint: + @if [ "$(PYLINT_INSTALLED)" = "False" ] || [ "$(PYLINT_VERSION_OK)" = "False" ]; then \ + echo "Installing/upgrading pylint to version $(PYLINT_VERSION)..."; \ + pip install pylint=="$(PYLINT_VERSION)"; \ + else \ + echo "✅ pylint $(PYLINT_VERSION) is already installed"; \ + fi + # Lint all Python files -lint: - pip install $(PYLINT_VERSION) +lint: check-python ensure-pylint @echo "Running Pylint ..." find . -type f -name "*.py" | xargs pylint $(PYLINT_ARGS) @@ -23,16 +33,16 @@ lint-file: echo "Usage: make lint-file FILE=path/to/file.py"; \ exit 1; \ fi - pip install $(PYLINT_VERSION) + pip install pylint=="$(PYLINT_VERSION)" pylint $(PYLINT_ARGS) $(FILE) # Check python version check-python: @echo "Checking Python version..." 
@if [ "$(PYTHON_VERSION_OK)" = "True" ]; then \ - echo "✅ Python version is $(REQUIRED_PYTHON_VERSION) (compatible)"; \ + echo "✅ Python version is $(PYTHON_VERSION) (compatible)"; \ else \ - echo "❌ Error: Python $(REQUIRED_PYTHON_VERSION) is required (found $(shell python --version | cut -d' ' -f2))"; \ + echo "❌ Error: Python $(PYTHON_VERSION) is required (found $(shell python --version | cut -d' ' -f2))"; \ exit 1; \ fi @@ -43,4 +53,4 @@ clean: find $(CURRENT_DIR) -type f -name "*.pyc" -delete find $(CURRENT_DIR) -type f -name ".pylint-cache" -exec rm -rf {} + -.PHONY: all lint lint-file clean check-python +.PHONY: all lint lint-file clean check-python ensure-pylint From b7b295e1ea99f070e23ec1270453538a8f7d1fcc Mon Sep 17 00:00:00 2001 From: Lu Ken Date: Tue, 2 Dec 2025 17:05:56 +0800 Subject: [PATCH 12/15] refactory Signed-off-by: Lu Ken --- Makefile | 112 +++++++++++++++--------------- src/gentrade/news/factory.py | 6 +- src/gentrade/scraper/extractor.py | 3 +- 3 files changed, 58 insertions(+), 63 deletions(-) diff --git a/Makefile b/Makefile index 2fe8813..de3b8e6 100644 --- a/Makefile +++ b/Makefile @@ -1,56 +1,56 @@ - -CURRENT_DIR := $(shell pwd) - -PYTHON_VERSION := "3.11" -PYTHON_VERSION_OK := $(shell python -c "import sys; print(sys.version_info >= (3,11) and sys.version_info < (3,12))") - -# Pylint configuration -PYLINT_ARGS = --rcfile=.pylintrc -PYLINT_VERSION = 4.0.4 -PYLINT_INSTALLED := $(shell python -c "import pkg_resources; print('pylint' in {pkg.key for pkg in pkg_resources.working_set})" 2>/dev/null || echo "False") -PYLINT_VERSION_OK := $(shell python -c "import pylint; print(pylint.__version__ == '$(PYLINT_VERSION)')" 2>/dev/null || echo "False") - - -# Default target -all: lint - -ensure-pylint: - @if [ "$(PYLINT_INSTALLED)" = "False" ] || [ "$(PYLINT_VERSION_OK)" = "False" ]; then \ - echo "Installing/upgrading pylint to version $(PYLINT_VERSION)..."; \ - pip install pylint=="$(PYLINT_VERSION)"; \ - else \ - echo "✅ pylint $(PYLINT_VERSION) is already installed"; \ - fi - -# Lint all Python files -lint: check-python ensure-pylint - @echo "Running Pylint ..." - find . -type f -name "*.py" | xargs pylint $(PYLINT_ARGS) - -# Lint specific file(s) -lint-file: - @if [ -z "$(FILE)" ]; then \ - echo "Usage: make lint-file FILE=path/to/file.py"; \ - exit 1; \ - fi - pip install pylint=="$(PYLINT_VERSION)" - pylint $(PYLINT_ARGS) $(FILE) - -# Check python version -check-python: - @echo "Checking Python version..." - @if [ "$(PYTHON_VERSION_OK)" = "True" ]; then \ - echo "✅ Python version is $(PYTHON_VERSION) (compatible)"; \ - else \ - echo "❌ Error: Python $(PYTHON_VERSION) is required (found $(shell python --version | cut -d' ' -f2))"; \ - exit 1; \ - fi - -# Clean up -clean: - @echo "Cleaning in: $(CURRENT_DIR)..." 
- find $(CURRENT_DIR) -type d -name "__pycache__" -exec rm -rf {} + - find $(CURRENT_DIR) -type f -name "*.pyc" -delete - find $(CURRENT_DIR) -type f -name ".pylint-cache" -exec rm -rf {} + - -.PHONY: all lint lint-file clean check-python ensure-pylint + +CURRENT_DIR := $(shell pwd) + +PYTHON_VERSION := "3.11" +PYTHON_VERSION_OK := $(shell python -c "import sys; print(sys.version_info >= (3,11) and sys.version_info < (3,12))") + +# Pylint configuration +PYLINT_ARGS = --rcfile=.pylintrc +PYLINT_VERSION = 4.0.4 +PYLINT_INSTALLED := $(shell python -c "import pkg_resources; print('pylint' in {pkg.key for pkg in pkg_resources.working_set})" 2>/dev/null || echo "False") +PYLINT_VERSION_OK := $(shell python -c "import pylint; print(pylint.__version__ == '$(PYLINT_VERSION)')" 2>/dev/null || echo "False") + +# Default target +all: lint + +ensure-pylint: + @if [ "$(PYLINT_INSTALLED)" = "False" ] || [ "$(PYLINT_VERSION_OK)" = "False" ]; then \ + echo "Installing/upgrading pylint to version $(PYLINT_VERSION)..."; \ + pip install pylint=="$(PYLINT_VERSION)"; \ + else \ + echo "✅ pylint $(PYLINT_VERSION) is already installed"; \ + fi + +# Lint all Python files +lint: check-python ensure-pylint + @echo "Running Pylint ..." + @export PYTHONPATH=$(CURRENT_DIR)/src + find . -type f -name "*.py" | xargs pylint $(PYLINT_ARGS) + +# Lint specific file(s) +lint-file: + @if [ -z "$(FILE)" ]; then \ + echo "Usage: make lint-file FILE=path/to/file.py"; \ + exit 1; \ + fi + pip install pylint=="$(PYLINT_VERSION)" + pylint $(PYLINT_ARGS) $(FILE) + +# Check python version +check-python: + @echo "Checking Python version..." + @if [ "$(PYTHON_VERSION_OK)" = "True" ]; then \ + echo "✅ Python version is $(PYTHON_VERSION) (compatible)"; \ + else \ + echo "❌ Error: Python $(PYTHON_VERSION) is required (found $(shell python --version | cut -d' ' -f2))"; \ + exit 1; \ + fi + +# Clean up +clean: + @echo "Cleaning in: $(CURRENT_DIR)..." 
+ find $(CURRENT_DIR) -type d -name "__pycache__" -exec rm -rf {} + + find $(CURRENT_DIR) -type f -name "*.pyc" -delete + find $(CURRENT_DIR) -type f -name ".pylint-cache" -exec rm -rf {} + + +.PHONY: all lint lint-file clean check-python ensure-pylint diff --git a/src/gentrade/news/factory.py b/src/gentrade/news/factory.py index 81f4917..4fb9e60 100644 --- a/src/gentrade/news/factory.py +++ b/src/gentrade/news/factory.py @@ -10,16 +10,12 @@ import logging import time import threading -import random from typing import List, Optional, Set from urllib.parse import urlparse # Add this to extract domain from URL -import requests from gentrade.scraper.extractor import ArticleContentExtractor -from newspaper import Article -from bs4 import BeautifulSoup -from gentrade.news.meta import NewsInfo, NewsProviderBase, NewsDatabase +from gentrade.news.meta import NewsProviderBase, NewsDatabase from gentrade.news.googlenews import GoogleNewsProvider from gentrade.news.newsapi import NewsApiProvider from gentrade.news.rss import RssProvider diff --git a/src/gentrade/scraper/extractor.py b/src/gentrade/scraper/extractor.py index 2fc59a1..12a91b8 100644 --- a/src/gentrade/scraper/extractor.py +++ b/src/gentrade/scraper/extractor.py @@ -10,7 +10,6 @@ import json import logging import os -from pickle import NONE import random import re import time @@ -290,4 +289,4 @@ def _fetch_original_html(self, url: str, timeout: int = 10) -> Optional[str]: def inst(storage: ScraperStorage=None): if ArticleContentExtractor._instance is None: ArticleContentExtractor._instance = ArticleContentExtractor(storage) - return ArticleContentExtractor._instance \ No newline at end of file + return ArticleContentExtractor._instance From bb3035e2b4c0ba9cbc70ddae1b90da5756fd718e Mon Sep 17 00:00:00 2001 From: Lu Ken Date: Tue, 2 Dec 2025 17:22:59 +0800 Subject: [PATCH 13/15] remove Google news provider Signed-off-by: Lu Ken --- src/gentrade/news/factory.py | 71 +------------ src/gentrade/news/googlenews.py | 158 --------------------------- tests/test_api_google_news.py | 182 -------------------------------- tests/test_gentrade_news.py | 59 +---------- 4 files changed, 6 insertions(+), 464 deletions(-) delete mode 100644 src/gentrade/news/googlenews.py delete mode 100644 tests/test_api_google_news.py diff --git a/src/gentrade/news/factory.py b/src/gentrade/news/factory.py index 4fb9e60..0ed7626 100644 --- a/src/gentrade/news/factory.py +++ b/src/gentrade/news/factory.py @@ -10,13 +10,11 @@ import logging import time import threading -from typing import List, Optional, Set -from urllib.parse import urlparse # Add this to extract domain from URL +from typing import List, Optional from gentrade.scraper.extractor import ArticleContentExtractor from gentrade.news.meta import NewsProviderBase, NewsDatabase -from gentrade.news.googlenews import GoogleNewsProvider from gentrade.news.newsapi import NewsApiProvider from gentrade.news.rss import RssProvider from gentrade.news.finnhub import FinnhubNewsProvider @@ -36,8 +34,7 @@ def create_provider(provider_type: str, **kwargs) -> NewsProviderBase: """Create a news provider instance based on the specified provider type. Args: - provider_type: Type of news provider. Supported values: "newsapi", "finnhub", - "google", "rss". + provider_type: Type of news provider. Supported values: "newsapi", "finnhub", "rss". ** kwargs: Additional keyword arguments for provider initialization (e.g., feed_url for RSS providers). 
@@ -52,7 +49,6 @@ def create_provider(provider_type: str, **kwargs) -> NewsProviderBase: providers = { "newsapi": NewsApiProvider, "finnhub": FinnhubNewsProvider, - "google": GoogleNewsProvider, "rss": RssProvider } @@ -72,15 +68,6 @@ def create_provider(provider_type: str, **kwargs) -> NewsProviderBase: raise ValueError("FINNHUB_API_KEY environment variable not set") return provider_class(api_key=api_key) - if provider_type_lower == "google": - api_key = os.getenv("GOOGLE_CLOUD_API_KEY") - cse_id = os.getenv("GOOGLE_CSE_ID") - if not api_key or not cse_id: - raise ValueError( - "GOOGLE_CLOUD_API_KEY or GOOGLE_CSE_ID environment variable not set" - ) - return provider_class(api_key=api_key, cse_id=cse_id) - if provider_type_lower == "rss": feed_url = kwargs.get("feed_url", os.getenv("RSS_FEED_URL")) return provider_class(feed_url=feed_url) @@ -106,32 +93,8 @@ def __init__(self, providers: List[NewsProviderBase], db: NewsDatabase): self.db = db self.db_lock = threading.Lock() - # 1. Add blocklist (stores blocked domain names, e.g., "example.com") - self.blocklist: Set[str] = set() - - # 2. Add dummy content keywords (expand this list based on your needs) - self.dummy_keywords = { - "we use cookies", "cookie policy", "analyze website traffic", - "accept cookies", "reject cookies", "by continuing to use", - "this website uses cookies", "improve user experience", - "ads by", "sponsored content", "subscribe to access" - } - #self.blocklist = self._load_blocklist() - - def _load_blocklist(self) -> Set[str]: - try: - with open("news_blocklist.txt", "r", encoding="utf-8") as f: - return set(line.strip() for line in f if line.strip()) - except FileNotFoundError: - return set() - - def _save_blocklist(self) -> None: - with open("news_blocklist.txt", "w", encoding="utf-8") as f: - for domain in self.blocklist: - f.write(f"{domain}\n") - def _fetch_thread(self, provider, aggregator, ticker, category, - max_hour_interval, max_count, is_process=False): + max_hour_interval, max_count, is_process=True): if ticker: news = provider.fetch_stock_news( ticker, category, max_hour_interval, max_count @@ -199,31 +162,6 @@ def sync_news( self.db.last_sync = current_time LOG.info("News sync completed.") - def _is_blocked(self, url: str) -> bool: - """Check if the website of the URL is in the blocklist""" - domain = self._extract_domain(url) - if domain in self.blocklist: - LOG.info(f"Skipping blocked website: {domain} (URL: {url})") - return True - return False - - def _extract_domain(self, url: str) -> str: - """Extract the main domain from a URL - (e.g., "https://www.example.com/news" → "example.com") - """ - try: - parsed = urlparse(url) - # Split subdomains (e.g., "www.example.co.uk" → "example.co.uk" for common TLDs) - domain_parts = parsed.netloc.split(".") - # Handle cases like "co.uk" (adjust based on your target regions) - if len(domain_parts) >= 3 and domain_parts[-2] in ["co", "com", "org", "net"]: - return ".".join(domain_parts[-3:]) - return ".".join(domain_parts[-2:]) - except Exception as e: - LOG.error(f"Failed to extract domain from {url}: {e}") - return url # Fallback to full URL if parsing fails - - if __name__ == "__main__": logging.basicConfig(level=logging.INFO) db = NewsDatabase() @@ -232,11 +170,10 @@ def _extract_domain(self, url: str) -> str: # Initialize providers using the factory newsapi_provider = NewsFactory.create_provider("newsapi") finnhub_provider = NewsFactory.create_provider("finnhub") - google_provider = NewsFactory.create_provider("google") rss_provider = 
NewsFactory.create_provider("rss") # Create aggregator with selected providers - aggregator = NewsAggregator(providers=[newsapi_provider], db=db) + aggregator = NewsAggregator(providers=[rss_provider], db=db) # Sync market news and stock-specific news aggregator.sync_news(category="business", max_hour_interval=64, max_count=10) diff --git a/src/gentrade/news/googlenews.py b/src/gentrade/news/googlenews.py deleted file mode 100644 index 87f1193..0000000 --- a/src/gentrade/news/googlenews.py +++ /dev/null @@ -1,158 +0,0 @@ -"""Google Custom Search (GCS) news provider for financial news retrieval. - -Implements the NewsProviderBase abstract class to fetch general market news and stock-specific -news via Google's Custom Search API. Supports filtering by time interval, article count, -region, and language, while formatting results into standardized NewsInfo objects. -""" - -import logging -import time -from typing import List - -import requests - -from gentrade.news.meta import NewsInfo, NewsProviderBase - -LOG = logging.getLogger(__name__) - - -class GoogleNewsProvider(NewsProviderBase): - """News provider using Google Custom Search API to retrieve financial news. - - Authenticates with Google Cloud API key and Custom Search Engine (CSE) ID. Fetches - market-wide or stock-specific news, with built-in filtering for recency and result count. - """ - - def __init__(self, api_key: str, cse_id: str): - """Initialize GoogleNewsProvider with required authentication credentials. - - Args: - api_key: Google Cloud API key for Custom Search request authentication. - cse_id: Google Custom Search Engine (CSE) ID configured for news retrieval. - """ - self.api_key = api_key - self.cse_id = cse_id - self.base_url = "https://www.googleapis.com/customsearch/v1" - - def fetch_latest_market_news( - self, - category: str = "business", - max_hour_interval: int = 24, - max_count: int = 10 - ) -> List[NewsInfo]: - """Fetch latest general market news via Google Custom Search. - - Retrieves financial market news from the last `max_hour_interval` hours, limited to - `max_count` articles, and assigns the specified category. - - Args: - category: Category label for fetched news (default: "business"). - max_hour_interval: Maximum age (in hours) of articles to retrieve (default: 24). - max_count: Maximum number of articles to return (default: 10). - - Returns: - List of NewsInfo objects with formatted market news; empty list if fetch fails - or no results exist. 
- """ - params = { - "key": self.api_key, - "cx": self.cse_id, - "q": "finance stock market", # Core query for market news - "num": max_count, - "dateRestrict": f"h{max_hour_interval}", # Filter by recent hours - "gl": "us", # Focus on US region results - "lr": "lang_en", # Restrict to English language - "siteSearch": "news.google.com", # Limit to Google News sources - "siteSearchFilter": "i" # Exclude duplicate results - } - - try: - response = requests.get(self.base_url, params=params, timeout=10) - response.raise_for_status() # Raise error for HTTP status codes ≥400 - items = response.json().get("items", []) # Extract articles from response - - # Convert API response to standardized NewsInfo objects - news_list = [ - NewsInfo( - category=category, - datetime=int(time.time()), # Google CSE lacks article timestamp - headline=item.get("title", ""), - id=self.url_to_hash_id(item.get("link", "")), - image=item.get("pagemap", {}).get("cse_image", [{}])[0].get("src", ""), - related="", # No stock ticker for general market news - source=item.get("displayLink", ""), # Source domain (e.g., "bloomberg.com") - summary=item.get("snippet", ""), # Short article preview - url=item.get("link", ""), # Direct article URL - content="", # Content extracted later by aggregator - provider='google', - market='us' - ) - for item in items - ] - - return self._filter_news(news_list, max_hour_interval, max_count) - - except requests.RequestException as e: - LOG.debug(f"Failed to fetch market news from Google Custom Search: {e}") - return [] - - def fetch_stock_news( - self, - ticker: str, - category: str = "business", - max_hour_interval: int = 24, - max_count: int = 10 - ) -> List[NewsInfo]: - """Fetch stock-specific news for a given ticker via Google Custom Search. - - Retrieves news related to the specified stock ticker from the last `max_hour_interval` - hours, limited to `max_count` articles, and assigns the specified category. - - Args: - ticker: Stock ticker symbol (e.g., "AAPL") to fetch news for. - category: Category label for fetched news (default: "business"). - max_hour_interval: Maximum age (in hours) of articles to retrieve (default: 24). - max_count: Maximum number of articles to return (default: 10). - - Returns: - List of NewsInfo objects with formatted stock news; empty list if fetch fails - or no results exist. 
- """ - params = { - "key": self.api_key, - "cx": self.cse_id, - "q": f"{ticker} stock news", # Ticker-specific query - "num": max_count, - "dateRestrict": f"h{max_hour_interval}", # Filter by recent hours - "sort": "date" # Sort results by most recent first - } - - try: - response = requests.get(self.base_url, params=params, timeout=10) - response.raise_for_status() - items = response.json().get("items", []) - - # Convert API response to standardized NewsInfo objects - news_list = [ - NewsInfo( - category=category, - datetime=int(time.time()), # Google CSE lacks article timestamp - headline=item.get("title", ""), - id=hash(item.get("link", "")), # Unique ID from URL - image=item.get("pagemap", {}).get("cse_image", [{}])[0].get("src", ""), - related=ticker, # Associate with target stock ticker - source=item.get("displayLink", ""), - summary=item.get("snippet", ""), - url=item.get("link", ""), - content="", # Content extracted later - provider='google', - market='us' - ) - for item in items - ] - - return self._filter_news(news_list, max_hour_interval, max_count) - - except requests.RequestException as e: - LOG.debug(f"Failed to fetch {ticker} stock news from Google Custom Search: {e}") - return [] diff --git a/tests/test_api_google_news.py b/tests/test_api_google_news.py deleted file mode 100644 index 0596d23..0000000 --- a/tests/test_api_google_news.py +++ /dev/null @@ -1,182 +0,0 @@ -""" -Google News API Test Suite - -This module contains pytest tests to verify functionality of Google's Custom Search API -for retrieving financial and stock-related news. It includes tests for: -- API credential validation -- General financial news retrieval -- Specific stock symbol news retrieval - -Requirements: -- Valid Google Cloud API key with Custom Search API enabled: GOOGLE_API_KEY -- Custom Search Engine ID (CX) configured for news search: CX -- Environment variables stored in .env file -""" - -import os -import requests -import pytest -from dotenv import load_dotenv - -# Load environment variables from .env file -load_dotenv() - -@pytest.fixture(scope="module") -def api_credentials(): - """ - Fixture to validate and provide API credentials. - - Retrieves Google API key and Custom Search Engine ID from environment variables - and performs basic validation. Fails if any credential is missing. - - Returns: - dict: Contains valid API credentials with keys 'api_key' and 'cx' - """ - api_key = os.getenv("GOOGLE_API_KEY") - cx = os.getenv("GOOGLE_CX") - - error_messages = [] - if not api_key: - error_messages.append( - "GOOGLE_API_KEY not found in environment variables. " - "Please check your .env file." - ) - if not cx: - error_messages.append( - "GOOGLE_CX (Custom Search Engine ID) not found. " - "Please check your .env file." - ) - - assert not error_messages, "\n".join(error_messages) - - return { - "api_key": api_key, - "cx": cx - } - - -def test_api_credentials_work(api_credentials): - """ - Test if API credentials are valid and functional. - - Performs a basic test query to verify that the provided API key and - CX ID can successfully authenticate with the Google Custom Search API. - Provides detailed error messages for common authentication issues. 
- """ - url = "https://www.googleapis.com/customsearch/v1" - test_params = { - "key": api_credentials["api_key"], - "cx": api_credentials["cx"], - "q": "test query", - "num": 1 - } - - response = requests.get(url, params=test_params, timeout=30) - - # Handle common authentication errors with detailed guidance - if response.status_code == 403: - pytest.fail( - "403 Forbidden: Invalid credentials or insufficient permissions.\n" - "Possible fixes:\n" - "1. Verify your API key is correct in .env\n" - "2. Ensure Custom Search API is enabled in Google Cloud Console\n" - "3. Check if your API key has IP restrictions that block this request\n" - "4. Confirm your project has billing enabled (required for production use)\n" - f"API Response: {response.text}" - ) - elif response.status_code == 400: - pytest.fail( - f"400 Bad Request: Invalid parameters. Check your CX ID.\n" - f"API Response: {response.text}" - ) - - assert response.status_code == 200, \ - f"API request failed with status code {response.status_code}. Response: {response.text}" - - -def test_get_latest_financial_news(api_credentials): - """ - Test retrieval of latest financial news from Google News. - - Queries the Custom Search API for recent financial news (past 1 week) - and validates the structure and content of returned results. - """ - url = "https://www.googleapis.com/customsearch/v1" - params = { - "key": api_credentials["api_key"], - "cx": api_credentials["cx"], - "q": "finance stock market", - "num": 10, - "dateRestrict": "w1", # Restrict results to past 1 week - "gl": "us", # Focus on United States results - "lr": "lang_en", # Restrict to English language - "siteSearch": "news.google.com", # Search only Google News - "siteSearchFilter": "i" # Include only specified sites - } - - # Execute API request - response = requests.get(url, params=params, timeout=30) - - # Handle specific API errors - if response.status_code == 403: - pytest.fail(f"403 Forbidden: Check API key and permissions. Response: {response.text}") - if response.status_code == 429: - pytest.fail(f"429 Too Many Requests: API quota exceeded. Response: {response.text}") - - # Verify successful response - assert response.status_code == 200, \ - f"API request failed with status code {response.status_code}. Response: {response.text}" - - # Parse and validate response content - results = response.json() - - assert "items" in results, f"No news items found. API response: {results}" - assert len(results["items"]) > 0, "No articles returned from Google News" - - # Validate individual news articles - for item in results["items"][:3]: # Check first 3 articles - assert "title" in item, "News item missing title" - assert "link" in item, "News item missing URL" - assert "snippet" in item, "News item missing snippet" - - -def test_get_specific_stock_news(api_credentials): - """ - Test retrieval of news for specific stock symbols. - - Queries the Custom Search API for news related to major tech stocks - and verifies that returned articles mention the target stock symbol. 
- """ - stock_symbols = ["AAPL", "MSFT", "GOOGL"] - url = "https://www.googleapis.com/customsearch/v1" - - for symbol in stock_symbols: - params = { - "key": api_credentials["api_key"], - "cx": api_credentials["cx"], - "q": f"{symbol} stock news", - "num": 5, - "dateRestrict": "w1", # Restrict to past 1 week - "gl": "us", - "lr": "lang_en", - "siteSearch": "news.google.com", - "siteSearchFilter": "i" - } - - response = requests.get(url, params=params, timeout=30) - - if response.status_code == 403: - pytest.fail(f"403 Forbidden for {symbol}: Check API key and permissions") - - assert response.status_code == 200, \ - f"Failed to get news for {symbol} (status code {response.status_code})" - - results = response.json() - - # Validate stock symbol appears in results when available - if "items" in results and len(results["items"]) > 0: - symbol_in_results = any( - symbol in item["title"].upper() or symbol in item.get("snippet", "").upper() - for item in results["items"] - ) - assert symbol_in_results, f"No results mentioning {symbol} found" diff --git a/tests/test_gentrade_news.py b/tests/test_gentrade_news.py index 52e000b..0fd2859 100644 --- a/tests/test_gentrade_news.py +++ b/tests/test_gentrade_news.py @@ -7,7 +7,6 @@ from gentrade.news.meta import NewsProviderBase from gentrade.news.newsapi import NewsApiProvider from gentrade.news.finnhub import FinnhubNewsProvider -from gentrade.news.googlenews import GoogleNewsProvider from gentrade.news.rss import RssProvider @@ -43,24 +42,6 @@ def test_create_finnhub_missing_key(self): NewsFactory.create_provider("finnhub") assert "FINNHUB_API_KEY" in str(excinfo.value) - @patch.dict(os.environ, { - "GOOGLE_CLOUD_API_KEY": "test_google_key", - "GOOGLE_CSE_ID": "test_cse_id" - }) - def test_create_google_provider(self): - """Test Google News provider creation with valid env vars""" - provider = NewsFactory.create_provider("google") - assert isinstance(provider, GoogleNewsProvider) - assert provider.api_key == "test_google_key" - assert provider.cse_id == "test_cse_id" - - def test_create_google_missing_credentials(self): - """Test Google creation fails with missing credentials""" - with patch.dict(os.environ, {}, clear=True): - with pytest.raises(ValueError) as excinfo: - NewsFactory.create_provider("google") - assert "GOOGLE_CSE_ID" in str(excinfo.value) - def test_create_rss_provider_with_feed_url(self): """Test RSS provider creation with explicit feed URL""" feed_url = "https://test-feed.com/rss" @@ -94,10 +75,6 @@ class TestNewsProvidersCommon: @pytest.fixture(params=[ ("newsapi", NewsApiProvider, {"NEWSAPI_API_KEY": "test_key"}), ("finnhub", FinnhubNewsProvider, {"FINNHUB_API_KEY": "test_key"}), - ("google", GoogleNewsProvider, { - "GOOGLE_CLOUD_API_KEY": "test_key", - "GOOGLE_CSE_ID": "test_id" - }), ("rss", RssProvider, {}) ]) def provider_setup(self, request): @@ -128,8 +105,6 @@ def test_fetch_market_news_returns_list(self, provider_setup): mock_response.json.return_value = {"articles": []} elif provider_setup[0] == "finnhub": mock_response.json.return_value = [] - elif provider_setup[0] == "google": - mock_response.json.return_value = {"items": []} elif provider_setup[0] == "rss": pass # Handled in RSS specific tests @@ -152,9 +127,6 @@ def test_fetch_stock_news_returns_list(self, provider_setup): elif provider_type == "finnhub": # Finnhub returns list directly mock_response.json.return_value = [] - elif provider_type == "google": - # Google returns {"items": [...]} - mock_response.json.return_value = {"items": []} elif provider_type == 
"rss": # RSS uses feedparser, handled separately pass @@ -190,29 +162,6 @@ def test_fetch_market_news_params(self, mock_get, newsapi_provider): assert "from" in params -class TestGoogleNewsProvider: - """Google News-specific test cases""" - - @pytest.fixture - def google_provider(self): - with patch.dict(os.environ, { - "GOOGLE_CLOUD_API_KEY": "test_key", - "GOOGLE_CSE_ID": "test_id" - }): - return NewsFactory.create_provider("google") - - @patch("gentrade.news.googlenews.requests.get") - def test_fetch_stock_news_query(self, mock_get, google_provider): - """Test Google News uses correct stock query""" - mock_get.return_value = Mock(status_code=200, json=lambda: {"items": []}) - google_provider.fetch_stock_news(ticker="MSFT", max_count=3) - - _, kwargs = mock_get.call_args - params = kwargs["params"] - assert params["q"] == "MSFT stock news" - assert params["num"] == 3 - - class TestRssProvider: """RSS Provider-specific test cases""" @@ -268,17 +217,13 @@ def test_company_news_endpoint(self, mock_get, finnhub_provider): class TestProviderErrorHandling: """Tests for provider error handling""" - @pytest.fixture(params=["newsapi", "finnhub", "google"]) + @pytest.fixture(params=["newsapi", "finnhub"]) def api_provider(self, request): """Fixture for API-based providers (non-RSS)""" provider_type = request.param env_vars = { "newsapi": {"NEWSAPI_API_KEY": "test"}, - "finnhub": {"FINNHUB_API_KEY": "test"}, - "google": { - "GOOGLE_CLOUD_API_KEY": "test", - "GOOGLE_CSE_ID": "test" - } + "finnhub": {"FINNHUB_API_KEY": "test"} }[provider_type] with patch.dict(os.environ, env_vars): From 433b07de649e88bdf6921013a15195698f116f6b Mon Sep 17 00:00:00 2001 From: Lu Ken Date: Tue, 2 Dec 2025 18:02:02 +0800 Subject: [PATCH 14/15] use loguru instead of logging Signed-off-by: Lu Ken --- requirements.txt | 1 + src/gentrade/news/factory.py | 29 +++++++++++++---------------- src/gentrade/news/finnhub.py | 9 +++------ src/gentrade/news/meta.py | 7 ++----- src/gentrade/news/newsapi.py | 12 ++++-------- src/gentrade/news/rss.py | 14 ++++++-------- 6 files changed, 29 insertions(+), 43 deletions(-) diff --git a/requirements.txt b/requirements.txt index ed82156..65c6d97 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,6 +8,7 @@ backtrader ag2 mplfinance ntplib +loguru langchain_openai langchain_core diff --git a/src/gentrade/news/factory.py b/src/gentrade/news/factory.py index 0ed7626..233c62f 100644 --- a/src/gentrade/news/factory.py +++ b/src/gentrade/news/factory.py @@ -7,10 +7,10 @@ """ import os -import logging import time import threading from typing import List, Optional +from loguru import logger from gentrade.scraper.extractor import ArticleContentExtractor @@ -19,8 +19,6 @@ from gentrade.news.rss import RssProvider from gentrade.news.finnhub import FinnhubNewsProvider -LOG = logging.getLogger(__name__) - class NewsFactory: """Factory class for creating news provider instances based on provider type. 
@@ -99,7 +97,7 @@ def _fetch_thread(self, provider, aggregator, ticker, category, news = provider.fetch_stock_news( ticker, category, max_hour_interval, max_count ) - LOG.info( + logger.info( f"Fetched {len(news)} stock news articles for {ticker} from " f"{provider.__class__.__name__}" ) @@ -107,7 +105,7 @@ def _fetch_thread(self, provider, aggregator, ticker, category, news = provider.fetch_latest_market_news( category, max_hour_interval, max_count ) - LOG.info( + logger.info( f"Fetched {len(news)} market news articles from " f"{provider.__class__.__name__}" ) @@ -117,7 +115,7 @@ def _fetch_thread(self, provider, aggregator, ticker, category, item.summary = ace.clean_html(item.summary) if is_process: item.content = ace.extract_content(item.url) - LOG.info(item.content) + logger.info(item.content) with aggregator.db_lock: aggregator.db.add_news(news) @@ -142,10 +140,10 @@ def sync_news( """ current_time = time.time() if current_time < self.db.last_sync + 3600: - LOG.info("Skipping sync: Last sync was less than 1 hour ago.") + logger.info("Skipping sync: Last sync was less than 1 hour ago.") return - LOG.info("Starting news sync...") + logger.info("Starting news sync...") threads = [] for provider in self.providers: @@ -160,10 +158,9 @@ def sync_news( thread.join() self.db.last_sync = current_time - LOG.info("News sync completed.") + logger.info("News sync completed.") if __name__ == "__main__": - logging.basicConfig(level=logging.INFO) db = NewsDatabase() try: @@ -186,18 +183,18 @@ def sync_news( # Log results all_news = db.get_all_news() - LOG.info(f"Total articles in database: {len(all_news)}") + logger.info(f"Total articles in database: {len(all_news)}") if all_news: - LOG.info("Example article:") - LOG.info(all_news[0].to_dict()) + logger.info("Example article:") + logger.info(all_news[0].to_dict()) for news_item in all_news: - LOG.info("--------------------------------") + logger.info("--------------------------------") print(news_item.headline) print(news_item.url) print(news_item.content) - LOG.info("--------------------------------") + logger.info("--------------------------------") except ValueError as e: - LOG.error(f"Error during news aggregation: {e}") + logger.error(f"Error during news aggregation: {e}") diff --git a/src/gentrade/news/finnhub.py b/src/gentrade/news/finnhub.py index d51da79..14fb5cc 100644 --- a/src/gentrade/news/finnhub.py +++ b/src/gentrade/news/finnhub.py @@ -5,17 +5,14 @@ and news specific to individual stock tickers, with filtering by time interval and article count. """ -import logging import time from typing import List from datetime import datetime, timedelta import requests +from loguru import logger from gentrade.news.meta import NewsInfo, NewsProviderBase -LOG = logging.getLogger(__name__) - - class FinnhubNewsProvider(NewsProviderBase): """News provider implementation for fetching news via the Finnhub.io API. 
@@ -87,7 +84,7 @@ def fetch_latest_market_news( return self._filter_news(news_list, max_hour_interval, max_count) except requests.RequestException as e: - LOG.debug(f"Error fetching market news from Finnhub: {e}") + logger.debug(f"Error fetching market news from Finnhub: {e}") return [] def fetch_stock_news( @@ -146,5 +143,5 @@ def fetch_stock_news( return self._filter_news(news_list, max_hour_interval, max_count) except requests.RequestException as e: - LOG.debug(f"Error fetching stock news from Finnhub: {e}") + logger.debug(f"Error fetching stock news from Finnhub: {e}") return [] diff --git a/src/gentrade/news/meta.py b/src/gentrade/news/meta.py index 4b59d1c..088f350 100644 --- a/src/gentrade/news/meta.py +++ b/src/gentrade/news/meta.py @@ -9,17 +9,14 @@ """ import abc -import logging import time import hashlib from typing import Dict, List, Any, Optional from datetime import datetime from dataclasses import dataclass - +from loguru import logger import requests -LOG = logging.getLogger(__name__) - NEWS_MARKET = [ 'us', 'zh', 'hk', 'cypto', 'common' ] @@ -79,7 +76,7 @@ def fetch_article_html(self) -> Optional[str]: response.raise_for_status() return response.text except requests.RequestException as e: - LOG.debug(f"Failed to fetch HTML for {self.url}: {e}") + logger.debug(f"Failed to fetch HTML for {self.url}: {e}") return None diff --git a/src/gentrade/news/newsapi.py b/src/gentrade/news/newsapi.py index 67a8ca4..51e6c05 100644 --- a/src/gentrade/news/newsapi.py +++ b/src/gentrade/news/newsapi.py @@ -5,17 +5,13 @@ article count, and language, while formatting results into standardized NewsInfo objects. """ -import logging from typing import List from datetime import datetime, timedelta - import requests +from loguru import logger from gentrade.news.meta import NewsInfo, NewsProviderBase -LOG = logging.getLogger(__name__) - - class NewsApiProvider(NewsProviderBase): """News provider that uses NewsAPI.org to fetch financial and stock-specific news. @@ -92,10 +88,10 @@ def fetch_latest_market_news( return self._filter_news(news_list, max_hour_interval, max_count) except requests.RequestException as e: - LOG.debug(f"Failed to fetch market news from NewsAPI.org: {e}") + logger.debug(f"Failed to fetch market news from NewsAPI.org: {e}") return [] except Exception as e: - LOG.debug(f"Unexpected error: {e}") + logger.debug(f"Unexpected error: {e}") return [] def fetch_stock_news( @@ -159,5 +155,5 @@ def fetch_stock_news( return self._filter_news(news_list, max_hour_interval, max_count) except requests.RequestException as e: - LOG.debug(f"Failed to fetch {ticker} stock news from NewsAPI.org: {e}") + logger.debug(f"Failed to fetch {ticker} stock news from NewsAPI.org: {e}") return [] diff --git a/src/gentrade/news/rss.py b/src/gentrade/news/rss.py index 9d0516f..5fcbb9d 100644 --- a/src/gentrade/news/rss.py +++ b/src/gentrade/news/rss.py @@ -7,16 +7,14 @@ """ import os -import logging from typing import List import requests import feedparser +from loguru import logger from gentrade.news.meta import NewsInfo, NewsProviderBase -LOG = logging.getLogger(__name__) - class RssProvider(NewsProviderBase): """News provider that fetches news from RSS/ATOM feeds. @@ -63,7 +61,7 @@ def fetch_latest_market_news( parsing fails, or no valid articles exist. 
""" if not self.feed_url: - LOG.error("RSS feed URL is missing (no explicit URL, env var, or default).") + logger.error("RSS feed URL is missing (no explicit URL, env var, or default).") return [] # Headers to mimic browser (avoid feed server blocking) and accept RSS/XML @@ -81,7 +79,7 @@ def fetch_latest_market_news( # Parse feed with feedparser feed = feedparser.parse(response.text) if not feed.entries: - LOG.warning(f"No articles found in RSS feed: {self.feed_url}") + logger.warning(f"No articles found in RSS feed: {self.feed_url}") return [] # Convert feed entries to standardized NewsInfo objects @@ -111,16 +109,16 @@ def fetch_latest_market_news( return self._filter_news(news_list, max_hour_interval, max_count) except requests.HTTPError as e: - LOG.error( + logger.error( f"HTTP error fetching RSS feed {self.feed_url}: " f"Status {e.response.status_code} - {str(e)}" ) return [] except requests.RequestException as e: - LOG.error(f"Network error fetching RSS feed {self.feed_url}: {str(e)}") + logger.error(f"Network error fetching RSS feed {self.feed_url}: {str(e)}") return [] except Exception as e: - LOG.error(f"Unexpected error parsing RSS feed {self.feed_url}: {str(e)}") + logger.error(f"Unexpected error parsing RSS feed {self.feed_url}: {str(e)}") return [] def fetch_stock_news( From aee73469586ef0de1f0afd5e1ecf1eccc3a7b67c Mon Sep 17 00:00:00 2001 From: Lu Ken Date: Tue, 2 Dec 2025 21:26:15 +0800 Subject: [PATCH 15/15] add market interface for news provider Signed-off-by: Lu Ken --- src/gentrade/news/finnhub.py | 4 ++++ src/gentrade/news/meta.py | 6 +++++- src/gentrade/news/newsapi.py | 4 ++++ 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/src/gentrade/news/finnhub.py b/src/gentrade/news/finnhub.py index 14fb5cc..beb1d53 100644 --- a/src/gentrade/news/finnhub.py +++ b/src/gentrade/news/finnhub.py @@ -30,6 +30,10 @@ def __init__(self, api_key: str): self.api_key = api_key self.base_url = "https://finnhub.io/api/v1" + @property + def market(self): + return 'us' + def fetch_latest_market_news( self, category: str = "business", diff --git a/src/gentrade/news/meta.py b/src/gentrade/news/meta.py index 088f350..dd19ca3 100644 --- a/src/gentrade/news/meta.py +++ b/src/gentrade/news/meta.py @@ -34,7 +34,7 @@ class NewsInfo: summary: str url: str content: str - provider: str # provder like newsapi, google, finnhub, rss + provider: str # provder like newsapi, finnhub, rss market: str # market type like us, chn, eur, hk, crypto def to_dict(self) -> Dict[str, Any]: @@ -86,6 +86,10 @@ class NewsProviderBase(metaclass=abc.ABCMeta): All concrete news providers (e.g., NewsAPI, Finnhub) must implement these methods. """ + @property + def market(self): + return 'common' + @abc.abstractmethod def fetch_latest_market_news( self, diff --git a/src/gentrade/news/newsapi.py b/src/gentrade/news/newsapi.py index 51e6c05..53d0a40 100644 --- a/src/gentrade/news/newsapi.py +++ b/src/gentrade/news/newsapi.py @@ -29,6 +29,10 @@ def __init__(self, api_key: str): self.api_key = api_key self.base_url = "https://newsapi.org/v2/everything" # Core endpoint for news retrieval + @property + def market(self): + return 'us' + def fetch_latest_market_news( self, category: str = "business",