diff --git a/.gitignore b/.gitignore
index 75c1545..1fbafed 100644
--- a/.gitignore
+++ b/.gitignore
@@ -59,3 +59,4 @@
 unit_test.py
 testing_workflow.py
 *.yaml
+scripts/
diff --git a/backend/mainService/Dockerfile b/backend/mainService/Dockerfile
index 08ab309..084364c 100644
--- a/backend/mainService/Dockerfile
+++ b/backend/mainService/Dockerfile
@@ -7,6 +7,7 @@ WORKDIR /app
 # Removes the package lists downloaded during the update to reduce the image size.
 RUN apt-get update && apt-get install -y \
     build-essential \
+    cron \
    && rm -rf /var/lib/apt/lists/*
 
 # Set the PATH environment variable to include /app
@@ -33,6 +34,5 @@ RUN playwright install && playwright install-deps
 
 # Expose the port the app runs on
 EXPOSE 8000
-
-# Command to run the application
-CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]
\ No newline at end of file
+# Start both cron and the FastAPI application
+CMD ["sh", "-c", "cron && uvicorn app:app --host 0.0.0.0 --port 8000"]
\ No newline at end of file
diff --git a/backend/mainService/requirements.txt b/backend/mainService/requirements.txt
index 3ea9a61..a5189a8 100644
--- a/backend/mainService/requirements.txt
+++ b/backend/mainService/requirements.txt
@@ -25,4 +25,6 @@
 google-genai
 redis>=4.2.0
 uvicorn
 httpx>=0.28.1
+pypdf
+pypdf2
diff --git a/backend/mainService/scripts/delete_stale_data.py b/backend/mainService/scripts/delete_stale_data.py
deleted file mode 100644
index fb4786e..0000000
--- a/backend/mainService/scripts/delete_stale_data.py
+++ /dev/null
@@ -1,125 +0,0 @@
-import os
-import json
-import asyncio
-from datetime import datetime, timezone, timedelta
-from typing import Dict, List, Tuple
-from pinecone import PineconeAsyncio as Pinecone
-from collections import defaultdict
-from src.config.log_config import setup_logging
-
-log_filename = os.path.basename(__file__)
-logger = setup_logging(filename=log_filename)
-
-# Initialize Pinecone with your API key and environment
-
-
-INDEX_DICT_FILE = 'index_dict.json'
-THRESHOLD_HOURS = 2  # Delete indexes older than 2 hours
-
-
-def load_index_dict() -> Dict[str, List[str]]:
-    """Load the index dictionary from the JSON file.
-    Returns:
-        Dict[str, List[str]]: Dictionary mapping hourly timestamps to lists of index names.
-        Returns defaultdict with empty list as default if file doesn't exist.
-    """
-    if os.path.exists(INDEX_DICT_FILE):
-        with open(INDEX_DICT_FILE, 'r') as f:
-            return defaultdict(list, json.load(f))
-    return defaultdict(list)
-
-
-def save_index_dict(index_dict: Dict[str, List[str]]) -> None:
-    """Save the index dictionary to the JSON file.
-
-    Args:
-        index_dict (Dict[str, List[str]]): Dictionary mapping timestamps to lists of index names.
-    """
-    with open(INDEX_DICT_FILE, 'w') as f:
-        json.dump(dict(index_dict), f)  # Convert defaultdict to regular dict for JSON serialization
-
-
-async def delete_index(index_name: str, pc:Pinecone) -> Tuple[str, bool, str]:
-    """Delete a single Pinecone index.
-
-    Args:
-        index_name (str): Name of the index to delete
-
-    Returns:
-        Tuple[str, bool, str]: Tuple containing:
-            - Index name
-            - Boolean indicating success/failure
-            - Error message if failure, empty string if success
-    """
-
-    try:
-        await pc.delete_index(index_name)
-        return index_name, True, ""
-    except Exception as e:
-        return index_name, False, str(e)
-
-
-async def delete_old_indexes(threshold_hours: int = THRESHOLD_HOURS) -> None:
-    """Asynchronously delete Pinecone indexes older than the threshold.
-
-    This function loads the index dictionary, identifies indexes from timestamps older
-    than the threshold, and deletes them concurrently using asyncio.gather.
-
-    Args:
-        threshold_hours (int, optional): Age threshold in hours. Defaults to THRESHOLD_HOURS.
-    """
-    API_KEY = os.getenv("PINECONE_API_KEY")
-    pc = Pinecone(api_key=API_KEY)
-
-    index_dict = load_index_dict()
-    now = datetime.now(timezone.utc)
-    updated_dict = defaultdict(list)
-    indexes_to_delete = []
-
-    # Process each timestamp and its indexes
-    for timestamp_str, index_list in index_dict.items():
-        try:
-            creation_time = datetime.strptime(timestamp_str, "%Y-%m-%d %H").replace(tzinfo=timezone.utc)
-            print("creeation_time:", creation_time)
-            if now - creation_time >= timedelta(minutes=threshold_hours):
-                # Add all indexes from this timestamp to deletion list
-                indexes_to_delete.extend(index_list)
-            else:
-                # Keep indexes from recent timestamps
-                updated_dict[timestamp_str] = index_list
-        except ValueError as e:
-            logger.exception(f"Error parsing timestamp '{timestamp_str}': {e}")
-            # Keep entries with invalid timestamps for manual review
-            updated_dict[timestamp_str] = index_list
-
-    if indexes_to_delete:
-        # Execute deletions concurrently
-        results = await asyncio.gather(
-            *[delete_index(index_name, pc=pc) for index_name in indexes_to_delete],
-            return_exceptions=True
-        )
-        # Process results
-        for result in results:
-            if isinstance(result, Exception):
-                logger.error(f"Unexpected error during deletion: {result}")
-                continue
-
-            index_name, success, error = result
-            if success:
-                logger.info(f"Successfully deleted index '{index_name}'")
-            else:
-                print(f"Failed to delete index '{index_name}': {error}")
-                # For failed deletions, keep them in their original timestamp bucket
-                # This requires finding the original timestamp
-                for timestamp_str, indexes in index_dict.items():
-                    if index_name in indexes:
-                        updated_dict[timestamp_str].append(index_name)
-                        break
-
-    save_index_dict(updated_dict)
-    print("Deletion job complete. Remaining indexes by timestamp:",
-          {ts: indexes for ts, indexes in updated_dict.items() if indexes})
-    await pc.close()
-
-if __name__ == "__main__":
-    asyncio.run(delete_old_indexes())
\ No newline at end of file
diff --git a/backend/mainService/src/config/config.py b/backend/mainService/src/config/config.py
index c70b91d..7ca0a07 100644
--- a/backend/mainService/src/config/config.py
+++ b/backend/mainService/src/config/config.py
@@ -32,7 +32,7 @@ class ScraperConfig:
     """
     This is the timeout duration for the requests made to the web scraper
     """
-    TIMEOUT_DURATION: int = 8000
+    TIMEOUT_DURATION: int = 10000
 
     def __post_init__(self):
         if self.MAX_FILE_SIZE <= 0:
diff --git a/backend/mainService/src/llm/Pinecone.py b/backend/mainService/src/llm/Pinecone.py
index 2ca06d6..7b7b50b 100644
--- a/backend/mainService/src/llm/Pinecone.py
+++ b/backend/mainService/src/llm/Pinecone.py
@@ -186,14 +186,14 @@ async def set_current_index(
         """
         :param index_name: Name of the index to set as current
         """
+        if not await self._pc.has_index(index_name):
+            return False
         if not self._current_index_name == index_name and self._current_index:
             await self._current_index.close()
         elif self._current_index_name == index_name:
             return True
 
         if not index_host:
-            if not await self._pc.has_index(index_name):
-                return False
             index_model = await self._pc.describe_index(index_name)
             self._current_index_host = index_model.host
         else:
diff --git a/backend/mainService/src/llm/chat_llm/Gemini_llm.py b/backend/mainService/src/llm/chat_llm/Gemini_llm.py
index 0ce0453..9fabc7a 100644
--- a/backend/mainService/src/llm/chat_llm/Gemini_llm.py
+++ b/backend/mainService/src/llm/chat_llm/Gemini_llm.py
@@ -11,7 +11,7 @@ class Genai_cite:
 
 
-    model = "gemini-2.0-pro-exp-02-05"
+    model = "gemini-2.0-flash"
 
 
     def __init__(self, api_key: str = os.getenv("GOOGLE_API_KEY"),
                  llm_model: str = f'models/{model}'):
diff --git a/backend/mainService/src/llm/chat_llm/Groq_llm.py b/backend/mainService/src/llm/chat_llm/Groq_llm.py
index 4d8c967..a2a6370 100644
--- a/backend/mainService/src/llm/chat_llm/Groq_llm.py
+++ b/backend/mainService/src/llm/chat_llm/Groq_llm.py
@@ -19,7 +19,7 @@ def __init__(self, api_key: str = os.getenv("GROQ_API_KEY"),
         self.client = Groq(api_key=self.api_key)
         self.llm_model = llm_model
 
-    def getKeywordSearchTerm(self, document: str) -> Optional[str]:
+    def getKeywordSearchTerm(self, document: str, proposed_title: Optional[str] = None) -> str:
         """
         Generate a search term from the provided document using LLM.
 
@@ -46,12 +46,17 @@ def getKeywordSearchTerm(self, document: str) -> Optional[str]:
 
 
         # Make API call with error handling
+        if proposed_title:
+            document = f"Here is the proposed title: {proposed_title}\n\nHere is the content: {document}"
+        else:
+            document = f"Here is the content: {document}"
+
         completion = self.client.chat.completions.create(
             model=self.llm_model,
             messages=[
                 {
                     "role": "user",
-                    "content": f"summarize the provided into a google search term and return a json response as 'search_term : value', if no content provided, your response should be 'message:no content to summarize'. Here is the content: {document}"
+                    "content": f"summarize the provided into a google search term and return a json response as 'search_term : value', if no content provided, your response should be 'message:no content to summarize'.{document}"
                 },
             ],
             temperature=0.9,
diff --git a/backend/mainService/src/scraper/site_specific/async_frontier_scraper.py b/backend/mainService/src/scraper/site_specific/async_frontier_scraper.py
index 8ede8ad..06a88a9 100644
--- a/backend/mainService/src/scraper/site_specific/async_frontier_scraper.py
+++ b/backend/mainService/src/scraper/site_specific/async_frontier_scraper.py
@@ -38,7 +38,7 @@ async def _get_download_link(self, url: str) -> Optional[str]:
         try:
             page = await self.context.new_page()
             if not url.endswith("pdf"):
-                await page.goto(url, wait_until='networkidle')
+                await page.goto(url, wait_until='networkidle', timeout=self.element_timeout)
                 await self._interact_with_dropdown(page)
                 download_link = await self._extract_download_link(page)
             else:
diff --git a/backend/mainService/src/services/citation_service.py b/backend/mainService/src/services/citation_service.py
index 324734d..f9a3aa3 100644
--- a/backend/mainService/src/services/citation_service.py
+++ b/backend/mainService/src/services/citation_service.py
@@ -130,8 +130,7 @@ async def process_citation(self,
         """
         try:
             # Step 0: Generate index name
-            title = (self.summarize_llm.getKeywordSearchTerm(content)
-                     if title.lower() == "untitled" else title)
+            title = self.summarize_llm.getKeywordSearchTerm(content, proposed_title=title)
             index_name = self._generate_index_name(title)
             logger.info(f"index_name = {index_name}")
             if await self.PC.set_current_index(index_name):
@@ -230,10 +229,11 @@ async def _process_documents(
 
         try:
             cleaned_result = search_results["cleaned_result"]
-            download_results = await self.scraper.get_pdfs(
-                target_urls=cleaned_result.get("links"),
-                storage_path=search_results["search_key"]
-            )
+            async with asyncio.timeout(15):  # 15 second timeout
+                download_results = await self.scraper.get_pdfs(
+                    target_urls=cleaned_result.get("links"),
+                    storage_path=search_results["search_key"]
+                )
 
             return await self._prepare_document_batches(
                 download_results,
diff --git a/backend/metricsService/requirements.txt b/backend/metricsService/requirements.txt
index 183bb12..3d56cd0 100644
--- a/backend/metricsService/requirements.txt
+++ b/backend/metricsService/requirements.txt
@@ -7,3 +7,4 @@ python-dotenv==1.0.1
 Requests==2.32.3
 scholarly==1.7.11
 uvicorn
+
diff --git a/backend/metricsService/src/services/author_reputation.py b/backend/metricsService/src/services/author_reputation.py
index fea9b71..5ca821a 100644
--- a/backend/metricsService/src/services/author_reputation.py
+++ b/backend/metricsService/src/services/author_reputation.py
@@ -37,7 +37,8 @@
 from ..utils.api_config import (
     ORCID_API,
     SEMANTIC_SCHOLAR_AUTHOR_SEARCH_API,
-    OPEN_ALEX_AUTHOR_API
+    OPEN_ALEX_AUTHOR_API,
+    DEFAULT_TIMEOUT
 )
 from ..utils.api_utils import rate_limit
 from ..utils.logging_config import get_logger
@@ -64,7 +65,7 @@ async def get_authorship_reputation(author_id: Optional[str] = None, author_name
             orcid_response = requests.get(
                 f"{ORCID_API}{author_id}/works",
                 headers={"Accept": "application/json"},
-                timeout=15
+                timeout=DEFAULT_TIMEOUT
             )
             if orcid_response.status_code == 200:
                 orcid_data = orcid_response.json()
@@ -119,7 +120,7 @@ async def get_openalex_author_reputation(author_name: str):
     """Fetch author reputation from OpenAlex using the authors endpoint."""
     await rate_limit()
     try:
-        response = requests.get(f"{OPEN_ALEX_AUTHOR_API}?search={author_name}", timeout=10)
+        response = requests.get(f"{OPEN_ALEX_AUTHOR_API}?search={author_name}", timeout=DEFAULT_TIMEOUT)
         if response.status_code == 200:
             data = response.json()
             if data.get("results"):
@@ -138,7 +139,7 @@ async def get_semantic_scholar_author_reputation(author_name: str):
     await rate_limit()
     try:
         params = {"query": author_name, "fields": "hIndex,paperCount", "limit": 1}
-        response = requests.get(SEMANTIC_SCHOLAR_AUTHOR_SEARCH_API, params=params, timeout=10)
+        response = requests.get(SEMANTIC_SCHOLAR_AUTHOR_SEARCH_API, params=params, timeout=DEFAULT_TIMEOUT)
         if response.status_code == 200:
             data = response.json()
             if data.get("data") and len(data["data"]) > 0:
diff --git a/backend/metricsService/src/utils/api_config.py b/backend/metricsService/src/utils/api_config.py
index 1267b11..8b6c9c6 100644
--- a/backend/metricsService/src/utils/api_config.py
+++ b/backend/metricsService/src/utils/api_config.py
@@ -34,3 +34,4 @@
 OPEN_CITATIONS_API = "https://opencitations.net/index/api/v1/"
 MAX_CONCURRENT_WORKERS = 20
 DEFAULT_CONCURRENT_WORKERS = 10
+DEFAULT_TIMEOUT = 10