Merged
1 change: 1 addition & 0 deletions .gitignore
@@ -59,3 +59,4 @@ unit_test.py
testing_workflow.py
*.yaml

+scripts/
6 changes: 3 additions & 3 deletions backend/mainService/Dockerfile
@@ -7,6 +7,7 @@ WORKDIR /app
# Removes the package lists downloaded during the update to reduce the image size.
RUN apt-get update && apt-get install -y \
build-essential \
+cron \
&& rm -rf /var/lib/apt/lists/*

# Set the PATH environment variable to include /app
@@ -33,6 +34,5 @@ RUN playwright install && playwright install-deps
# Expose the port the app runs on
EXPOSE 8000


-# Command to run the application
-CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]
+# Start both cron and the FastAPI application
+CMD ["sh", "-c", "cron && uvicorn app:app --host 0.0.0.0 --port 8000"]
2 changes: 2 additions & 0 deletions backend/mainService/requirements.txt
@@ -25,4 +25,6 @@ google-genai
redis>=4.2.0
uvicorn
httpx>=0.28.1
+pypdf
+pypdf2
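Note on the new PDF dependencies: below is a minimal sketch of the kind of text extraction pypdf enables; the function name and file path are illustrative, and nothing in this diff shows exactly where the library is called.

from pypdf import PdfReader

def extract_pdf_text(pdf_path: str) -> str:
    # Read every page and join whatever text pypdf can recover.
    reader = PdfReader(pdf_path)
    return "\n".join(page.extract_text() or "" for page in reader.pages)

# Example with a hypothetical path:
# print(extract_pdf_text("downloads/paper.pdf")[:500])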

125 changes: 0 additions & 125 deletions backend/mainService/scripts/delete_stale_data.py

This file was deleted.

2 changes: 1 addition & 1 deletion backend/mainService/src/config/config.py
@@ -32,7 +32,7 @@ class ScraperConfig:
"""
This is the timeout duration for the requests made to the web scraper
"""
-TIMEOUT_DURATION: int = 8000
+TIMEOUT_DURATION: int = 10000

def __post_init__(self):
if self.MAX_FILE_SIZE <= 0:
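For context, a small self-contained sketch of the config pattern touched above. Treating TIMEOUT_DURATION as milliseconds is an assumption (it matches Playwright-style timeouts); the extra validation check is hypothetical and not part of this diff.

from dataclasses import dataclass

@dataclass
class ScraperConfig:
    MAX_FILE_SIZE: int = 10 * 1024 * 1024  # illustrative default, not taken from the diff
    TIMEOUT_DURATION: int = 10000           # new value; assumed to be milliseconds

    def __post_init__(self):
        if self.MAX_FILE_SIZE <= 0:
            raise ValueError("MAX_FILE_SIZE must be positive")
        if self.TIMEOUT_DURATION <= 0:  # hypothetical extra check, mirroring the existing one
            raise ValueError("TIMEOUT_DURATION must be positive")

print(ScraperConfig().TIMEOUT_DURATION / 1000, "seconds")  # 10.0 seconds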
4 changes: 2 additions & 2 deletions backend/mainService/src/llm/Pinecone.py
@@ -186,14 +186,14 @@ async def set_current_index(

:param index_name: Name of the index to set as current
"""
-if not await self._pc.has_index(index_name):
-return False
if not self._current_index_name == index_name and self._current_index:
await self._current_index.close()
elif self._current_index_name == index_name:
return True

if not index_host:
+if not await self._pc.has_index(index_name):
+return False
index_model = await self._pc.describe_index(index_name)
self._current_index_host = index_model.host
else:
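The effect of moving the has_index check inside the "if not index_host" branch is that callers who already know the index host skip both lookups. A runnable sketch of that control flow follows; the stub class stands in for the async Pinecone client and is not the real API.

import asyncio

class PineconeStub:
    async def has_index(self, name: str) -> bool:
        print("network call: has_index")
        return True

    async def describe_index(self, name: str):
        print("network call: describe_index")
        return type("IndexModel", (), {"host": f"{name}.svc.pinecone.io"})()

async def resolve_host(pc: PineconeStub, index_name: str, index_host: str | None = None):
    # Mirrors the reordered logic: only look the index up when no host was supplied.
    if not index_host:
        if not await pc.has_index(index_name):
            return None
        index_host = (await pc.describe_index(index_name)).host
    return index_host

async def main():
    pc = PineconeStub()
    print(await resolve_host(pc, "citations"))                        # two stubbed lookups
    print(await resolve_host(pc, "citations", "cached-host.example")) # no lookups

asyncio.run(main())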
2 changes: 1 addition & 1 deletion backend/mainService/src/llm/chat_llm/Gemini_llm.py
@@ -11,7 +11,7 @@


class Genai_cite:
model = "gemini-2.0-pro-exp-02-05"
model = "gemini-2.0-flash"

def __init__(self, api_key: str = os.getenv("GOOGLE_API_KEY"),
llm_model: str = f'models/{model}'):
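The swap above points Genai_cite at gemini-2.0-flash. A minimal sketch of calling that model through the google-genai client already listed in requirements.txt; the prompt is illustrative, and this is not the wrapper class itself.

import os
from google import genai

client = genai.Client(api_key=os.getenv("GOOGLE_API_KEY"))
response = client.models.generate_content(
    model="models/gemini-2.0-flash",  # same string Genai_cite builds via f'models/{model}'
    contents="Summarize this abstract in one sentence: ...",
)
print(response.text)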
9 changes: 7 additions & 2 deletions backend/mainService/src/llm/chat_llm/Groq_llm.py
@@ -19,7 +19,7 @@ def __init__(self, api_key: str = os.getenv("GROQ_API_KEY"),
self.client = Groq(api_key=self.api_key)
self.llm_model = llm_model

-def getKeywordSearchTerm(self, document: str) -> Optional[str]:
+def getKeywordSearchTerm(self, document: str, proposed_title: Optional[str] = None) -> str:
"""
Generate a search term from the provided document using LLM.

@@ -46,12 +46,17 @@ def getKeywordSearchTerm(self, document: str) -> Optional[str]:

# Make API call with error handling

+if proposed_title:
+document = f"Here is the proposed title: {proposed_title}\n\nHere is the content: {document}"
+else:
+document = f"Here is the content: {document}"
+
completion = self.client.chat.completions.create(
model=self.llm_model,
messages=[
{
"role": "user",
"content": f"summarize the provided into a google search term and return a json response as 'search_term : value', if no content provided, your response should be 'message:no content to summarize'. Here is the content: {document}"
"content": f"summarize the provided into a google search term and return a json response as 'search_term : value', if no content provided, your response should be 'message:no content to summarize'.{document}"
},
],
temperature=0.9,
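Restated outside the class for readability, the new branching assembles the prompt as sketched below; the standalone function name is illustrative, and the inputs are made up.

from typing import Optional

def build_search_prompt(document: str, proposed_title: Optional[str] = None) -> str:
    # When a real title is supplied it is passed alongside the content instead of
    # being used verbatim; otherwise only the content is sent.
    if proposed_title:
        document = f"Here is the proposed title: {proposed_title}\n\nHere is the content: {document}"
    else:
        document = f"Here is the content: {document}"
    return (
        "summarize the provided into a google search term and return a json response as "
        "'search_term : value', if no content provided, your response should be "
        f"'message:no content to summarize'.{document}"
    )

print(build_search_prompt("Transformers dispense with recurrence entirely...", "Attention Is All You Need"))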
@@ -38,7 +38,7 @@ async def _get_download_link(self, url: str) -> Optional[str]:
try:
page = await self.context.new_page()
if not url.endswith("pdf"):
-await page.goto(url, wait_until='networkidle')
+await page.goto(url, wait_until='networkidle', timeout=self.element_timeout)
await self._interact_with_dropdown(page)
download_link = await self._extract_download_link(page)
else:
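Passing timeout=self.element_timeout bounds the networkidle wait, which can otherwise hang on chatty pages. A rough, self-contained sketch of the same guard; the URL and the 10000 ms default are assumptions, not values from this file.

import asyncio
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError

async def fetch_page_html(url: str, element_timeout: int = 10000) -> str | None:
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()
        try:
            # Same call shape as the diff: bounded wait for network idle.
            await page.goto(url, wait_until="networkidle", timeout=element_timeout)
            return await page.content()
        except PlaywrightTimeoutError:
            return None  # give up instead of hanging past the scraper's budget
        finally:
            await browser.close()

# asyncio.run(fetch_page_html("https://example.org/paper"))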
12 changes: 6 additions & 6 deletions backend/mainService/src/services/citation_service.py
@@ -130,8 +130,7 @@ async def process_citation(self,
"""
try:
# Step 0: Generate index name
-title = (self.summarize_llm.getKeywordSearchTerm(content)
-if title.lower() == "untitled" else title)
+title = self.summarize_llm.getKeywordSearchTerm(content, proposed_title=title)
index_name = self._generate_index_name(title)
logger.info(f"index_name = {index_name}")
if await self.PC.set_current_index(index_name):
@@ -230,10 +229,11 @@ async def _process_documents(

try:
cleaned_result = search_results["cleaned_result"]
-download_results = await self.scraper.get_pdfs(
-target_urls=cleaned_result.get("links"),
-storage_path=search_results["search_key"]
-)
+async with asyncio.timeout(15): # 15 second timeout
+download_results = await self.scraper.get_pdfs(
+target_urls=cleaned_result.get("links"),
+storage_path=search_results["search_key"]
+)

return await self._prepare_document_batches(
download_results,
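asyncio.timeout requires Python 3.11 or newer and raises TimeoutError when the block overruns, presumably caught by the surrounding try block (its except clause is outside this hunk). A small runnable sketch of the behaviour, with slow_download standing in for scraper.get_pdfs:

import asyncio

async def slow_download() -> list[str]:
    await asyncio.sleep(30)  # stand-in for scraper.get_pdfs
    return ["paper.pdf"]

async def fetch_with_budget() -> list[str]:
    try:
        async with asyncio.timeout(15):  # same 15 second budget as the diff
            return await slow_download()
    except TimeoutError:
        return []  # degrade to an empty batch instead of blocking the request

print(asyncio.run(fetch_with_budget()))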
1 change: 1 addition & 0 deletions backend/metricsService/requirements.txt
@@ -7,3 +7,4 @@ python-dotenv==1.0.1
Requests==2.32.3
scholarly==1.7.11
uvicorn

9 changes: 5 additions & 4 deletions backend/metricsService/src/services/author_reputation.py
@@ -37,7 +37,8 @@
from ..utils.api_config import (
ORCID_API,
SEMANTIC_SCHOLAR_AUTHOR_SEARCH_API,
-OPEN_ALEX_AUTHOR_API
+OPEN_ALEX_AUTHOR_API,
+DEFAULT_TIMEOUT
)
from ..utils.api_utils import rate_limit
from ..utils.logging_config import get_logger
@@ -64,7 +65,7 @@ async def get_authorship_reputation(author_id: Optional[str] = None, author_name
orcid_response = requests.get(
f"{ORCID_API}{author_id}/works",
headers={"Accept": "application/json"},
-timeout=15
+timeout=DEFAULT_TIMEOUT
)
if orcid_response.status_code == 200:
orcid_data = orcid_response.json()
@@ -119,7 +120,7 @@ async def get_openalex_author_reputation(author_name: str):
"""Fetch author reputation from OpenAlex using the authors endpoint."""
await rate_limit()
try:
response = requests.get(f"{OPEN_ALEX_AUTHOR_API}?search={author_name}", timeout=10)
response = requests.get(f"{OPEN_ALEX_AUTHOR_API}?search={author_name}", timeout=DEFAULT_TIMEOUT)
if response.status_code == 200:
data = response.json()
if data.get("results"):
@@ -138,7 +139,7 @@ async def get_semantic_scholar_author_reputation(author_name: str):
await rate_limit()
try:
params = {"query": author_name, "fields": "hIndex,paperCount", "limit": 1}
-response = requests.get(SEMANTIC_SCHOLAR_AUTHOR_SEARCH_API, params=params, timeout=10)
+response = requests.get(SEMANTIC_SCHOLAR_AUTHOR_SEARCH_API, params=params, timeout=DEFAULT_TIMEOUT)
if response.status_code == 200:
data = response.json()
if data.get("data") and len(data["data"]) > 0:
1 change: 1 addition & 0 deletions backend/metricsService/src/utils/api_config.py
@@ -34,3 +34,4 @@
OPEN_CITATIONS_API = "https://opencitations.net/index/api/v1/"
MAX_CONCURRENT_WORKERS = 20
DEFAULT_CONCURRENT_WORKERS = 10
+DEFAULT_TIMEOUT = 10
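Worth noting that DEFAULT_TIMEOUT is in seconds (the unit requests expects), unlike the millisecond Playwright timeouts elsewhere in this PR. A minimal sketch of the centralized usage; the ORCID base URL below is illustrative rather than copied from api_config.py.

import requests

DEFAULT_TIMEOUT = 10  # seconds
ORCID_API = "https://pub.orcid.org/v3.0/"  # illustrative base URL

def fetch_orcid_works(author_id: str) -> dict | None:
    response = requests.get(
        f"{ORCID_API}{author_id}/works",
        headers={"Accept": "application/json"},
        timeout=DEFAULT_TIMEOUT,  # one shared constant instead of scattered literals
    )
    return response.json() if response.status_code == 200 else None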