Merged
Changes from all commits
3 changes: 3 additions & 0 deletions .gitattributes
@@ -0,0 +1,3 @@
.gitignore merge=ours
README.md merge=ours
docker-compose.yml merge=ours
143 changes: 143 additions & 0 deletions .gitignore
@@ -60,3 +60,146 @@ testing_workflow.py
*.yaml

scripts/
playwright_browser
local.settings.json
function_app/
downloads/
*.pdf

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don’t work, or not
# install all needed dependencies.
#Pipfile.lock

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# Azure Functions artifacts
bin
obj
appsettings.json
local.settings.json

# Azurite artifacts
__blobstorage__
__queuestorage__
__azurite_db*__.json
.python_packages

playwright_browser/
1 change: 1 addition & 0 deletions backend/mainService/requirements.txt
@@ -27,4 +27,5 @@ uvicorn
httpx>=0.28.1
pypdf
pypdf2
azure-functions

59 changes: 42 additions & 17 deletions backend/mainService/src/config/config.py
@@ -32,13 +32,24 @@ class ScraperConfig:
"""
This is the timeout duration for the requests made to the web scraper
"""
TIMEOUT_DURATION: int = 10000
TIMEOUT_DURATION: int = 30000  # Increased from 10000 to 30000 (30 seconds)

"""
This is the path to the directory where the downloads will be stored.
"""
MAIN_DOWNLOADS_DIR_PATH: str = os.path.join("/tmp", "downloads")

"""
This is the path to the playwright executable.
"""
PLAYWRIGHT_EXE_PATH = None  # Set to None to use the default Playwright executable

def __post_init__(self):
if self.MAX_FILE_SIZE <= 0:
raise ValueError("MAX_FILE_SIZE must be positive")
if self.TIMEOUT_DURATION <= 0:
raise ValueError("TIMEOUT_DURATION must be positive")
os.makedirs(self.MAIN_DOWNLOADS_DIR_PATH, exist_ok=True)
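The new `__post_init__` validation plus the `/tmp`-based downloads directory make `ScraperConfig` fail fast on bad values. A minimal sketch of what that buys, assuming the remaining fields (e.g. `MAX_FILE_SIZE`) have defaults so the class is instantiable as shown:

```python
from dataclasses import replace

cfg = ScraperConfig()              # __post_init__ also creates /tmp/downloads if missing
assert cfg.TIMEOUT_DURATION > 0

try:
    replace(cfg, MAX_FILE_SIZE=0)  # replace() re-runs __post_init__, so this should raise
except ValueError as err:
    print(err)                     # -> "MAX_FILE_SIZE must be positive"
```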


@dataclass
@@ -85,14 +96,6 @@ class LlmConfig:
"""
UPSERT_BATCH_SIZE: int = 1000

"""
This is the llm that open router uses for generating the intext citation and reference list for each query
"""
OPEN_ROUTER_MODEL: str = "meta-llama/llama-3.3-70b-instruct:free"

"""
This is the azure model api endpoint
"""


# Concurrency and Performance
Expand All @@ -101,12 +104,24 @@ class ConcurrencyConfig:
"""Configuration class for concurrency settings."""

# General concurrency settings
"""
This is the number of concurrent workers that will be used to process the source documents.
"""
DEFAULT_CONCURRENT_WORKERS: int = (os.cpu_count() // 2) + 1
HANDLE_INDEX_DELETE_WORKERS: int = 2

# Credibility service specific settings
"""
This is the maximum number of threads that will be used to calculate the credibility of the source documents.
"""
CREDIBILITY_MAX_THREADS: int = 4 # Maximum threads for credibility calculations

"""
This is the maximum number of concurrent operations that will be used to calculate the credibility of the source documents.
"""
CREDIBILITY_MAX_CONCURRENT: int = 8 # Maximum concurrent operations

"""
This is the size of the processing batches that will be used to calculate the credibility of the source documents.
"""
CREDIBILITY_BATCH_SIZE: int = 4 # Size of processing batches
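The credibility settings read naturally as a concurrency cap plus a batch size. A sketch of how they might be applied (the `score_sources`/`score_one` names are illustrative, not code from this PR; `concurrency_config` is the module-level instance imported elsewhere in this change):

```python
import asyncio

async def score_sources(sources, score_one):
    # Cap in-flight credibility checks at CREDIBILITY_MAX_CONCURRENT ...
    sem = asyncio.Semaphore(concurrency_config.CREDIBILITY_MAX_CONCURRENT)

    async def bounded(src):
        async with sem:
            return await score_one(src)

    # ... and walk the inputs in CREDIBILITY_BATCH_SIZE-sized chunks.
    results = []
    batch = concurrency_config.CREDIBILITY_BATCH_SIZE
    for i in range(0, len(sources), batch):
        results.extend(await asyncio.gather(*(bounded(s) for s in sources[i:i + batch])))
    return results
```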


Expand All @@ -117,13 +132,23 @@ class ModelConfig:
Contains settings specific to AI models and their deployment."""
"""Configuration for ML models and APIs."""

MODEL_ID: str = "BAAI/bge-m3"
MODEL_API_URL: str = f"https://api-inference.huggingface.co/pipeline/feature-extraction/{MODEL_ID}"

# LLM Generation Parameters
DEFAULT_TEMPERATURE: float = 0.5
DEFAULT_TOP_P: float = 1.0
DEFAULT_MAX_TOKENS: int = 1024
"""
This is the temperature for the citation LLM.
"""
CITE_LLM_TEMPERATURE: float = 0.1
"""
This is the temperature for the summarize LLM.
"""
SUMMARIZE_LLM_TEMPERATURE: float = 0.9
"""
This is the top p for the citation LLM.
"""
CITE_LLM_TOP_P: float = 0.1
"""
This is the top p for the summarize LLM.
"""
SUMMARIZE_LLM_TOP_P: float = 0.1


@dataclass
3 changes: 2 additions & 1 deletion backend/mainService/src/config/playwright_driver.py
@@ -129,8 +129,9 @@ async def __initialize_browser(self) -> Browser:
"--disable-blink-features=AutomationControlled",
]
try:
exe_path = scraper_config.PLAYWRIGHT_EXE_PATH or None
self._playwright = await async_playwright().start()
self._browser = await self._playwright.chromium.launch(headless=True, args=args)
self._browser = await self._playwright.chromium.launch(headless=True, args=args, executable_path=exe_path)
except Exception as e:
logger.critical(f"Error while initializing browser: {e}")
raise e
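Passing `executable_path` lets the driver launch a browser binary shipped with the deployment (e.g. the `playwright_browser/` directory ignored above) instead of Playwright's managed download. A minimal sketch of that launch behavior using only the public Playwright API (the `launch_chromium` helper name is illustrative):

```python
from typing import Optional
from playwright.async_api import async_playwright

async def launch_chromium(exe_path: Optional[str] = None):
    pw = await async_playwright().start()
    # exe_path=None -> Playwright's own downloaded Chromium;
    # otherwise launch the binary at that path.
    return await pw.chromium.launch(headless=True, executable_path=exe_path)
```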
9 changes: 6 additions & 3 deletions backend/mainService/src/llm/chat_llm/Azure_llm.py
@@ -15,7 +15,7 @@
from src.custom_exceptions.llm_exceptions import CitationGenerationError
import logging
from concurrent.futures import ThreadPoolExecutor
from src.config.config import concurrency_config
from src.config.config import concurrency_config, model_config

logging.getLogger("azure.core.pipeline.policies.http_logging_policy").setLevel(
logging.WARNING)
@@ -128,8 +128,11 @@ def _blocking_citation_request(
Dict[str, Any]: Raw API response containing citation data
"""
try:
response: ChatCompletions = self.client.complete(messages=messages, model=(
model_name or self.model_name), temperature=0.1, top_p=0.1)
response: ChatCompletions = self.client.complete(
messages=messages,
model=(model_name or self.model_name),
temperature=model_config.CITE_LLM_TEMPERATURE,
top_p=model_config.CITE_LLM_TOP_P)
response_content = response.choices[0].message.content
# amazonq-ignore-next-line
response_content = response_content.strip()
7 changes: 4 additions & 3 deletions backend/mainService/src/llm/chat_llm/Groq_llm.py
@@ -6,6 +6,7 @@
from typing import Optional
from json.decoder import JSONDecodeError
from src.custom_exceptions.llm_exceptions import SearchKeyGenerationError
from src.config.config import model_config

filename = os.path.basename(__file__)
logger = setup_logging(filename=filename)
@@ -59,9 +60,9 @@ def getKeywordSearchTerm(self, document: str, proposed_title: Optional[str] = No
"content": f"summarize the provided into a google search term and return a json response as 'search_term : value', if no content provided, your response should be 'message:no content to summarize'.{document}"
},
],
temperature=0.9,
top_p=1,
max_tokens=1024,
temperature=model_config.SUMMARIZE_LLM_TEMPERATURE,
top_p=model_config.SUMMARIZE_LLM_TOP_P,
max_tokens=200,
stream=False,
stop=None,
response_format={"type": "json_object"}
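With `response_format={"type": "json_object"}` the completion content should be a JSON string keyed by `search_term`, which the caller then parses. A rough sketch of that parse step (the literal `raw` value is made up, and whether `SearchKeyGenerationError` accepts a message argument is an assumption):

```python
import json

raw = '{"search_term": "retrieval augmented generation survey"}'
try:
    search_term = json.loads(raw)["search_term"]
except (json.JSONDecodeError, KeyError) as exc:
    # Assumes the custom exception takes a message, mirroring its import above.
    raise SearchKeyGenerationError(f"could not extract search term: {exc}")
```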
25 changes: 16 additions & 9 deletions backend/mainService/src/scraper/async_content_scraper.py
@@ -30,11 +30,13 @@
from playwright.async_api import Browser, BrowserContext
from src.config.log_config import setup_logging
from datetime import timezone as tz
from src.config.config import scraper_config


log_filename = os.path.basename(__file__)
logger = setup_logging(filename=log_filename)


"""
Citation Content Scraper Module

@@ -136,15 +138,19 @@ async def get_pdf(self,
parsed_url = parse_url(target_url)
base_url = f"{parsed_url.scheme}://{parsed_url.host}"

# Set up download path
# Set up download path in the main downloads directory
if not storage_path:
default_path = parsed_url.host + \
str(datetime.now(tz.utc).strftime("%d_%m_%Y_%H_%M_%S"))
storage_path = os.path.join(
os.getcwd(), "downloads", default_path)
# Create a subdirectory for this request
request_dir = os.path.join(
scraper_config.MAIN_DOWNLOADS_DIR_PATH,
f"{parsed_url.host}_{datetime.now(tz.utc).strftime('%d_%m_%Y_%H_%M_%S')}"
)
storage_path = request_dir
else:
storage_path = os.path.abspath(storage_path)

# If storage_path is provided, create it as a subdirectory of MAIN_DOWNLOADS_DIR
storage_path = os.path.join(scraper_config.MAIN_DOWNLOADS_DIR_PATH, storage_path)

storage_path = os.path.abspath(storage_path)
self.current_download_path = storage_path

# Check robots.txt
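One consequence of joining caller-supplied paths under `MAIN_DOWNLOADS_DIR_PATH`: relative names are sandboxed into `/tmp/downloads`, while an absolute `storage_path` wins outright, because `os.path.join` discards everything before an absolute component. A quick illustration (the paths are examples only):

```python
import os

base = "/tmp/downloads"
print(os.path.join(base, "my_batch"))         # /tmp/downloads/my_batch
print(os.path.join(base, "/data/elsewhere"))  # /data/elsewhere  (absolute path bypasses the sandbox)
```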
@@ -187,8 +193,9 @@ async def get_pdfs(self,
"""
results = {"count": 0, "paths": {}, "storage_path": None}

storage_path = storage_path + \
str(datetime.now(tz.utc).strftime("%d_%m_%Y_%H_%M_%S")) if storage_path else None
# Create a unique subdirectory for this batch of downloads
if storage_path:
storage_path = f"{storage_path}_{datetime.now(tz.utc).strftime('%d_%m_%Y_%H_%M_%S')}"

# Create tasks for all downloads
tasks = [self.get_pdf(url, storage_path) for url in target_urls]
8 changes: 8 additions & 0 deletions backend/metricsService/.funcignore
@@ -0,0 +1,8 @@
.git*
.vscode
__azurite_db*__.json
__blobstorage__
__queuestorage__
local.settings.json
test
venv
11 changes: 11 additions & 0 deletions backend/metricsService/function_app/__init__.py
@@ -0,0 +1,11 @@
import azure.functions as func
import logging
from main import app as fastapi_app  # Import the FastAPI app from main.py
from dotenv import load_dotenv

load_dotenv()

async def main(req: func.HttpRequest, res: func.Out[func.HttpResponse]) -> None:
logging.info('Python HTTP trigger function processed a request.')
response = await func.AsgiMiddleware(app=fastapi_app).handle_async(req)
res.set(response)
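`func.Out[func.HttpResponse]` means the wrapper relies on a named HTTP output binding (conventionally declared in this folder's function.json) rather than returning the response. A local smoke-test sketch run outside the Functions host (the fake `Out` stand-in and the request URL are assumptions, and it requires `main.py` to be importable):

```python
import asyncio
import azure.functions as func

from function_app import main  # the wrapper defined above

class FakeOut:
    """Minimal stand-in for func.Out so the handler can run outside the host."""
    def __init__(self):
        self.value = None
    def set(self, value):
        self.value = value

req = func.HttpRequest(method="GET", url="/api/health", body=b"")
out = FakeOut()
asyncio.run(main(req, out))
print(out.value.status_code)  # e.g. 200 or 404 depending on the FastAPI routes
```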