Merged
Changes from all commits
3 changes: 3 additions & 0 deletions .gitattributes
@@ -0,0 +1,3 @@
.gitignore merge=ours
README.md merge=ours
docker-compose.yml merge=ours
143 changes: 143 additions & 0 deletions .gitignore
@@ -60,3 +60,146 @@ testing_workflow.py
*.yaml

scripts/
playwright_browser
local.settings.json
function_app/
downloads/
*.pdf

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don’t work, or not
# install all needed dependencies.
#Pipfile.lock

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# Azure Functions artifacts
bin
obj
appsettings.json
local.settings.json

# Azurite artifacts
__blobstorage__
__queuestorage__
__azurite_db*__.json
.python_packages

playwright_browser/
1 change: 1 addition & 0 deletions backend/mainService/requirements.txt
@@ -27,4 +27,5 @@ uvicorn
httpx>=0.28.1
pypdf
pypdf2
azure-functions

59 changes: 42 additions & 17 deletions backend/mainService/src/config/config.py
@@ -32,13 +32,24 @@ class ScraperConfig:
"""
This is the timeout duration for the requests made to the web scraper
"""
TIMEOUT_DURATION: int = 10000
TIMEOUT_DURATION: int = 30000  # Increased from 10000 to 30000 (30 seconds)

"""
This is the path to the directory where the downloads will be stored.
"""
MAIN_DOWNLOADS_DIR_PATH: str = os.path.join("/tmp", "downloads")

"""
This is the path to the playwright executable.
"""
PLAYWRIGHT_EXE_PATH = None  # Set to None to use the default Playwright executable

def __post_init__(self):
if self.MAX_FILE_SIZE <= 0:
raise ValueError("MAX_FILE_SIZE must be positive")
if self.TIMEOUT_DURATION <= 0:
raise ValueError("TIMEOUT_DURATION must be positive")
os.makedirs(self.MAIN_DOWNLOADS_DIR_PATH, exist_ok=True)
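The new `__post_init__` validation plus the `/tmp`-based downloads directory make `ScraperConfig` fail fast on bad values. A minimal sketch of what that buys, assuming the remaining fields (e.g. `MAX_FILE_SIZE`) have defaults so the class is instantiable as shown:

```python
from dataclasses import replace

cfg = ScraperConfig()              # __post_init__ also creates /tmp/downloads if missing
assert cfg.TIMEOUT_DURATION > 0

try:
    replace(cfg, MAX_FILE_SIZE=0)  # replace() re-runs __post_init__, so this should raise
except ValueError as err:
    print(err)                     # -> "MAX_FILE_SIZE must be positive"
```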


@dataclass
@@ -85,14 +96,6 @@ class LlmConfig:
"""
UPSERT_BATCH_SIZE: int = 1000

"""
This is the llm that open router uses for generating the intext citation and reference list for each query
"""
OPEN_ROUTER_MODEL: str = "meta-llama/llama-3.3-70b-instruct:free"

"""
This is the azure model api endpoint
"""


# Concurrency and Performance
Expand All @@ -101,12 +104,24 @@ class ConcurrencyConfig:
"""Configuration class for concurrency settings."""

# General concurrency settings
"""
This is the number of concurrent workers that will be used to process the source documents.
"""
DEFAULT_CONCURRENT_WORKERS: int = (os.cpu_count() // 2) + 1
HANDLE_INDEX_DELETE_WORKERS: int = 2

# Credibility service specific settings
"""
This is the maximum number of threads that will be used to calculate the credibility of the source documents.
"""
CREDIBILITY_MAX_THREADS: int = 4 # Maximum threads for credibility calculations

"""
This is the maximum number of concurrent operations that will be used to calculate the credibility of the source documents.
"""
CREDIBILITY_MAX_CONCURRENT: int = 8 # Maximum concurrent operations

"""
This is the size of the processing batches that will be used to calculate the credibility of the source documents.
"""
CREDIBILITY_BATCH_SIZE: int = 4 # Size of processing batches
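The credibility settings read naturally as a concurrency cap plus a batch size. A sketch of how they might be applied (the `score_sources`/`score_one` names are illustrative, not code from this PR; `concurrency_config` is the module-level instance imported elsewhere in this change):

```python
import asyncio

async def score_sources(sources, score_one):
    # Cap in-flight credibility checks at CREDIBILITY_MAX_CONCURRENT ...
    sem = asyncio.Semaphore(concurrency_config.CREDIBILITY_MAX_CONCURRENT)

    async def bounded(src):
        async with sem:
            return await score_one(src)

    # ... and walk the inputs in CREDIBILITY_BATCH_SIZE-sized chunks.
    results = []
    batch = concurrency_config.CREDIBILITY_BATCH_SIZE
    for i in range(0, len(sources), batch):
        results.extend(await asyncio.gather(*(bounded(s) for s in sources[i:i + batch])))
    return results
```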


Expand All @@ -117,13 +132,23 @@ class ModelConfig:
Contains settings specific to AI models and their deployment."""
"""Configuration for ML models and APIs."""

MODEL_ID: str = "BAAI/bge-m3"
MODEL_API_URL: str = f"https://api-inference.huggingface.co/pipeline/feature-extraction/{MODEL_ID}"

# LLM Generation Parameters
DEFAULT_TEMPERATURE: float = 0.5
DEFAULT_TOP_P: float = 1.0
DEFAULT_MAX_TOKENS: int = 1024
"""
This is the temperature for the citation LLM.
"""
CITE_LLM_TEMPERATURE: float = 0.1
"""
This is the temperature for the summarize LLM.
"""
SUMMARIZE_LLM_TEMPERATURE: float = 0.9
"""
This is the top p for the citation LLM.
"""
CITE_LLM_TOP_P: float = 0.1
"""
This is the top p for the summarize LLM.
"""
SUMMARIZE_LLM_TOP_P: float = 0.1


@dataclass
3 changes: 2 additions & 1 deletion backend/mainService/src/config/playwright_driver.py
@@ -129,8 +129,9 @@ async def __initialize_browser(self) -> Browser:
"--disable-blink-features=AutomationControlled",
]
try:
exe_path = scraper_config.PLAYWRIGHT_EXE_PATH or None
self._playwright = await async_playwright().start()
self._browser = await self._playwright.chromium.launch(headless=True, args=args)
self._browser = await self._playwright.chromium.launch(headless=True, args=args, executable_path=exe_path)
except Exception as e:
logger.critical(f"Error while initializing browser: {e}")
raise e
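Passing `executable_path` lets the driver launch a browser binary shipped with the deployment (e.g. the `playwright_browser/` directory ignored above) instead of Playwright's managed download. A minimal sketch of that launch behavior using only the public Playwright API (the `launch_chromium` helper name is illustrative):

```python
from typing import Optional
from playwright.async_api import async_playwright

async def launch_chromium(exe_path: Optional[str] = None):
    pw = await async_playwright().start()
    # exe_path=None -> Playwright's own downloaded Chromium;
    # otherwise launch the binary at that path.
    return await pw.chromium.launch(headless=True, executable_path=exe_path)
```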
9 changes: 6 additions & 3 deletions backend/mainService/src/llm/chat_llm/Azure_llm.py
@@ -15,7 +15,7 @@
from src.custom_exceptions.llm_exceptions import CitationGenerationError
import logging
from concurrent.futures import ThreadPoolExecutor
from src.config.config import concurrency_config
from src.config.config import concurrency_config, model_config

logging.getLogger("azure.core.pipeline.policies.http_logging_policy").setLevel(
logging.WARNING)
@@ -128,8 +128,11 @@ def _blocking_citation_request(
Dict[str, Any]: Raw API response containing citation data
"""
try:
response: ChatCompletions = self.client.complete(messages=messages, model=(
model_name or self.model_name), temperature=0.1, top_p=0.1)
response: ChatCompletions = self.client.complete(
messages=messages,
model=(model_name or self.model_name),
temperature=model_config.CITE_LLM_TEMPERATURE,
top_p=model_config.CITE_LLM_TOP_P)
response_content = response.choices[0].message.content
# amazonq-ignore-next-line
response_content = response_content.strip()
7 changes: 4 additions & 3 deletions backend/mainService/src/llm/chat_llm/Groq_llm.py
@@ -6,6 +6,7 @@
from typing import Optional
from json.decoder import JSONDecodeError
from src.custom_exceptions.llm_exceptions import SearchKeyGenerationError
from src.config.config import model_config

filename = os.path.basename(__file__)
logger = setup_logging(filename=filename)
@@ -59,9 +60,9 @@ def getKeywordSearchTerm(self, document: str, proposed_title: Optional[str] = No
"content": f"summarize the provided into a google search term and return a json response as 'search_term : value', if no content provided, your response should be 'message:no content to summarize'.{document}"
},
],
temperature=0.9,
top_p=1,
max_tokens=1024,
temperature=model_config.SUMMARIZE_LLM_TEMPERATURE,
top_p=model_config.SUMMARIZE_LLM_TOP_P,
max_tokens=200,
stream=False,
stop=None,
response_format={"type": "json_object"}
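With `response_format={"type": "json_object"}` the completion content should be a JSON string keyed by `search_term`, which the caller then parses. A rough sketch of that parse step (the literal `raw` value is made up, and whether `SearchKeyGenerationError` accepts a message argument is an assumption):

```python
import json

raw = '{"search_term": "retrieval augmented generation survey"}'
try:
    search_term = json.loads(raw)["search_term"]
except (json.JSONDecodeError, KeyError) as exc:
    # Assumes the custom exception takes a message, mirroring its import above.
    raise SearchKeyGenerationError(f"could not extract search term: {exc}")
```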
25 changes: 16 additions & 9 deletions backend/mainService/src/scraper/async_content_scraper.py
@@ -30,11 +30,13 @@
from playwright.async_api import Browser, BrowserContext
from src.config.log_config import setup_logging
from datetime import timezone as tz
from src.config.config import scraper_config


log_filename = os.path.basename(__file__)
logger = setup_logging(filename=log_filename)


"""
Citation Content Scraper Module

@@ -136,15 +138,19 @@ async def get_pdf(self,
parsed_url = parse_url(target_url)
base_url = f"{parsed_url.scheme}://{parsed_url.host}"

# Set up download path
# Set up download path in the main downloads directory
if not storage_path:
default_path = parsed_url.host + \
str(datetime.now(tz.utc).strftime("%d_%m_%Y_%H_%M_%S"))
storage_path = os.path.join(
os.getcwd(), "downloads", default_path)
# Create a subdirectory for this request
request_dir = os.path.join(
scraper_config.MAIN_DOWNLOADS_DIR_PATH,
f"{parsed_url.host}_{datetime.now(tz.utc).strftime('%d_%m_%Y_%H_%M_%S')}"
)
storage_path = request_dir
else:
storage_path = os.path.abspath(storage_path)

# If storage_path is provided, create it as a subdirectory of MAIN_DOWNLOADS_DIR
storage_path = os.path.join(scraper_config.MAIN_DOWNLOADS_DIR_PATH, storage_path)

storage_path = os.path.abspath(storage_path)
self.current_download_path = storage_path

# Check robots.txt
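One consequence of joining caller-supplied paths under `MAIN_DOWNLOADS_DIR_PATH`: relative names are sandboxed into `/tmp/downloads`, while an absolute `storage_path` wins outright, because `os.path.join` discards everything before an absolute component. A quick illustration (the paths are examples only):

```python
import os

base = "/tmp/downloads"
print(os.path.join(base, "my_batch"))         # /tmp/downloads/my_batch
print(os.path.join(base, "/data/elsewhere"))  # /data/elsewhere  (absolute path bypasses the sandbox)
```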
@@ -187,8 +193,9 @@ async def get_pdfs(self,
"""
results = {"count": 0, "paths": {}, "storage_path": None}

storage_path = storage_path + \
str(datetime.now(tz.utc).strftime("%d_%m_%Y_%H_%M_%S")) if storage_path else None
# Create a unique subdirectory for this batch of downloads
if storage_path:
storage_path = f"{storage_path}_{datetime.now(tz.utc).strftime('%d_%m_%Y_%H_%M_%S')}"

# Create tasks for all downloads
tasks = [self.get_pdf(url, storage_path) for url in target_urls]
8 changes: 8 additions & 0 deletions backend/metricsService/.funcignore
@@ -0,0 +1,8 @@
.git*
.vscode
__azurite_db*__.json
__blobstorage__
__queuestorage__
local.settings.json
test
venv
11 changes: 11 additions & 0 deletions backend/metricsService/function_app/__init__.py
@@ -0,0 +1,11 @@
import azure.functions as func
import logging
from main import app as fastapi_app  # Import the FastAPI app from main.py
from dotenv import load_dotenv

load_dotenv()

async def main(req: func.HttpRequest, res: func.Out[func.HttpResponse]) -> None:
logging.info('Python HTTP trigger function processed a request.')
response = await func.AsgiMiddleware(app=fastapi_app).handle_async(req)
res.set(response)
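`func.Out[func.HttpResponse]` means the wrapper relies on a named HTTP output binding (conventionally declared in this folder's function.json) rather than returning the response. A local smoke-test sketch run outside the Functions host (the fake `Out` stand-in and the request URL are assumptions, and it requires `main.py` to be importable):

```python
import asyncio
import azure.functions as func

from function_app import main  # the wrapper defined above

class FakeOut:
    """Minimal stand-in for func.Out so the handler can run outside the host."""
    def __init__(self):
        self.value = None
    def set(self, value):
        self.value = value

req = func.HttpRequest(method="GET", url="/api/health", body=b"")
out = FakeOut()
asyncio.run(main(req, out))
print(out.value.status_code)  # e.g. 200 or 404 depending on the FastAPI routes
```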