From 9e1ee62dfe12130bc4403dce5224bc3d45d23f99 Mon Sep 17 00:00:00 2001 From: Ikeoluwa Oladele Date: Fri, 28 Mar 2025 15:39:14 -0400 Subject: [PATCH 01/10] azure function ready --- backend/mainService/.funcignore | 16 +++ backend/mainService/.gitignore | 135 ++++++++++++++++++ backend/mainService/function_app/__init__.py | 41 ++++++ .../mainService/function_app/function.json | 17 +++ backend/mainService/host.json | 21 +++ backend/mainService/requirements.txt | 1 + backend/mainService/src/config/config.py | 2 +- 7 files changed, 232 insertions(+), 1 deletion(-) create mode 100644 backend/mainService/.funcignore create mode 100644 backend/mainService/.gitignore create mode 100644 backend/mainService/function_app/__init__.py create mode 100644 backend/mainService/function_app/function.json create mode 100644 backend/mainService/host.json diff --git a/backend/mainService/.funcignore b/backend/mainService/.funcignore new file mode 100644 index 0000000..c5a4d9a --- /dev/null +++ b/backend/mainService/.funcignore @@ -0,0 +1,16 @@ +.git* +.vscode +__azurite_db*__.json +__blobstorage__ +__queuestorage__ +local.settings.json +test +venv +.git +.vscode +.env +.env.test +.gitignore +*.ini +*.pyc +__pycache__ \ No newline at end of file diff --git a/backend/mainService/.gitignore b/backend/mainService/.gitignore new file mode 100644 index 0000000..7685fc4 --- /dev/null +++ b/backend/mainService/.gitignore @@ -0,0 +1,135 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don’t work, or not +# install all needed dependencies. +#Pipfile.lock + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# Azure Functions artifacts +bin +obj +appsettings.json +local.settings.json + +# Azurite artifacts +__blobstorage__ +__queuestorage__ +__azurite_db*__.json +.python_packages \ No newline at end of file diff --git a/backend/mainService/function_app/__init__.py b/backend/mainService/function_app/__init__.py new file mode 100644 index 0000000..28a5db4 --- /dev/null +++ b/backend/mainService/function_app/__init__.py @@ -0,0 +1,41 @@ +import azure.functions as func +import logging +from app import app as fastapi_app # Import the FastAPI app from app.py +from src.config.startup import startup_event +from src.llm.Pinecone import PineconeOperations +from src.llm.chat_llm.Groq_llm import 
Summarize_llm +from src.llm.chat_llm.Azure_llm import Citation +from src.scraper.async_content_scraper import AsyncContentScraper +from src.config.playwright_driver import PlaywrightDriver as ASD +from src.config.async_http_session import AsyncHTTPClient +from src.utils.concurrent_resources import cleanup_resources +import nltk +from dotenv import load_dotenv + +# Initialize NLTK data and environment variables (these are safe to do at module level) +load_dotenv() +nltk.download('punkt') +nltk.download('punkt_tab') + +async def main(req: func.HttpRequest, res: func.Out[func.HttpResponse]) -> func.HttpResponse: + logging.info('Python HTTP trigger function processed a request.') + + # Initialize resources for this request + playwright_driver = await ASD.create() + pc = await PineconeOperations.create() + summarize_llm = Summarize_llm() + citation_llm = Citation() + + # Initialize content scraper + async with AsyncContentScraper(playwright_driver=playwright_driver) as content_scraper: + # Set up app state for this request + fastapi_app.state.playwright_driver = playwright_driver + fastapi_app.state.pc = pc + fastapi_app.state.summarize_llm = summarize_llm + fastapi_app.state.citation_llm = citation_llm + fastapi_app.state.async_content_scraper = content_scraper + + # Handle the request + response = await func.AsgiMiddleware(fastapi_app).handle_async(req) + res.set(response) + \ No newline at end of file diff --git a/backend/mainService/function_app/function.json b/backend/mainService/function_app/function.json new file mode 100644 index 0000000..512c98b --- /dev/null +++ b/backend/mainService/function_app/function.json @@ -0,0 +1,17 @@ +{ + "bindings": [ + { + "authLevel": "function", + "type": "httpTrigger", + "direction": "in", + "name": "req", + "methods": ["get", "post"], + "route": "{*route}" + }, + { + "type": "http", + "direction": "out", + "name": "res" + } + ] +} \ No newline at end of file diff --git a/backend/mainService/host.json 
b/backend/mainService/host.json new file mode 100644 index 0000000..e3b6a9a --- /dev/null +++ b/backend/mainService/host.json @@ -0,0 +1,21 @@ +{ + "version": "2.0", + "logging": { + "applicationInsights": { + "samplingSettings": { + "isEnabled": true, + "excludedTypes": "Request" + } + } + }, + "extensionBundle": { + "id": "Microsoft.Azure.Functions.ExtensionBundle", + "version": "[4.*, 5.0.0)" + }, + "extensions": { + "http": { + "routePrefix": "", + "maxOutstandingRequests": 100 + } + } +} \ No newline at end of file diff --git a/backend/mainService/requirements.txt b/backend/mainService/requirements.txt index a5189a8..04a3d93 100644 --- a/backend/mainService/requirements.txt +++ b/backend/mainService/requirements.txt @@ -27,4 +27,5 @@ uvicorn httpx>=0.28.1 pypdf pypdf2 +azure-functions diff --git a/backend/mainService/src/config/config.py b/backend/mainService/src/config/config.py index 7ca0a07..98527da 100644 --- a/backend/mainService/src/config/config.py +++ b/backend/mainService/src/config/config.py @@ -32,7 +32,7 @@ class ScraperConfig: """ This is the timeout duration for the requests made to the web scraper """ - TIMEOUT_DURATION: int = 10000 + TIMEOUT_DURATION: int = 30000 # Increased from 10000 to 30000 (30 seconds) def __post_init__(self): if self.MAX_FILE_SIZE <= 0: From 96b41332718ee382222b2f3fecd204e1de5de900 Mon Sep 17 00:00:00 2001 From: Ikeoluwa Oladele Date: Fri, 28 Mar 2025 19:39:57 -0400 Subject: [PATCH 02/10] before creating a main downloads directory in content scraper --- backend/mainService/function_app/__init__.py | 56 +++++++++++++------ .../src/scraper/async_content_scraper.py | 27 ++++++--- backend/metricsService/requirements.txt | 2 +- 3 files changed, 58 insertions(+), 27 deletions(-) diff --git a/backend/mainService/function_app/__init__.py b/backend/mainService/function_app/__init__.py index 28a5db4..8f14f0e 100644 --- a/backend/mainService/function_app/__init__.py +++ b/backend/mainService/function_app/__init__.py @@ -11,31 +11,53 
@@ from src.utils.concurrent_resources import cleanup_resources import nltk from dotenv import load_dotenv +import asyncio +from contextlib import asynccontextmanager # Initialize NLTK data and environment variables (these are safe to do at module level) load_dotenv() nltk.download('punkt') nltk.download('punkt_tab') -async def main(req: func.HttpRequest, res: func.Out[func.HttpResponse]) -> func.HttpResponse: - logging.info('Python HTTP trigger function processed a request.') - - # Initialize resources for this request +@asynccontextmanager +async def get_resources(): + # Initialize resources playwright_driver = await ASD.create() pc = await PineconeOperations.create() summarize_llm = Summarize_llm() citation_llm = Citation() - # Initialize content scraper - async with AsyncContentScraper(playwright_driver=playwright_driver) as content_scraper: - # Set up app state for this request - fastapi_app.state.playwright_driver = playwright_driver - fastapi_app.state.pc = pc - fastapi_app.state.summarize_llm = summarize_llm - fastapi_app.state.citation_llm = citation_llm - fastapi_app.state.async_content_scraper = content_scraper - - # Handle the request - response = await func.AsgiMiddleware(fastapi_app).handle_async(req) - res.set(response) - \ No newline at end of file + try: + async with AsyncContentScraper(playwright_driver=playwright_driver) as content_scraper: + # Set up app state + fastapi_app.state.playwright_driver = playwright_driver + fastapi_app.state.pc = pc + fastapi_app.state.summarize_llm = summarize_llm + fastapi_app.state.citation_llm = citation_llm + fastapi_app.state.async_content_scraper = content_scraper + yield + finally: + # Ensure resources are cleaned up + await asyncio.gather( + playwright_driver.quit(), + pc.cleanup(), + cleanup_resources(), + AsyncHTTPClient.close_session(), + return_exceptions=True + ) + logging.info("Resources cleaned up successfully") + +async def main(req: func.HttpRequest, res: func.Out[func.HttpResponse]) -> 
func.HttpResponse: + logging.info('Python HTTP trigger function processed a request.') + + try: + async with get_resources(): + response = await func.AsgiMiddleware(fastapi_app).handle_async(req) + res.set(response) + logging.info('Request processed successfully') + except Exception as e: + logging.error(f"Error processing request: {str(e)}") + res.set(func.HttpResponse( + "Internal server error", + status_code=500 + )) diff --git a/backend/mainService/src/scraper/async_content_scraper.py b/backend/mainService/src/scraper/async_content_scraper.py index a0042da..bdcb205 100644 --- a/backend/mainService/src/scraper/async_content_scraper.py +++ b/backend/mainService/src/scraper/async_content_scraper.py @@ -35,6 +35,10 @@ log_filename = os.path.basename(__file__) logger = setup_logging(filename=log_filename) +# Define the main downloads directory +MAIN_DOWNLOADS_DIR = os.path.join(os.getcwd(), "downloads") +os.makedirs(MAIN_DOWNLOADS_DIR, exist_ok=True) + """ Citation Content Scraper Module @@ -136,15 +140,19 @@ async def get_pdf(self, parsed_url = parse_url(target_url) base_url = f"{parsed_url.scheme}://{parsed_url.host}" - # Set up download path + # Set up download path in the main downloads directory if not storage_path: - default_path = parsed_url.host + \ - str(datetime.now(tz.utc).strftime("%d_%m_%Y_%H_%M_%S")) - storage_path = os.path.join( - os.getcwd(), "downloads", default_path) + # Create a subdirectory for this request + request_dir = os.path.join( + MAIN_DOWNLOADS_DIR, + f"{parsed_url.host}_{datetime.now(tz.utc).strftime('%d_%m_%Y_%H_%M_%S')}" + ) + storage_path = request_dir else: - storage_path = os.path.abspath(storage_path) - + # If storage_path is provided, create it as a subdirectory of MAIN_DOWNLOADS_DIR + storage_path = os.path.join(MAIN_DOWNLOADS_DIR, storage_path) + + storage_path = os.path.abspath(storage_path) self.current_download_path = storage_path # Check robots.txt @@ -187,8 +195,9 @@ async def get_pdfs(self, """ results = {"count": 0, 
"paths": {}, "storage_path": None} - storage_path = storage_path + \ - str(datetime.now(tz.utc).strftime("%d_%m_%Y_%H_%M_%S")) if storage_path else None + # Create a unique subdirectory for this batch of downloads + if storage_path: + storage_path = f"{storage_path}_{datetime.now(tz.utc).strftime('%d_%m_%Y_%H_%M_%S')}" # Create tasks for all downloads tasks = [self.get_pdf(url, storage_path) for url in target_urls] diff --git a/backend/metricsService/requirements.txt b/backend/metricsService/requirements.txt index 3d56cd0..cb4dba7 100644 --- a/backend/metricsService/requirements.txt +++ b/backend/metricsService/requirements.txt @@ -7,4 +7,4 @@ python-dotenv==1.0.1 Requests==2.32.3 scholarly==1.7.11 uvicorn - +azure-functions From 483e1f80b61aa60195d484904f03e6ba15d08ad2 Mon Sep 17 00:00:00 2001 From: Ikeoluwa Oladele Date: Fri, 28 Mar 2025 19:40:33 -0400 Subject: [PATCH 03/10] before creating a main downloads directory in content scraper --- backend/mainService/function_app/func.ignore | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 backend/mainService/function_app/func.ignore diff --git a/backend/mainService/function_app/func.ignore b/backend/mainService/function_app/func.ignore new file mode 100644 index 0000000..7f8fee6 --- /dev/null +++ b/backend/mainService/function_app/func.ignore @@ -0,0 +1,3 @@ +# Ignore downloads directory to prevent function restarts +downloads/ +*.pdf \ No newline at end of file From 3ba2afabc8a24dc260297fd76c6445e2f9c33bbb Mon Sep 17 00:00:00 2001 From: Ikeoluwa Oladele Date: Fri, 28 Mar 2025 19:41:14 -0400 Subject: [PATCH 04/10] before creating a main downloads directory in content scraper, we added the metric files that didn't get staged in the last commit --- .../metricsService/function_app/__init__.py | 11 ++++++++++ .../metricsService/function_app/function.json | 17 +++++++++++++++ backend/metricsService/host.json | 21 +++++++++++++++++++ backend/metricsService/local.settings.json | 7 +++++++ 4 files changed, 56 
insertions(+) create mode 100644 backend/metricsService/function_app/__init__.py create mode 100644 backend/metricsService/function_app/function.json create mode 100644 backend/metricsService/host.json create mode 100644 backend/metricsService/local.settings.json diff --git a/backend/metricsService/function_app/__init__.py b/backend/metricsService/function_app/__init__.py new file mode 100644 index 0000000..dc2fdd9 --- /dev/null +++ b/backend/metricsService/function_app/__init__.py @@ -0,0 +1,11 @@ +import azure.functions as func +import logging +from app import app as fastapi_app # Import the FastAPI app from app.py +from dotenv import load_dotenv + +load_dotenv() + +async def main(req: func.HttpRequest, res:func.Out[func.HttpResponse]) -> None: + logging.info('Python HTTP trigger function processed a request.') + response = await func.AsgiMiddleware(fastapi_app).handle_async(req) + res.set(response) \ No newline at end of file diff --git a/backend/metricsService/function_app/function.json b/backend/metricsService/function_app/function.json new file mode 100644 index 0000000..512c98b --- /dev/null +++ b/backend/metricsService/function_app/function.json @@ -0,0 +1,17 @@ +{ + "bindings": [ + { + "authLevel": "function", + "type": "httpTrigger", + "direction": "in", + "name": "req", + "methods": ["get", "post"], + "route": "{*route}" + }, + { + "type": "http", + "direction": "out", + "name": "res" + } + ] +} \ No newline at end of file diff --git a/backend/metricsService/host.json b/backend/metricsService/host.json new file mode 100644 index 0000000..e3b6a9a --- /dev/null +++ b/backend/metricsService/host.json @@ -0,0 +1,21 @@ +{ + "version": "2.0", + "logging": { + "applicationInsights": { + "samplingSettings": { + "isEnabled": true, + "excludedTypes": "Request" + } + } + }, + "extensionBundle": { + "id": "Microsoft.Azure.Functions.ExtensionBundle", + "version": "[4.*, 5.0.0)" + }, + "extensions": { + "http": { + "routePrefix": "", + "maxOutstandingRequests": 100 + 
} + } +} \ No newline at end of file diff --git a/backend/metricsService/local.settings.json b/backend/metricsService/local.settings.json new file mode 100644 index 0000000..4b4cfce --- /dev/null +++ b/backend/metricsService/local.settings.json @@ -0,0 +1,7 @@ +{ + "IsEncrypted": false, + "Values": { + "AzureWebJobsStorage": "", + "FUNCTIONS_WORKER_RUNTIME": "python" + } +} \ No newline at end of file From a472665a739ea7519749a4282c2553b49538558f Mon Sep 17 00:00:00 2001 From: Ikeoluwa Oladele Date: Fri, 28 Mar 2025 20:18:30 -0400 Subject: [PATCH 05/10] stable working version ? --- .gitignore | 138 ++++++++++++++++++ backend/mainService/.funcignore | 6 +- backend/mainService/.gitignore | 135 ----------------- backend/mainService/function_app/__init__.py | 3 +- backend/mainService/function_app/func.ignore | 3 - backend/mainService/src/config/config.py | 6 +- .../src/scraper/async_content_scraper.py | 8 +- .../metricsService/function_app/__init__.py | 6 +- 8 files changed, 155 insertions(+), 150 deletions(-) delete mode 100644 backend/mainService/.gitignore delete mode 100644 backend/mainService/function_app/func.ignore diff --git a/.gitignore b/.gitignore index 1fbafed..45adcd1 100644 --- a/.gitignore +++ b/.gitignore @@ -60,3 +60,141 @@ testing_workflow.py *.yaml scripts/ +downloads/ +*.pdf + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don’t work, or not +# install all needed dependencies. +#Pipfile.lock + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# Azure Functions artifacts +bin +obj +appsettings.json +local.settings.json + +# Azurite artifacts +__blobstorage__ +__queuestorage__ +__azurite_db*__.json +.python_packages diff --git a/backend/mainService/.funcignore b/backend/mainService/.funcignore index c5a4d9a..eb0b235 100644 --- a/backend/mainService/.funcignore +++ b/backend/mainService/.funcignore @@ -13,4 +13,8 @@ venv .gitignore *.ini *.pyc -__pycache__ \ No newline at end of file +__pycache__ + +# Ignore downloads directory to prevent function restarts +downloads/ +*.pdf \ No newline at end of file diff --git a/backend/mainService/.gitignore b/backend/mainService/.gitignore deleted file mode 100644 index 
7685fc4..0000000 --- a/backend/mainService/.gitignore +++ /dev/null @@ -1,135 +0,0 @@ -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -pip-wheel-metadata/ -share/python-wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -.hypothesis/ -.pytest_cache/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - -# pyenv -.python-version - -# pipenv -# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. -# However, in case of collaboration, if having platform-specific dependencies or dependencies -# having no cross-platform support, pipenv may install dependencies that don’t work, or not -# install all needed dependencies. 
-#Pipfile.lock - -# celery beat schedule file -celerybeat-schedule - -# SageMath parsed files -*.sage.py - -# Environments -.env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# Pyre type checker -.pyre/ - -# Azure Functions artifacts -bin -obj -appsettings.json -local.settings.json - -# Azurite artifacts -__blobstorage__ -__queuestorage__ -__azurite_db*__.json -.python_packages \ No newline at end of file diff --git a/backend/mainService/function_app/__init__.py b/backend/mainService/function_app/__init__.py index 8f14f0e..4b1c4bf 100644 --- a/backend/mainService/function_app/__init__.py +++ b/backend/mainService/function_app/__init__.py @@ -41,10 +41,9 @@ async def get_resources(): await asyncio.gather( playwright_driver.quit(), pc.cleanup(), - cleanup_resources(), - AsyncHTTPClient.close_session(), return_exceptions=True ) + cleanup_resources() logging.info("Resources cleaned up successfully") async def main(req: func.HttpRequest, res: func.Out[func.HttpResponse]) -> func.HttpResponse: diff --git a/backend/mainService/function_app/func.ignore b/backend/mainService/function_app/func.ignore deleted file mode 100644 index 7f8fee6..0000000 --- a/backend/mainService/function_app/func.ignore +++ /dev/null @@ -1,3 +0,0 @@ -# Ignore downloads directory to prevent function restarts -downloads/ -*.pdf \ No newline at end of file diff --git a/backend/mainService/src/config/config.py b/backend/mainService/src/config/config.py index 98527da..ceedb48 100644 --- a/backend/mainService/src/config/config.py +++ b/backend/mainService/src/config/config.py @@ -32,13 +32,17 @@ class ScraperConfig: """ This is the timeout duration for the requests made to the web scraper """ - TIMEOUT_DURATION: int = 30000 # Increased from 10000 to 30000 (30 seconds) + TIMEOUT_DURATION: int = 10000 # Increased from 10000 
to 30000 (30 seconds) + + # Define the main downloads directory + MAIN_DOWNLOADS_DIR_PATH: str = os.path.join(os.getcwd(), "downloads") def __post_init__(self): if self.MAX_FILE_SIZE <= 0: raise ValueError("MAX_FILE_SIZE must be positive") if self.TIMEOUT_DURATION <= 0: raise ValueError("TIMEOUT_DURATION must be positive") + os.makedirs(self.MAIN_DOWNLOADS_DIR_PATH, exist_ok=True) @dataclass diff --git a/backend/mainService/src/scraper/async_content_scraper.py b/backend/mainService/src/scraper/async_content_scraper.py index bdcb205..fead8df 100644 --- a/backend/mainService/src/scraper/async_content_scraper.py +++ b/backend/mainService/src/scraper/async_content_scraper.py @@ -30,14 +30,12 @@ from playwright.async_api import Browser, BrowserContext from src.config.log_config import setup_logging from datetime import timezone as tz +from src.config.config import scraper_config log_filename = os.path.basename(__file__) logger = setup_logging(filename=log_filename) -# Define the main downloads directory -MAIN_DOWNLOADS_DIR = os.path.join(os.getcwd(), "downloads") -os.makedirs(MAIN_DOWNLOADS_DIR, exist_ok=True) """ Citation Content Scraper Module @@ -144,13 +142,13 @@ async def get_pdf(self, if not storage_path: # Create a subdirectory for this request request_dir = os.path.join( - MAIN_DOWNLOADS_DIR, + scraper_config.MAIN_DOWNLOADS_DIR_PATH, f"{parsed_url.host}_{datetime.now(tz.utc).strftime('%d_%m_%Y_%H_%M_%S')}" ) storage_path = request_dir else: # If storage_path is provided, create it as a subdirectory of MAIN_DOWNLOADS_DIR - storage_path = os.path.join(MAIN_DOWNLOADS_DIR, storage_path) + storage_path = os.path.join(scraper_config.MAIN_DOWNLOADS_DIR_PATH, storage_path) storage_path = os.path.abspath(storage_path) self.current_download_path = storage_path diff --git a/backend/metricsService/function_app/__init__.py b/backend/metricsService/function_app/__init__.py index dc2fdd9..96dc26c 100644 --- a/backend/metricsService/function_app/__init__.py +++ 
b/backend/metricsService/function_app/__init__.py @@ -1,11 +1,11 @@ import azure.functions as func import logging -from app import app as fastapi_app # Import the FastAPI app from app.py +from main import app as fastapi_app # Import the FastAPI app from app.py from dotenv import load_dotenv load_dotenv() -async def main(req: func.HttpRequest, res:func.Out[func.HttpResponse]) -> None: +async def main(req: func.HttpRequest, res:func.Out[func.HttpResponse]) -> func.HttpResponse: logging.info('Python HTTP trigger function processed a request.') - response = await func.AsgiMiddleware(fastapi_app).handle_async(req) + response = await func.AsgiMiddleware(app=fastapi_app).handle_async(req) res.set(response) \ No newline at end of file From 44a649f9198587ccbdc668dabada5cdc2d5531a9 Mon Sep 17 00:00:00 2001 From: Ikeoluwa Oladele Date: Sat, 29 Mar 2025 02:14:46 -0400 Subject: [PATCH 06/10] deployed this version to azure --- .gitignore | 2 + backend/mainService/.funcignore | 3 + backend/mainService/function_app/__init__.py | 80 ++++++++++--------- .../mainService/function_app/function.json | 2 +- backend/mainService/src/config/config.py | 8 +- .../src/config/playwright_driver.py | 3 +- backend/metricsService/.funcignore | 8 ++ .../metricsService/function_app/function.json | 2 +- 8 files changed, 65 insertions(+), 43 deletions(-) create mode 100644 backend/metricsService/.funcignore diff --git a/.gitignore b/.gitignore index 45adcd1..42b3671 100644 --- a/.gitignore +++ b/.gitignore @@ -198,3 +198,5 @@ __blobstorage__ __queuestorage__ __azurite_db*__.json .python_packages + +playwright_browser/ \ No newline at end of file diff --git a/backend/mainService/.funcignore b/backend/mainService/.funcignore index eb0b235..4b56b19 100644 --- a/backend/mainService/.funcignore +++ b/backend/mainService/.funcignore @@ -6,6 +6,7 @@ __queuestorage__ local.settings.json test venv +.venv .git .vscode .env @@ -14,6 +15,8 @@ venv *.ini *.pyc __pycache__ +pytest.ini +pytest_cache # Ignore 
downloads directory to prevent function restarts downloads/ diff --git a/backend/mainService/function_app/__init__.py b/backend/mainService/function_app/__init__.py index 4b1c4bf..651ceb6 100644 --- a/backend/mainService/function_app/__init__.py +++ b/backend/mainService/function_app/__init__.py @@ -1,62 +1,64 @@ import azure.functions as func import logging -from app import app as fastapi_app # Import the FastAPI app from app.py -from src.config.startup import startup_event +from app import app as fastapi_app # Import FastAPI app +from src.config.playwright_driver import PlaywrightDriver as ASD from src.llm.Pinecone import PineconeOperations from src.llm.chat_llm.Groq_llm import Summarize_llm from src.llm.chat_llm.Azure_llm import Citation from src.scraper.async_content_scraper import AsyncContentScraper -from src.config.playwright_driver import PlaywrightDriver as ASD -from src.config.async_http_session import AsyncHTTPClient -from src.utils.concurrent_resources import cleanup_resources -import nltk from dotenv import load_dotenv +import nltk import asyncio -from contextlib import asynccontextmanager -# Initialize NLTK data and environment variables (these are safe to do at module level) +# Load environment variables and NLTK data load_dotenv() nltk.download('punkt') nltk.download('punkt_tab') -@asynccontextmanager -async def get_resources(): - # Initialize resources - playwright_driver = await ASD.create() - pc = await PineconeOperations.create() - summarize_llm = Summarize_llm() - citation_llm = Citation() - - try: - async with AsyncContentScraper(playwright_driver=playwright_driver) as content_scraper: - # Set up app state +# Global variables for resources +playwright_driver = None +pc = None +summarize_llm = None +citation_llm = None +async_content_scraper = None +resource_lock = asyncio.Lock() # Prevent race conditions + +async def initialize_resources(): + """ + Initializes global resources only once and prevents multiple concurrent initializations. 
+ """ + global playwright_driver, pc, summarize_llm, citation_llm, async_content_scraper + + async with resource_lock: # Prevent multiple requests from initializing Playwright at the same time + if playwright_driver is None: + logging.info("Initializing Playwright and other global resources...") + + playwright_driver = await ASD.create() + pc = await PineconeOperations.create() + summarize_llm = Summarize_llm() + citation_llm = Citation() + async_content_scraper = await AsyncContentScraper(playwright_driver).__aenter__() + + # Set FastAPI state fastapi_app.state.playwright_driver = playwright_driver fastapi_app.state.pc = pc fastapi_app.state.summarize_llm = summarize_llm fastapi_app.state.citation_llm = citation_llm - fastapi_app.state.async_content_scraper = content_scraper - yield - finally: - # Ensure resources are cleaned up - await asyncio.gather( - playwright_driver.quit(), - pc.cleanup(), - return_exceptions=True - ) - cleanup_resources() - logging.info("Resources cleaned up successfully") + fastapi_app.state.async_content_scraper = async_content_scraper + + logging.info("Global resources initialized.") async def main(req: func.HttpRequest, res: func.Out[func.HttpResponse]) -> func.HttpResponse: logging.info('Python HTTP trigger function processed a request.') - + try: - async with get_resources(): - response = await func.AsgiMiddleware(fastapi_app).handle_async(req) - res.set(response) - logging.info('Request processed successfully') + if playwright_driver is None: + await initialize_resources() # Make sure Playwright is running + + response = await func.AsgiMiddleware(fastapi_app).handle_async(req) + res.set(response) + logging.info('Request processed successfully') + except Exception as e: logging.error(f"Error processing request: {str(e)}") - res.set(func.HttpResponse( - "Internal server error", - status_code=500 - )) + res.set(func.HttpResponse("Internal server error", status_code=500)) diff --git a/backend/mainService/function_app/function.json 
b/backend/mainService/function_app/function.json index 512c98b..242db6a 100644 --- a/backend/mainService/function_app/function.json +++ b/backend/mainService/function_app/function.json @@ -1,7 +1,7 @@ { "bindings": [ { - "authLevel": "function", + "authLevel": "anonymous", "type": "httpTrigger", "direction": "in", "name": "req", diff --git a/backend/mainService/src/config/config.py b/backend/mainService/src/config/config.py index ceedb48..d500218 100644 --- a/backend/mainService/src/config/config.py +++ b/backend/mainService/src/config/config.py @@ -35,7 +35,13 @@ class ScraperConfig: TIMEOUT_DURATION: int = 10000 # Increased from 10000 to 30000 (30 seconds) # Define the main downloads directory - MAIN_DOWNLOADS_DIR_PATH: str = os.path.join(os.getcwd(), "downloads") + MAIN_DOWNLOADS_DIR_PATH: str = os.path.join("/tmp", "downloads") + + CURRENT_FILE_PATH = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) # Go up one level from 'mainservice' + + os.path.dirname("...") # Go up one level from 'src' + + PLAYWRIGHT_EXE_PATH=os.path.join(os.path.dirname(os.path.realpath(CURRENT_FILE_PATH)), 'playwright_browser', 'chromium_headless_shell-1161', 'chrome-linux', 'headless_shell') def __post_init__(self): if self.MAX_FILE_SIZE <= 0: diff --git a/backend/mainService/src/config/playwright_driver.py b/backend/mainService/src/config/playwright_driver.py index 1eb5e28..10d8ea1 100644 --- a/backend/mainService/src/config/playwright_driver.py +++ b/backend/mainService/src/config/playwright_driver.py @@ -129,8 +129,9 @@ async def __initialize_browser(self) -> Browser: "--disable-blink-features=AutomationControlled", ] try: + exe_path = scraper_config.PLAYWRIGHT_EXE_PATH or None self._playwright = await async_playwright().start() - self._browser = await self._playwright.chromium.launch(headless=True, args=args) + self._browser = await self._playwright.chromium.launch(headless=True, args=args, executable_path=exe_path) except Exception as e: logger.critical(f"Error while 
initializing browser: {e}") raise e diff --git a/backend/metricsService/.funcignore b/backend/metricsService/.funcignore new file mode 100644 index 0000000..7dda614 --- /dev/null +++ b/backend/metricsService/.funcignore @@ -0,0 +1,8 @@ +.git* +.vscode +__azurite_db*__.json +__blobstorage__ +__queuestorage__ +local.settings.json +test +venv \ No newline at end of file diff --git a/backend/metricsService/function_app/function.json b/backend/metricsService/function_app/function.json index 512c98b..242db6a 100644 --- a/backend/metricsService/function_app/function.json +++ b/backend/metricsService/function_app/function.json @@ -1,7 +1,7 @@ { "bindings": [ { - "authLevel": "function", + "authLevel": "anonymous", "type": "httpTrigger", "direction": "in", "name": "req", From 9060632db7d895aac190fa8c200b72ae66472dee Mon Sep 17 00:00:00 2001 From: Ikeoluwa Oladele Date: Sat, 29 Mar 2025 16:53:25 -0400 Subject: [PATCH 07/10] updated gitignore file --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 1fbafed..8be83c9 100644 --- a/.gitignore +++ b/.gitignore @@ -60,3 +60,6 @@ testing_workflow.py *.yaml scripts/ +playwright_browser +local.settings.json +function_app/ From 68b736bf52b935ada99fe9080e7d9a3d7052e37a Mon Sep 17 00:00:00 2001 From: Ikeoluwa Oladele Date: Sat, 29 Mar 2025 17:41:55 -0400 Subject: [PATCH 08/10] 1. Uses a particular download folder for the pdf download, no cluttering of workdir. 2. Removes older config like openrouter endpoint, now obsolete as it is not being used. 3. Provide option to set playwright binaries 4. 
Cleanup the backend dir --- backend/mainService/.funcignore | 23 ------- backend/mainService/function_app/__init__.py | 64 ------------------- .../mainService/function_app/function.json | 17 ----- backend/mainService/host.json | 21 ------ backend/mainService/src/config/config.py | 59 ++++++++++------- .../mainService/src/llm/chat_llm/Azure_llm.py | 9 ++- .../mainService/src/llm/chat_llm/Groq_llm.py | 7 +- 7 files changed, 47 insertions(+), 153 deletions(-) delete mode 100644 backend/mainService/.funcignore delete mode 100644 backend/mainService/function_app/__init__.py delete mode 100644 backend/mainService/function_app/function.json delete mode 100644 backend/mainService/host.json diff --git a/backend/mainService/.funcignore b/backend/mainService/.funcignore deleted file mode 100644 index 4b56b19..0000000 --- a/backend/mainService/.funcignore +++ /dev/null @@ -1,23 +0,0 @@ -.git* -.vscode -__azurite_db*__.json -__blobstorage__ -__queuestorage__ -local.settings.json -test -venv -.venv -.git -.vscode -.env -.env.test -.gitignore -*.ini -*.pyc -__pycache__ -pytest.ini -pytest_cache - -# Ignore downloads directory to prevent function restarts -downloads/ -*.pdf \ No newline at end of file diff --git a/backend/mainService/function_app/__init__.py b/backend/mainService/function_app/__init__.py deleted file mode 100644 index 651ceb6..0000000 --- a/backend/mainService/function_app/__init__.py +++ /dev/null @@ -1,64 +0,0 @@ -import azure.functions as func -import logging -from app import app as fastapi_app # Import FastAPI app -from src.config.playwright_driver import PlaywrightDriver as ASD -from src.llm.Pinecone import PineconeOperations -from src.llm.chat_llm.Groq_llm import Summarize_llm -from src.llm.chat_llm.Azure_llm import Citation -from src.scraper.async_content_scraper import AsyncContentScraper -from dotenv import load_dotenv -import nltk -import asyncio - -# Load environment variables and NLTK data -load_dotenv() -nltk.download('punkt') 
-nltk.download('punkt_tab') - -# Global variables for resources -playwright_driver = None -pc = None -summarize_llm = None -citation_llm = None -async_content_scraper = None -resource_lock = asyncio.Lock() # Prevent race conditions - -async def initialize_resources(): - """ - Initializes global resources only once and prevents multiple concurrent initializations. - """ - global playwright_driver, pc, summarize_llm, citation_llm, async_content_scraper - - async with resource_lock: # Prevent multiple requests from initializing Playwright at the same time - if playwright_driver is None: - logging.info("Initializing Playwright and other global resources...") - - playwright_driver = await ASD.create() - pc = await PineconeOperations.create() - summarize_llm = Summarize_llm() - citation_llm = Citation() - async_content_scraper = await AsyncContentScraper(playwright_driver).__aenter__() - - # Set FastAPI state - fastapi_app.state.playwright_driver = playwright_driver - fastapi_app.state.pc = pc - fastapi_app.state.summarize_llm = summarize_llm - fastapi_app.state.citation_llm = citation_llm - fastapi_app.state.async_content_scraper = async_content_scraper - - logging.info("Global resources initialized.") - -async def main(req: func.HttpRequest, res: func.Out[func.HttpResponse]) -> func.HttpResponse: - logging.info('Python HTTP trigger function processed a request.') - - try: - if playwright_driver is None: - await initialize_resources() # Make sure Playwright is running - - response = await func.AsgiMiddleware(fastapi_app).handle_async(req) - res.set(response) - logging.info('Request processed successfully') - - except Exception as e: - logging.error(f"Error processing request: {str(e)}") - res.set(func.HttpResponse("Internal server error", status_code=500)) diff --git a/backend/mainService/function_app/function.json b/backend/mainService/function_app/function.json deleted file mode 100644 index 242db6a..0000000 --- a/backend/mainService/function_app/function.json +++ 
/dev/null @@ -1,17 +0,0 @@ -{ - "bindings": [ - { - "authLevel": "anonymous", - "type": "httpTrigger", - "direction": "in", - "name": "req", - "methods": ["get", "post"], - "route": "{*route}" - }, - { - "type": "http", - "direction": "out", - "name": "res" - } - ] -} \ No newline at end of file diff --git a/backend/mainService/host.json b/backend/mainService/host.json deleted file mode 100644 index e3b6a9a..0000000 --- a/backend/mainService/host.json +++ /dev/null @@ -1,21 +0,0 @@ -{ - "version": "2.0", - "logging": { - "applicationInsights": { - "samplingSettings": { - "isEnabled": true, - "excludedTypes": "Request" - } - } - }, - "extensionBundle": { - "id": "Microsoft.Azure.Functions.ExtensionBundle", - "version": "[4.*, 5.0.0)" - }, - "extensions": { - "http": { - "routePrefix": "", - "maxOutstandingRequests": 100 - } - } -} \ No newline at end of file diff --git a/backend/mainService/src/config/config.py b/backend/mainService/src/config/config.py index d500218..778a5a4 100644 --- a/backend/mainService/src/config/config.py +++ b/backend/mainService/src/config/config.py @@ -34,14 +34,15 @@ class ScraperConfig: """ TIMEOUT_DURATION: int = 10000 # Increased from 10000 to 30000 (30 seconds) - # Define the main downloads directory + """ + This is the path to the directory where the downloads will be stored. + """ MAIN_DOWNLOADS_DIR_PATH: str = os.path.join("/tmp", "downloads") - CURRENT_FILE_PATH = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) # Go up one level from 'mainservice' - - os.path.dirname("...") # Go up one level from 'src' - - PLAYWRIGHT_EXE_PATH=os.path.join(os.path.dirname(os.path.realpath(CURRENT_FILE_PATH)), 'playwright_browser', 'chromium_headless_shell-1161', 'chrome-linux', 'headless_shell') + """ + This is the path to the playwright executable. 
+ """ + PLAYWRIGHT_EXE_PATH=None # set to None if you want to use the default playwright executable def __post_init__(self): if self.MAX_FILE_SIZE <= 0: @@ -95,14 +96,6 @@ class LlmConfig: """ UPSERT_BATCH_SIZE: int = 1000 - """ - This is the llm that open router uses for generating the intext citation and reference list for each query - """ - OPEN_ROUTER_MODEL: str = "meta-llama/llama-3.3-70b-instruct:free" - - """ - This is the azure model api endpoint - """ # Concurrency and Performance @@ -111,12 +104,24 @@ class ConcurrencyConfig: """Configuration class for concurrency settings.""" # General concurrency settings + """ + This is the number of concurrent workers that will be used to process the source documents. + """ DEFAULT_CONCURRENT_WORKERS: int = (os.cpu_count() // 2) + 1 - HANDLE_INDEX_DELETE_WORKERS: int = 2 - # Credibility service specific settings + """ + This is the maximum number of threads that will be used to calculate the credibility of the source documents. + """ CREDIBILITY_MAX_THREADS: int = 4 # Maximum threads for credibility calculations + + """ + This is the maximum number of concurrent operations that will be used to calculate the credibility of the source documents. + """ CREDIBILITY_MAX_CONCURRENT: int = 8 # Maximum concurrent operations + + """ + This is the size of the processing batches that will be used to calculate the credibility of the source documents. + """ CREDIBILITY_BATCH_SIZE: int = 4 # Size of processing batches @@ -127,13 +132,23 @@ class ModelConfig: Contains settings specific to AI models and their deployment.""" """Configuration for ML models and APIs.""" - MODEL_ID: str = "BAAI/bge-m3" - MODEL_API_URL: str = f"https://api-inference.huggingface.co/pipeline/feature-extraction/{MODEL_ID}" - # LLM Generation Parameters - DEFAULT_TEMPERATURE: float = 0.5 - DEFAULT_TOP_P: float = 1.0 - DEFAULT_MAX_TOKENS: int = 1024 + """ + This is the temperature for the citation LLM. 
+ """ + CITE_LLM_TEMPERATURE: float = 0.1 + """ + This is the temperature for the summarize LLM. + """ + SUMMARIZE_LLM_TEMPERATURE: float = 0.9 + """ + This is the top p for the citation LLM. + """ + CITE_LLM_TOP_P: float = 0.1 + """ + This is the top p for the summarize LLM. + """ + SUMMARIZE_LLM_TOP_P: float = 0.1 @dataclass diff --git a/backend/mainService/src/llm/chat_llm/Azure_llm.py b/backend/mainService/src/llm/chat_llm/Azure_llm.py index 1e8bb84..3752ca7 100644 --- a/backend/mainService/src/llm/chat_llm/Azure_llm.py +++ b/backend/mainService/src/llm/chat_llm/Azure_llm.py @@ -15,7 +15,7 @@ from src.custom_exceptions.llm_exceptions import CitationGenerationError import logging from concurrent.futures import ThreadPoolExecutor -from src.config.config import concurrency_config +from src.config.config import concurrency_config, model_config logging.getLogger("azure.core.pipeline.policies.http_logging_policy").setLevel( logging.WARNING) @@ -128,8 +128,11 @@ def _blocking_citation_request( Dict[str, Any]: Raw API response containing citation data """ try: - response: ChatCompletions = self.client.complete(messages=messages, model=( - model_name or self.model_name), temperature=0.1, top_p=0.1) + response: ChatCompletions = self.client.complete( + messages=messages, + model=(model_name or self.model_name), + temperature=model_config.CITE_LLM_TEMPERATURE, + top_p=model_config.DEFAULT_TOP_P) response_content = response.choices[0].message.content # amazonq-ignore-next-line response_content = response_content.strip() diff --git a/backend/mainService/src/llm/chat_llm/Groq_llm.py b/backend/mainService/src/llm/chat_llm/Groq_llm.py index a2a6370..960381a 100644 --- a/backend/mainService/src/llm/chat_llm/Groq_llm.py +++ b/backend/mainService/src/llm/chat_llm/Groq_llm.py @@ -6,6 +6,7 @@ from typing import Optional from json.decoder import JSONDecodeError from src.custom_exceptions.llm_exceptions import SearchKeyGenerationError +from src.config.config import model_config 
filename = os.path.basename(__file__) logger = setup_logging(filename=filename) @@ -59,9 +60,9 @@ def getKeywordSearchTerm(self, document: str, proposed_title: Optional[str] = No "content": f"summarize the provided into a google search term and return a json response as 'search_term : value', if no content provided, your response should be 'message:no content to summarize'.{document}" }, ], - temperature=0.9, - top_p=1, - max_tokens=1024, + temperature=model_config.SUMMARIZE_LLM_TEMPERATURE, + top_p=model_config.DEFAULT_TOP_P, + max_tokens=200, stream=False, stop=None, response_format={"type": "json_object"} From 28b4991a60a6460a221b4feff67dc28d02356236 Mon Sep 17 00:00:00 2001 From: Ikeoluwa Oladele Date: Sat, 29 Mar 2025 17:47:52 -0400 Subject: [PATCH 09/10] use a dedicated top p for cite me llm and summarize_llm --- backend/mainService/src/llm/chat_llm/Azure_llm.py | 2 +- backend/mainService/src/llm/chat_llm/Groq_llm.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/mainService/src/llm/chat_llm/Azure_llm.py b/backend/mainService/src/llm/chat_llm/Azure_llm.py index 3752ca7..dd346b8 100644 --- a/backend/mainService/src/llm/chat_llm/Azure_llm.py +++ b/backend/mainService/src/llm/chat_llm/Azure_llm.py @@ -132,7 +132,7 @@ def _blocking_citation_request( messages=messages, model=(model_name or self.model_name), temperature=model_config.CITE_LLM_TEMPERATURE, - top_p=model_config.DEFAULT_TOP_P) + top_p=model_config.CITE_LLM_TOP_P) response_content = response.choices[0].message.content # amazonq-ignore-next-line response_content = response_content.strip() diff --git a/backend/mainService/src/llm/chat_llm/Groq_llm.py b/backend/mainService/src/llm/chat_llm/Groq_llm.py index 960381a..e35e42b 100644 --- a/backend/mainService/src/llm/chat_llm/Groq_llm.py +++ b/backend/mainService/src/llm/chat_llm/Groq_llm.py @@ -61,7 +61,7 @@ def getKeywordSearchTerm(self, document: str, proposed_title: Optional[str] = No }, ], 
temperature=model_config.SUMMARIZE_LLM_TEMPERATURE, - top_p=model_config.DEFAULT_TOP_P, + top_p=model_config.SUMMARIZE_LLM_TOP_P, max_tokens=200, stream=False, stop=None, From 6e2110324284d0a682b8d80d64ae342be81e0a0a Mon Sep 17 00:00:00 2001 From: Ikeoluwa Oladele Date: Sat, 29 Mar 2025 17:55:06 -0400 Subject: [PATCH 10/10] Ensure .gitignore, README.md, and docker-compose.yml always come from main --- .gitattributes | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 .gitattributes diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..38cd5be --- /dev/null +++ b/.gitattributes @@ -0,0 +1,3 @@ +.gitignore merge=ours +README.md merge=ours +docker-compose.yml merge=ours