From 9e1ee62dfe12130bc4403dce5224bc3d45d23f99 Mon Sep 17 00:00:00 2001 From: Ikeoluwa Oladele Date: Fri, 28 Mar 2025 15:39:14 -0400 Subject: [PATCH 01/10] azure function ready --- backend/mainService/.funcignore | 16 +++ backend/mainService/.gitignore | 135 ++++++++++++++++++ backend/mainService/function_app/__init__.py | 41 ++++++ .../mainService/function_app/function.json | 17 +++ backend/mainService/host.json | 21 +++ backend/mainService/requirements.txt | 1 + backend/mainService/src/config/config.py | 2 +- 7 files changed, 232 insertions(+), 1 deletion(-) create mode 100644 backend/mainService/.funcignore create mode 100644 backend/mainService/.gitignore create mode 100644 backend/mainService/function_app/__init__.py create mode 100644 backend/mainService/function_app/function.json create mode 100644 backend/mainService/host.json diff --git a/backend/mainService/.funcignore b/backend/mainService/.funcignore new file mode 100644 index 0000000..c5a4d9a --- /dev/null +++ b/backend/mainService/.funcignore @@ -0,0 +1,16 @@ +.git* +.vscode +__azurite_db*__.json +__blobstorage__ +__queuestorage__ +local.settings.json +test +venv +.git +.vscode +.env +.env.test +.gitignore +*.ini +*.pyc +__pycache__ \ No newline at end of file diff --git a/backend/mainService/.gitignore b/backend/mainService/.gitignore new file mode 100644 index 0000000..7685fc4 --- /dev/null +++ b/backend/mainService/.gitignore @@ -0,0 +1,135 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don’t work, or not +# install all needed dependencies. +#Pipfile.lock + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# Azure Functions artifacts +bin +obj +appsettings.json +local.settings.json + +# Azurite artifacts +__blobstorage__ +__queuestorage__ +__azurite_db*__.json +.python_packages \ No newline at end of file diff --git a/backend/mainService/function_app/__init__.py b/backend/mainService/function_app/__init__.py new file mode 100644 index 0000000..28a5db4 --- /dev/null +++ b/backend/mainService/function_app/__init__.py @@ -0,0 +1,41 @@ +import azure.functions as func +import logging +from app import app as fastapi_app # Import the FastAPI app from app.py +from src.config.startup import startup_event +from src.llm.Pinecone import PineconeOperations +from src.llm.chat_llm.Groq_llm import 
Summarize_llm +from src.llm.chat_llm.Azure_llm import Citation +from src.scraper.async_content_scraper import AsyncContentScraper +from src.config.playwright_driver import PlaywrightDriver as ASD +from src.config.async_http_session import AsyncHTTPClient +from src.utils.concurrent_resources import cleanup_resources +import nltk +from dotenv import load_dotenv + +# Initialize NLTK data and environment variables (these are safe to do at module level) +load_dotenv() +nltk.download('punkt') +nltk.download('punkt_tab') + +async def main(req: func.HttpRequest, res: func.Out[func.HttpResponse]) -> func.HttpResponse: + logging.info('Python HTTP trigger function processed a request.') + + # Initialize resources for this request + playwright_driver = await ASD.create() + pc = await PineconeOperations.create() + summarize_llm = Summarize_llm() + citation_llm = Citation() + + # Initialize content scraper + async with AsyncContentScraper(playwright_driver=playwright_driver) as content_scraper: + # Set up app state for this request + fastapi_app.state.playwright_driver = playwright_driver + fastapi_app.state.pc = pc + fastapi_app.state.summarize_llm = summarize_llm + fastapi_app.state.citation_llm = citation_llm + fastapi_app.state.async_content_scraper = content_scraper + + # Handle the request + response = await func.AsgiMiddleware(fastapi_app).handle_async(req) + res.set(response) + \ No newline at end of file diff --git a/backend/mainService/function_app/function.json b/backend/mainService/function_app/function.json new file mode 100644 index 0000000..512c98b --- /dev/null +++ b/backend/mainService/function_app/function.json @@ -0,0 +1,17 @@ +{ + "bindings": [ + { + "authLevel": "function", + "type": "httpTrigger", + "direction": "in", + "name": "req", + "methods": ["get", "post"], + "route": "{*route}" + }, + { + "type": "http", + "direction": "out", + "name": "res" + } + ] +} \ No newline at end of file diff --git a/backend/mainService/host.json 
b/backend/mainService/host.json new file mode 100644 index 0000000..e3b6a9a --- /dev/null +++ b/backend/mainService/host.json @@ -0,0 +1,21 @@ +{ + "version": "2.0", + "logging": { + "applicationInsights": { + "samplingSettings": { + "isEnabled": true, + "excludedTypes": "Request" + } + } + }, + "extensionBundle": { + "id": "Microsoft.Azure.Functions.ExtensionBundle", + "version": "[4.*, 5.0.0)" + }, + "extensions": { + "http": { + "routePrefix": "", + "maxOutstandingRequests": 100 + } + } +} \ No newline at end of file diff --git a/backend/mainService/requirements.txt b/backend/mainService/requirements.txt index a5189a8..04a3d93 100644 --- a/backend/mainService/requirements.txt +++ b/backend/mainService/requirements.txt @@ -27,4 +27,5 @@ uvicorn httpx>=0.28.1 pypdf pypdf2 +azure-functions diff --git a/backend/mainService/src/config/config.py b/backend/mainService/src/config/config.py index 7ca0a07..98527da 100644 --- a/backend/mainService/src/config/config.py +++ b/backend/mainService/src/config/config.py @@ -32,7 +32,7 @@ class ScraperConfig: """ This is the timeout duration for the requests made to the web scraper """ - TIMEOUT_DURATION: int = 10000 + TIMEOUT_DURATION: int = 30000 # Increased from 10000 to 30000 (30 seconds) def __post_init__(self): if self.MAX_FILE_SIZE <= 0: From 96b41332718ee382222b2f3fecd204e1de5de900 Mon Sep 17 00:00:00 2001 From: Ikeoluwa Oladele Date: Fri, 28 Mar 2025 19:39:57 -0400 Subject: [PATCH 02/10] before creating a main downloads directory in content scraper --- backend/mainService/function_app/__init__.py | 56 +++++++++++++------ .../src/scraper/async_content_scraper.py | 27 ++++++--- backend/metricsService/requirements.txt | 2 +- 3 files changed, 58 insertions(+), 27 deletions(-) diff --git a/backend/mainService/function_app/__init__.py b/backend/mainService/function_app/__init__.py index 28a5db4..8f14f0e 100644 --- a/backend/mainService/function_app/__init__.py +++ b/backend/mainService/function_app/__init__.py @@ -11,31 +11,53 
@@ from src.utils.concurrent_resources import cleanup_resources import nltk from dotenv import load_dotenv +import asyncio +from contextlib import asynccontextmanager # Initialize NLTK data and environment variables (these are safe to do at module level) load_dotenv() nltk.download('punkt') nltk.download('punkt_tab') -async def main(req: func.HttpRequest, res: func.Out[func.HttpResponse]) -> func.HttpResponse: - logging.info('Python HTTP trigger function processed a request.') - - # Initialize resources for this request +@asynccontextmanager +async def get_resources(): + # Initialize resources playwright_driver = await ASD.create() pc = await PineconeOperations.create() summarize_llm = Summarize_llm() citation_llm = Citation() - # Initialize content scraper - async with AsyncContentScraper(playwright_driver=playwright_driver) as content_scraper: - # Set up app state for this request - fastapi_app.state.playwright_driver = playwright_driver - fastapi_app.state.pc = pc - fastapi_app.state.summarize_llm = summarize_llm - fastapi_app.state.citation_llm = citation_llm - fastapi_app.state.async_content_scraper = content_scraper - - # Handle the request - response = await func.AsgiMiddleware(fastapi_app).handle_async(req) - res.set(response) - \ No newline at end of file + try: + async with AsyncContentScraper(playwright_driver=playwright_driver) as content_scraper: + # Set up app state + fastapi_app.state.playwright_driver = playwright_driver + fastapi_app.state.pc = pc + fastapi_app.state.summarize_llm = summarize_llm + fastapi_app.state.citation_llm = citation_llm + fastapi_app.state.async_content_scraper = content_scraper + yield + finally: + # Ensure resources are cleaned up + await asyncio.gather( + playwright_driver.quit(), + pc.cleanup(), + cleanup_resources(), + AsyncHTTPClient.close_session(), + return_exceptions=True + ) + logging.info("Resources cleaned up successfully") + +async def main(req: func.HttpRequest, res: func.Out[func.HttpResponse]) -> 
func.HttpResponse: + logging.info('Python HTTP trigger function processed a request.') + + try: + async with get_resources(): + response = await func.AsgiMiddleware(fastapi_app).handle_async(req) + res.set(response) + logging.info('Request processed successfully') + except Exception as e: + logging.error(f"Error processing request: {str(e)}") + res.set(func.HttpResponse( + "Internal server error", + status_code=500 + )) diff --git a/backend/mainService/src/scraper/async_content_scraper.py b/backend/mainService/src/scraper/async_content_scraper.py index a0042da..bdcb205 100644 --- a/backend/mainService/src/scraper/async_content_scraper.py +++ b/backend/mainService/src/scraper/async_content_scraper.py @@ -35,6 +35,10 @@ log_filename = os.path.basename(__file__) logger = setup_logging(filename=log_filename) +# Define the main downloads directory +MAIN_DOWNLOADS_DIR = os.path.join(os.getcwd(), "downloads") +os.makedirs(MAIN_DOWNLOADS_DIR, exist_ok=True) + """ Citation Content Scraper Module @@ -136,15 +140,19 @@ async def get_pdf(self, parsed_url = parse_url(target_url) base_url = f"{parsed_url.scheme}://{parsed_url.host}" - # Set up download path + # Set up download path in the main downloads directory if not storage_path: - default_path = parsed_url.host + \ - str(datetime.now(tz.utc).strftime("%d_%m_%Y_%H_%M_%S")) - storage_path = os.path.join( - os.getcwd(), "downloads", default_path) + # Create a subdirectory for this request + request_dir = os.path.join( + MAIN_DOWNLOADS_DIR, + f"{parsed_url.host}_{datetime.now(tz.utc).strftime('%d_%m_%Y_%H_%M_%S')}" + ) + storage_path = request_dir else: - storage_path = os.path.abspath(storage_path) - + # If storage_path is provided, create it as a subdirectory of MAIN_DOWNLOADS_DIR + storage_path = os.path.join(MAIN_DOWNLOADS_DIR, storage_path) + + storage_path = os.path.abspath(storage_path) self.current_download_path = storage_path # Check robots.txt @@ -187,8 +195,9 @@ async def get_pdfs(self, """ results = {"count": 0, 
"paths": {}, "storage_path": None} - storage_path = storage_path + \ - str(datetime.now(tz.utc).strftime("%d_%m_%Y_%H_%M_%S")) if storage_path else None + # Create a unique subdirectory for this batch of downloads + if storage_path: + storage_path = f"{storage_path}_{datetime.now(tz.utc).strftime('%d_%m_%Y_%H_%M_%S')}" # Create tasks for all downloads tasks = [self.get_pdf(url, storage_path) for url in target_urls] diff --git a/backend/metricsService/requirements.txt b/backend/metricsService/requirements.txt index 3d56cd0..cb4dba7 100644 --- a/backend/metricsService/requirements.txt +++ b/backend/metricsService/requirements.txt @@ -7,4 +7,4 @@ python-dotenv==1.0.1 Requests==2.32.3 scholarly==1.7.11 uvicorn - +azure-functions From 483e1f80b61aa60195d484904f03e6ba15d08ad2 Mon Sep 17 00:00:00 2001 From: Ikeoluwa Oladele Date: Fri, 28 Mar 2025 19:40:33 -0400 Subject: [PATCH 03/10] before creating a main downloads directory in content scraper --- backend/mainService/function_app/func.ignore | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 backend/mainService/function_app/func.ignore diff --git a/backend/mainService/function_app/func.ignore b/backend/mainService/function_app/func.ignore new file mode 100644 index 0000000..7f8fee6 --- /dev/null +++ b/backend/mainService/function_app/func.ignore @@ -0,0 +1,3 @@ +# Ignore downloads directory to prevent function restarts +downloads/ +*.pdf \ No newline at end of file From 3ba2afabc8a24dc260297fd76c6445e2f9c33bbb Mon Sep 17 00:00:00 2001 From: Ikeoluwa Oladele Date: Fri, 28 Mar 2025 19:41:14 -0400 Subject: [PATCH 04/10] before creating a main downloads directory in content scraper, we added the metric files that didn't get staged in the last commit --- .../metricsService/function_app/__init__.py | 11 ++++++++++ .../metricsService/function_app/function.json | 17 +++++++++++++++ backend/metricsService/host.json | 21 +++++++++++++++++++ backend/metricsService/local.settings.json | 7 +++++++ 4 files changed, 56 
insertions(+) create mode 100644 backend/metricsService/function_app/__init__.py create mode 100644 backend/metricsService/function_app/function.json create mode 100644 backend/metricsService/host.json create mode 100644 backend/metricsService/local.settings.json diff --git a/backend/metricsService/function_app/__init__.py b/backend/metricsService/function_app/__init__.py new file mode 100644 index 0000000..dc2fdd9 --- /dev/null +++ b/backend/metricsService/function_app/__init__.py @@ -0,0 +1,11 @@ +import azure.functions as func +import logging +from app import app as fastapi_app # Import the FastAPI app from app.py +from dotenv import load_dotenv + +load_dotenv() + +async def main(req: func.HttpRequest, res:func.Out[func.HttpResponse]) -> None: + logging.info('Python HTTP trigger function processed a request.') + response = await func.AsgiMiddleware(fastapi_app).handle_async(req) + res.set(response) \ No newline at end of file diff --git a/backend/metricsService/function_app/function.json b/backend/metricsService/function_app/function.json new file mode 100644 index 0000000..512c98b --- /dev/null +++ b/backend/metricsService/function_app/function.json @@ -0,0 +1,17 @@ +{ + "bindings": [ + { + "authLevel": "function", + "type": "httpTrigger", + "direction": "in", + "name": "req", + "methods": ["get", "post"], + "route": "{*route}" + }, + { + "type": "http", + "direction": "out", + "name": "res" + } + ] +} \ No newline at end of file diff --git a/backend/metricsService/host.json b/backend/metricsService/host.json new file mode 100644 index 0000000..e3b6a9a --- /dev/null +++ b/backend/metricsService/host.json @@ -0,0 +1,21 @@ +{ + "version": "2.0", + "logging": { + "applicationInsights": { + "samplingSettings": { + "isEnabled": true, + "excludedTypes": "Request" + } + } + }, + "extensionBundle": { + "id": "Microsoft.Azure.Functions.ExtensionBundle", + "version": "[4.*, 5.0.0)" + }, + "extensions": { + "http": { + "routePrefix": "", + "maxOutstandingRequests": 100 + 
} + } +} \ No newline at end of file diff --git a/backend/metricsService/local.settings.json b/backend/metricsService/local.settings.json new file mode 100644 index 0000000..4b4cfce --- /dev/null +++ b/backend/metricsService/local.settings.json @@ -0,0 +1,7 @@ +{ + "IsEncrypted": false, + "Values": { + "AzureWebJobsStorage": "", + "FUNCTIONS_WORKER_RUNTIME": "python" + } +} \ No newline at end of file From a472665a739ea7519749a4282c2553b49538558f Mon Sep 17 00:00:00 2001 From: Ikeoluwa Oladele Date: Fri, 28 Mar 2025 20:18:30 -0400 Subject: [PATCH 05/10] stable working version ? --- .gitignore | 138 ++++++++++++++++++ backend/mainService/.funcignore | 6 +- backend/mainService/.gitignore | 135 ----------------- backend/mainService/function_app/__init__.py | 3 +- backend/mainService/function_app/func.ignore | 3 - backend/mainService/src/config/config.py | 6 +- .../src/scraper/async_content_scraper.py | 8 +- .../metricsService/function_app/__init__.py | 6 +- 8 files changed, 155 insertions(+), 150 deletions(-) delete mode 100644 backend/mainService/.gitignore delete mode 100644 backend/mainService/function_app/func.ignore diff --git a/.gitignore b/.gitignore index 1fbafed..45adcd1 100644 --- a/.gitignore +++ b/.gitignore @@ -60,3 +60,141 @@ testing_workflow.py *.yaml scripts/ +downloads/ +*.pdf + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don’t work, or not +# install all needed dependencies. +#Pipfile.lock + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# Azure Functions artifacts +bin +obj +appsettings.json +local.settings.json + +# Azurite artifacts +__blobstorage__ +__queuestorage__ +__azurite_db*__.json +.python_packages diff --git a/backend/mainService/.funcignore b/backend/mainService/.funcignore index c5a4d9a..eb0b235 100644 --- a/backend/mainService/.funcignore +++ b/backend/mainService/.funcignore @@ -13,4 +13,8 @@ venv .gitignore *.ini *.pyc -__pycache__ \ No newline at end of file +__pycache__ + +# Ignore downloads directory to prevent function restarts +downloads/ +*.pdf \ No newline at end of file diff --git a/backend/mainService/.gitignore b/backend/mainService/.gitignore deleted file mode 100644 index 
7685fc4..0000000 --- a/backend/mainService/.gitignore +++ /dev/null @@ -1,135 +0,0 @@ -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -pip-wheel-metadata/ -share/python-wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -.hypothesis/ -.pytest_cache/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - -# pyenv -.python-version - -# pipenv -# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. -# However, in case of collaboration, if having platform-specific dependencies or dependencies -# having no cross-platform support, pipenv may install dependencies that don’t work, or not -# install all needed dependencies. 
-#Pipfile.lock - -# celery beat schedule file -celerybeat-schedule - -# SageMath parsed files -*.sage.py - -# Environments -.env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# Pyre type checker -.pyre/ - -# Azure Functions artifacts -bin -obj -appsettings.json -local.settings.json - -# Azurite artifacts -__blobstorage__ -__queuestorage__ -__azurite_db*__.json -.python_packages \ No newline at end of file diff --git a/backend/mainService/function_app/__init__.py b/backend/mainService/function_app/__init__.py index 8f14f0e..4b1c4bf 100644 --- a/backend/mainService/function_app/__init__.py +++ b/backend/mainService/function_app/__init__.py @@ -41,10 +41,9 @@ async def get_resources(): await asyncio.gather( playwright_driver.quit(), pc.cleanup(), - cleanup_resources(), - AsyncHTTPClient.close_session(), return_exceptions=True ) + cleanup_resources() logging.info("Resources cleaned up successfully") async def main(req: func.HttpRequest, res: func.Out[func.HttpResponse]) -> func.HttpResponse: diff --git a/backend/mainService/function_app/func.ignore b/backend/mainService/function_app/func.ignore deleted file mode 100644 index 7f8fee6..0000000 --- a/backend/mainService/function_app/func.ignore +++ /dev/null @@ -1,3 +0,0 @@ -# Ignore downloads directory to prevent function restarts -downloads/ -*.pdf \ No newline at end of file diff --git a/backend/mainService/src/config/config.py b/backend/mainService/src/config/config.py index 98527da..ceedb48 100644 --- a/backend/mainService/src/config/config.py +++ b/backend/mainService/src/config/config.py @@ -32,13 +32,17 @@ class ScraperConfig: """ This is the timeout duration for the requests made to the web scraper """ - TIMEOUT_DURATION: int = 30000 # Increased from 10000 to 30000 (30 seconds) + TIMEOUT_DURATION: int = 10000 # Increased from 10000 
to 30000 (30 seconds) + + # Define the main downloads directory + MAIN_DOWNLOADS_DIR_PATH: str = os.path.join(os.getcwd(), "downloads") def __post_init__(self): if self.MAX_FILE_SIZE <= 0: raise ValueError("MAX_FILE_SIZE must be positive") if self.TIMEOUT_DURATION <= 0: raise ValueError("TIMEOUT_DURATION must be positive") + os.makedirs(self.MAIN_DOWNLOADS_DIR_PATH, exist_ok=True) @dataclass diff --git a/backend/mainService/src/scraper/async_content_scraper.py b/backend/mainService/src/scraper/async_content_scraper.py index bdcb205..fead8df 100644 --- a/backend/mainService/src/scraper/async_content_scraper.py +++ b/backend/mainService/src/scraper/async_content_scraper.py @@ -30,14 +30,12 @@ from playwright.async_api import Browser, BrowserContext from src.config.log_config import setup_logging from datetime import timezone as tz +from src.config.config import scraper_config log_filename = os.path.basename(__file__) logger = setup_logging(filename=log_filename) -# Define the main downloads directory -MAIN_DOWNLOADS_DIR = os.path.join(os.getcwd(), "downloads") -os.makedirs(MAIN_DOWNLOADS_DIR, exist_ok=True) """ Citation Content Scraper Module @@ -144,13 +142,13 @@ async def get_pdf(self, if not storage_path: # Create a subdirectory for this request request_dir = os.path.join( - MAIN_DOWNLOADS_DIR, + scraper_config.MAIN_DOWNLOADS_DIR_PATH, f"{parsed_url.host}_{datetime.now(tz.utc).strftime('%d_%m_%Y_%H_%M_%S')}" ) storage_path = request_dir else: # If storage_path is provided, create it as a subdirectory of MAIN_DOWNLOADS_DIR - storage_path = os.path.join(MAIN_DOWNLOADS_DIR, storage_path) + storage_path = os.path.join(scraper_config.MAIN_DOWNLOADS_DIR_PATH, storage_path) storage_path = os.path.abspath(storage_path) self.current_download_path = storage_path diff --git a/backend/metricsService/function_app/__init__.py b/backend/metricsService/function_app/__init__.py index dc2fdd9..96dc26c 100644 --- a/backend/metricsService/function_app/__init__.py +++ 
b/backend/metricsService/function_app/__init__.py @@ -1,11 +1,11 @@ import azure.functions as func import logging -from app import app as fastapi_app # Import the FastAPI app from app.py +from main import app as fastapi_app # Import the FastAPI app from app.py from dotenv import load_dotenv load_dotenv() -async def main(req: func.HttpRequest, res:func.Out[func.HttpResponse]) -> None: +async def main(req: func.HttpRequest, res:func.Out[func.HttpResponse]) -> func.HttpResponse: logging.info('Python HTTP trigger function processed a request.') - response = await func.AsgiMiddleware(fastapi_app).handle_async(req) + response = await func.AsgiMiddleware(app=fastapi_app).handle_async(req) res.set(response) \ No newline at end of file From 44a649f9198587ccbdc668dabada5cdc2d5531a9 Mon Sep 17 00:00:00 2001 From: Ikeoluwa Oladele Date: Sat, 29 Mar 2025 02:14:46 -0400 Subject: [PATCH 06/10] deployed this version to azure --- .gitignore | 2 + backend/mainService/.funcignore | 3 + backend/mainService/function_app/__init__.py | 80 ++++++++++--------- .../mainService/function_app/function.json | 2 +- backend/mainService/src/config/config.py | 8 +- .../src/config/playwright_driver.py | 3 +- backend/metricsService/.funcignore | 8 ++ .../metricsService/function_app/function.json | 2 +- 8 files changed, 65 insertions(+), 43 deletions(-) create mode 100644 backend/metricsService/.funcignore diff --git a/.gitignore b/.gitignore index 45adcd1..42b3671 100644 --- a/.gitignore +++ b/.gitignore @@ -198,3 +198,5 @@ __blobstorage__ __queuestorage__ __azurite_db*__.json .python_packages + +playwright_browser/ \ No newline at end of file diff --git a/backend/mainService/.funcignore b/backend/mainService/.funcignore index eb0b235..4b56b19 100644 --- a/backend/mainService/.funcignore +++ b/backend/mainService/.funcignore @@ -6,6 +6,7 @@ __queuestorage__ local.settings.json test venv +.venv .git .vscode .env @@ -14,6 +15,8 @@ venv *.ini *.pyc __pycache__ +pytest.ini +pytest_cache # Ignore 
downloads directory to prevent function restarts downloads/ diff --git a/backend/mainService/function_app/__init__.py b/backend/mainService/function_app/__init__.py index 4b1c4bf..651ceb6 100644 --- a/backend/mainService/function_app/__init__.py +++ b/backend/mainService/function_app/__init__.py @@ -1,62 +1,64 @@ import azure.functions as func import logging -from app import app as fastapi_app # Import the FastAPI app from app.py -from src.config.startup import startup_event +from app import app as fastapi_app # Import FastAPI app +from src.config.playwright_driver import PlaywrightDriver as ASD from src.llm.Pinecone import PineconeOperations from src.llm.chat_llm.Groq_llm import Summarize_llm from src.llm.chat_llm.Azure_llm import Citation from src.scraper.async_content_scraper import AsyncContentScraper -from src.config.playwright_driver import PlaywrightDriver as ASD -from src.config.async_http_session import AsyncHTTPClient -from src.utils.concurrent_resources import cleanup_resources -import nltk from dotenv import load_dotenv +import nltk import asyncio -from contextlib import asynccontextmanager -# Initialize NLTK data and environment variables (these are safe to do at module level) +# Load environment variables and NLTK data load_dotenv() nltk.download('punkt') nltk.download('punkt_tab') -@asynccontextmanager -async def get_resources(): - # Initialize resources - playwright_driver = await ASD.create() - pc = await PineconeOperations.create() - summarize_llm = Summarize_llm() - citation_llm = Citation() - - try: - async with AsyncContentScraper(playwright_driver=playwright_driver) as content_scraper: - # Set up app state +# Global variables for resources +playwright_driver = None +pc = None +summarize_llm = None +citation_llm = None +async_content_scraper = None +resource_lock = asyncio.Lock() # Prevent race conditions + +async def initialize_resources(): + """ + Initializes global resources only once and prevents multiple concurrent initializations. 
+ """ + global playwright_driver, pc, summarize_llm, citation_llm, async_content_scraper + + async with resource_lock: # Prevent multiple requests from initializing Playwright at the same time + if playwright_driver is None: + logging.info("Initializing Playwright and other global resources...") + + playwright_driver = await ASD.create() + pc = await PineconeOperations.create() + summarize_llm = Summarize_llm() + citation_llm = Citation() + async_content_scraper = await AsyncContentScraper(playwright_driver).__aenter__() + + # Set FastAPI state fastapi_app.state.playwright_driver = playwright_driver fastapi_app.state.pc = pc fastapi_app.state.summarize_llm = summarize_llm fastapi_app.state.citation_llm = citation_llm - fastapi_app.state.async_content_scraper = content_scraper - yield - finally: - # Ensure resources are cleaned up - await asyncio.gather( - playwright_driver.quit(), - pc.cleanup(), - return_exceptions=True - ) - cleanup_resources() - logging.info("Resources cleaned up successfully") + fastapi_app.state.async_content_scraper = async_content_scraper + + logging.info("Global resources initialized.") async def main(req: func.HttpRequest, res: func.Out[func.HttpResponse]) -> func.HttpResponse: logging.info('Python HTTP trigger function processed a request.') - + try: - async with get_resources(): - response = await func.AsgiMiddleware(fastapi_app).handle_async(req) - res.set(response) - logging.info('Request processed successfully') + if playwright_driver is None: + await initialize_resources() # Make sure Playwright is running + + response = await func.AsgiMiddleware(fastapi_app).handle_async(req) + res.set(response) + logging.info('Request processed successfully') + except Exception as e: logging.error(f"Error processing request: {str(e)}") - res.set(func.HttpResponse( - "Internal server error", - status_code=500 - )) + res.set(func.HttpResponse("Internal server error", status_code=500)) diff --git a/backend/mainService/function_app/function.json 
b/backend/mainService/function_app/function.json index 512c98b..242db6a 100644 --- a/backend/mainService/function_app/function.json +++ b/backend/mainService/function_app/function.json @@ -1,7 +1,7 @@ { "bindings": [ { - "authLevel": "function", + "authLevel": "anonymous", "type": "httpTrigger", "direction": "in", "name": "req", diff --git a/backend/mainService/src/config/config.py b/backend/mainService/src/config/config.py index ceedb48..d500218 100644 --- a/backend/mainService/src/config/config.py +++ b/backend/mainService/src/config/config.py @@ -35,7 +35,13 @@ class ScraperConfig: TIMEOUT_DURATION: int = 10000 # Increased from 10000 to 30000 (30 seconds) # Define the main downloads directory - MAIN_DOWNLOADS_DIR_PATH: str = os.path.join(os.getcwd(), "downloads") + MAIN_DOWNLOADS_DIR_PATH: str = os.path.join("/tmp", "downloads") + + CURRENT_FILE_PATH = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) # Go up one level from 'mainservice' + + os.path.dirname("...") # Go up one level from 'src' + + PLAYWRIGHT_EXE_PATH=os.path.join(os.path.dirname(os.path.realpath(CURRENT_FILE_PATH)), 'playwright_browser', 'chromium_headless_shell-1161', 'chrome-linux', 'headless_shell') def __post_init__(self): if self.MAX_FILE_SIZE <= 0: diff --git a/backend/mainService/src/config/playwright_driver.py b/backend/mainService/src/config/playwright_driver.py index 1eb5e28..10d8ea1 100644 --- a/backend/mainService/src/config/playwright_driver.py +++ b/backend/mainService/src/config/playwright_driver.py @@ -129,8 +129,9 @@ async def __initialize_browser(self) -> Browser: "--disable-blink-features=AutomationControlled", ] try: + exe_path = scraper_config.PLAYWRIGHT_EXE_PATH or None self._playwright = await async_playwright().start() - self._browser = await self._playwright.chromium.launch(headless=True, args=args) + self._browser = await self._playwright.chromium.launch(headless=True, args=args, executable_path=exe_path) except Exception as e: logger.critical(f"Error while 
initializing browser: {e}") raise e diff --git a/backend/metricsService/.funcignore b/backend/metricsService/.funcignore new file mode 100644 index 0000000..7dda614 --- /dev/null +++ b/backend/metricsService/.funcignore @@ -0,0 +1,8 @@ +.git* +.vscode +__azurite_db*__.json +__blobstorage__ +__queuestorage__ +local.settings.json +test +venv \ No newline at end of file diff --git a/backend/metricsService/function_app/function.json b/backend/metricsService/function_app/function.json index 512c98b..242db6a 100644 --- a/backend/metricsService/function_app/function.json +++ b/backend/metricsService/function_app/function.json @@ -1,7 +1,7 @@ { "bindings": [ { - "authLevel": "function", + "authLevel": "anonymous", "type": "httpTrigger", "direction": "in", "name": "req", From 9060632db7d895aac190fa8c200b72ae66472dee Mon Sep 17 00:00:00 2001 From: Ikeoluwa Oladele Date: Sat, 29 Mar 2025 16:53:25 -0400 Subject: [PATCH 07/10] updated gitignore file --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 1fbafed..8be83c9 100644 --- a/.gitignore +++ b/.gitignore @@ -60,3 +60,6 @@ testing_workflow.py *.yaml scripts/ +playwright_browser +local.settings.json +function_app/ From 68b736bf52b935ada99fe9080e7d9a3d7052e37a Mon Sep 17 00:00:00 2001 From: Ikeoluwa Oladele Date: Sat, 29 Mar 2025 17:41:55 -0400 Subject: [PATCH 08/10] 1. Uses a particular download folder for the pdf download, no cluttering of workdir. 2. Removes older config like openrouter endpoint, now obsolete as it is not being used. 3. Provide option to set playwright binaries 4. 
Cleanup the backend dir --- backend/mainService/.funcignore | 23 ------- backend/mainService/function_app/__init__.py | 64 ------------------- .../mainService/function_app/function.json | 17 ----- backend/mainService/host.json | 21 ------ backend/mainService/src/config/config.py | 59 ++++++++++------- .../mainService/src/llm/chat_llm/Azure_llm.py | 9 ++- .../mainService/src/llm/chat_llm/Groq_llm.py | 7 +- 7 files changed, 47 insertions(+), 153 deletions(-) delete mode 100644 backend/mainService/.funcignore delete mode 100644 backend/mainService/function_app/__init__.py delete mode 100644 backend/mainService/function_app/function.json delete mode 100644 backend/mainService/host.json diff --git a/backend/mainService/.funcignore b/backend/mainService/.funcignore deleted file mode 100644 index 4b56b19..0000000 --- a/backend/mainService/.funcignore +++ /dev/null @@ -1,23 +0,0 @@ -.git* -.vscode -__azurite_db*__.json -__blobstorage__ -__queuestorage__ -local.settings.json -test -venv -.venv -.git -.vscode -.env -.env.test -.gitignore -*.ini -*.pyc -__pycache__ -pytest.ini -pytest_cache - -# Ignore downloads directory to prevent function restarts -downloads/ -*.pdf \ No newline at end of file diff --git a/backend/mainService/function_app/__init__.py b/backend/mainService/function_app/__init__.py deleted file mode 100644 index 651ceb6..0000000 --- a/backend/mainService/function_app/__init__.py +++ /dev/null @@ -1,64 +0,0 @@ -import azure.functions as func -import logging -from app import app as fastapi_app # Import FastAPI app -from src.config.playwright_driver import PlaywrightDriver as ASD -from src.llm.Pinecone import PineconeOperations -from src.llm.chat_llm.Groq_llm import Summarize_llm -from src.llm.chat_llm.Azure_llm import Citation -from src.scraper.async_content_scraper import AsyncContentScraper -from dotenv import load_dotenv -import nltk -import asyncio - -# Load environment variables and NLTK data -load_dotenv() -nltk.download('punkt') 
-nltk.download('punkt_tab') - -# Global variables for resources -playwright_driver = None -pc = None -summarize_llm = None -citation_llm = None -async_content_scraper = None -resource_lock = asyncio.Lock() # Prevent race conditions - -async def initialize_resources(): - """ - Initializes global resources only once and prevents multiple concurrent initializations. - """ - global playwright_driver, pc, summarize_llm, citation_llm, async_content_scraper - - async with resource_lock: # Prevent multiple requests from initializing Playwright at the same time - if playwright_driver is None: - logging.info("Initializing Playwright and other global resources...") - - playwright_driver = await ASD.create() - pc = await PineconeOperations.create() - summarize_llm = Summarize_llm() - citation_llm = Citation() - async_content_scraper = await AsyncContentScraper(playwright_driver).__aenter__() - - # Set FastAPI state - fastapi_app.state.playwright_driver = playwright_driver - fastapi_app.state.pc = pc - fastapi_app.state.summarize_llm = summarize_llm - fastapi_app.state.citation_llm = citation_llm - fastapi_app.state.async_content_scraper = async_content_scraper - - logging.info("Global resources initialized.") - -async def main(req: func.HttpRequest, res: func.Out[func.HttpResponse]) -> func.HttpResponse: - logging.info('Python HTTP trigger function processed a request.') - - try: - if playwright_driver is None: - await initialize_resources() # Make sure Playwright is running - - response = await func.AsgiMiddleware(fastapi_app).handle_async(req) - res.set(response) - logging.info('Request processed successfully') - - except Exception as e: - logging.error(f"Error processing request: {str(e)}") - res.set(func.HttpResponse("Internal server error", status_code=500)) diff --git a/backend/mainService/function_app/function.json b/backend/mainService/function_app/function.json deleted file mode 100644 index 242db6a..0000000 --- a/backend/mainService/function_app/function.json +++ 
/dev/null @@ -1,17 +0,0 @@ -{ - "bindings": [ - { - "authLevel": "anonymous", - "type": "httpTrigger", - "direction": "in", - "name": "req", - "methods": ["get", "post"], - "route": "{*route}" - }, - { - "type": "http", - "direction": "out", - "name": "res" - } - ] -} \ No newline at end of file diff --git a/backend/mainService/host.json b/backend/mainService/host.json deleted file mode 100644 index e3b6a9a..0000000 --- a/backend/mainService/host.json +++ /dev/null @@ -1,21 +0,0 @@ -{ - "version": "2.0", - "logging": { - "applicationInsights": { - "samplingSettings": { - "isEnabled": true, - "excludedTypes": "Request" - } - } - }, - "extensionBundle": { - "id": "Microsoft.Azure.Functions.ExtensionBundle", - "version": "[4.*, 5.0.0)" - }, - "extensions": { - "http": { - "routePrefix": "", - "maxOutstandingRequests": 100 - } - } -} \ No newline at end of file diff --git a/backend/mainService/src/config/config.py b/backend/mainService/src/config/config.py index d500218..778a5a4 100644 --- a/backend/mainService/src/config/config.py +++ b/backend/mainService/src/config/config.py @@ -34,14 +34,15 @@ class ScraperConfig: """ TIMEOUT_DURATION: int = 10000 # Increased from 10000 to 30000 (30 seconds) - # Define the main downloads directory + """ + This is the path to the directory where the downloads will be stored. + """ MAIN_DOWNLOADS_DIR_PATH: str = os.path.join("/tmp", "downloads") - CURRENT_FILE_PATH = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) # Go up one level from 'mainservice' - - os.path.dirname("...") # Go up one level from 'src' - - PLAYWRIGHT_EXE_PATH=os.path.join(os.path.dirname(os.path.realpath(CURRENT_FILE_PATH)), 'playwright_browser', 'chromium_headless_shell-1161', 'chrome-linux', 'headless_shell') + """ + This is the path to the playwright executable. 
+ """ + PLAYWRIGHT_EXE_PATH=None # set to None if you want to use the default playwright executable def __post_init__(self): if self.MAX_FILE_SIZE <= 0: @@ -95,14 +96,6 @@ class LlmConfig: """ UPSERT_BATCH_SIZE: int = 1000 - """ - This is the llm that open router uses for generating the intext citation and reference list for each query - """ - OPEN_ROUTER_MODEL: str = "meta-llama/llama-3.3-70b-instruct:free" - - """ - This is the azure model api endpoint - """ # Concurrency and Performance @@ -111,12 +104,24 @@ class ConcurrencyConfig: """Configuration class for concurrency settings.""" # General concurrency settings + """ + This is the number of concurrent workers that will be used to process the source documents. + """ DEFAULT_CONCURRENT_WORKERS: int = (os.cpu_count() // 2) + 1 - HANDLE_INDEX_DELETE_WORKERS: int = 2 - # Credibility service specific settings + """ + This is the maximum number of threads that will be used to calculate the credibility of the source documents. + """ CREDIBILITY_MAX_THREADS: int = 4 # Maximum threads for credibility calculations + + """ + This is the maximum number of concurrent operations that will be used to calculate the credibility of the source documents. + """ CREDIBILITY_MAX_CONCURRENT: int = 8 # Maximum concurrent operations + + """ + This is the size of the processing batches that will be used to calculate the credibility of the source documents. + """ CREDIBILITY_BATCH_SIZE: int = 4 # Size of processing batches @@ -127,13 +132,23 @@ class ModelConfig: Contains settings specific to AI models and their deployment.""" """Configuration for ML models and APIs.""" - MODEL_ID: str = "BAAI/bge-m3" - MODEL_API_URL: str = f"https://api-inference.huggingface.co/pipeline/feature-extraction/{MODEL_ID}" - # LLM Generation Parameters - DEFAULT_TEMPERATURE: float = 0.5 - DEFAULT_TOP_P: float = 1.0 - DEFAULT_MAX_TOKENS: int = 1024 + """ + This is the temperature for the citation LLM. 
+ """ + CITE_LLM_TEMPERATURE: float = 0.1 + """ + This is the temperature for the summarize LLM. + """ + SUMMARIZE_LLM_TEMPERATURE: float = 0.9 + """ + This is the top p for the citation LLM. + """ + CITE_LLM_TOP_P: float = 0.1 + """ + This is the top p for the summarize LLM. + """ + SUMMARIZE_LLM_TOP_P: float = 0.1 @dataclass diff --git a/backend/mainService/src/llm/chat_llm/Azure_llm.py b/backend/mainService/src/llm/chat_llm/Azure_llm.py index 1e8bb84..3752ca7 100644 --- a/backend/mainService/src/llm/chat_llm/Azure_llm.py +++ b/backend/mainService/src/llm/chat_llm/Azure_llm.py @@ -15,7 +15,7 @@ from src.custom_exceptions.llm_exceptions import CitationGenerationError import logging from concurrent.futures import ThreadPoolExecutor -from src.config.config import concurrency_config +from src.config.config import concurrency_config, model_config logging.getLogger("azure.core.pipeline.policies.http_logging_policy").setLevel( logging.WARNING) @@ -128,8 +128,11 @@ def _blocking_citation_request( Dict[str, Any]: Raw API response containing citation data """ try: - response: ChatCompletions = self.client.complete(messages=messages, model=( - model_name or self.model_name), temperature=0.1, top_p=0.1) + response: ChatCompletions = self.client.complete( + messages=messages, + model=(model_name or self.model_name), + temperature=model_config.CITE_LLM_TEMPERATURE, + top_p=model_config.DEFAULT_TOP_P) response_content = response.choices[0].message.content # amazonq-ignore-next-line response_content = response_content.strip() diff --git a/backend/mainService/src/llm/chat_llm/Groq_llm.py b/backend/mainService/src/llm/chat_llm/Groq_llm.py index a2a6370..960381a 100644 --- a/backend/mainService/src/llm/chat_llm/Groq_llm.py +++ b/backend/mainService/src/llm/chat_llm/Groq_llm.py @@ -6,6 +6,7 @@ from typing import Optional from json.decoder import JSONDecodeError from src.custom_exceptions.llm_exceptions import SearchKeyGenerationError +from src.config.config import model_config 
filename = os.path.basename(__file__) logger = setup_logging(filename=filename) @@ -59,9 +60,9 @@ def getKeywordSearchTerm(self, document: str, proposed_title: Optional[str] = No "content": f"summarize the provided into a google search term and return a json response as 'search_term : value', if no content provided, your response should be 'message:no content to summarize'.{document}" }, ], - temperature=0.9, - top_p=1, - max_tokens=1024, + temperature=model_config.SUMMARIZE_LLM_TEMPERATURE, + top_p=model_config.DEFAULT_TOP_P, + max_tokens=200, stream=False, stop=None, response_format={"type": "json_object"} From 28b4991a60a6460a221b4feff67dc28d02356236 Mon Sep 17 00:00:00 2001 From: Ikeoluwa Oladele Date: Sat, 29 Mar 2025 17:47:52 -0400 Subject: [PATCH 09/10] use a dedicated top p for cite me llm and summarize_llm --- backend/mainService/src/llm/chat_llm/Azure_llm.py | 2 +- backend/mainService/src/llm/chat_llm/Groq_llm.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/mainService/src/llm/chat_llm/Azure_llm.py b/backend/mainService/src/llm/chat_llm/Azure_llm.py index 3752ca7..dd346b8 100644 --- a/backend/mainService/src/llm/chat_llm/Azure_llm.py +++ b/backend/mainService/src/llm/chat_llm/Azure_llm.py @@ -132,7 +132,7 @@ def _blocking_citation_request( messages=messages, model=(model_name or self.model_name), temperature=model_config.CITE_LLM_TEMPERATURE, - top_p=model_config.DEFAULT_TOP_P) + top_p=model_config.CITE_LLM_TOP_P) response_content = response.choices[0].message.content # amazonq-ignore-next-line response_content = response_content.strip() diff --git a/backend/mainService/src/llm/chat_llm/Groq_llm.py b/backend/mainService/src/llm/chat_llm/Groq_llm.py index 960381a..e35e42b 100644 --- a/backend/mainService/src/llm/chat_llm/Groq_llm.py +++ b/backend/mainService/src/llm/chat_llm/Groq_llm.py @@ -61,7 +61,7 @@ def getKeywordSearchTerm(self, document: str, proposed_title: Optional[str] = No }, ], 
temperature=model_config.SUMMARIZE_LLM_TEMPERATURE, - top_p=model_config.DEFAULT_TOP_P, + top_p=model_config.SUMMARIZE_LLM_TOP_P, max_tokens=200, stream=False, stop=None, From 6e2110324284d0a682b8d80d64ae342be81e0a0a Mon Sep 17 00:00:00 2001 From: Ikeoluwa Oladele Date: Sat, 29 Mar 2025 17:55:06 -0400 Subject: [PATCH 10/10] Ensure .gitignore, README.md, and docker-compose.yml always come from main --- .gitattributes | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 .gitattributes diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..38cd5be --- /dev/null +++ b/.gitattributes @@ -0,0 +1,3 @@ +.gitignore merge=ours +README.md merge=ours +docker-compose.yml merge=ours