From 74c3d93601cfaa95cada9bd17392bf60c1707db7 Mon Sep 17 00:00:00 2001 From: Tyler Whitehouse Date: Fri, 28 Feb 2025 16:45:27 -0500 Subject: [PATCH 01/41] Initial clone and added VS Code. - Created app 'Visual Studio Code' - Updated Requested GPUs to '0' - Modified files: .project/configpacks, code/chatui/utils/database.py, code/output.log --- .project/configpacks | 2 + .project/spec.yaml | 266 +++++++++++++++++----------------- code/chatui/utils/database.py | 2 + code/output.log | 8 + 4 files changed, 147 insertions(+), 131 deletions(-) diff --git a/.project/configpacks b/.project/configpacks index 0b6b387..994edce 100644 --- a/.project/configpacks +++ b/.project/configpacks @@ -3,10 +3,12 @@ *cuda.CUDA *defaults.EnvVars *defaults.Readme +*defaults.CA *defaults.Entrypoint *apt.PackageManager *bash.PreLanguage *python.PipPackageManager *bash.PostBuild *jupyterlab.JupyterLab +*vs_code.VSCode *tensorboard.Tensorboard \ No newline at end of file diff --git a/.project/spec.yaml b/.project/spec.yaml index 59b0916..8aa0d8b 100644 --- a/.project/spec.yaml +++ b/.project/spec.yaml @@ -1,137 +1,141 @@ specVersion: v2 specMinorVersion: 2 meta: - name: agentic-rag - image: project-agentic-rag - description: An example project for advanced RAG using agents - labels: [] - createdOn: "2024-07-15T21:09:46Z" - defaultBranch: main + name: agentic-rag + image: project-agentic-rag + description: An example project for advanced RAG using agents + labels: [] + createdOn: "2024-07-15T21:09:46Z" + defaultBranch: main layout: -- path: code/ - type: code - storage: git -- path: models/ - type: models - storage: gitlfs -- path: data/ - type: data - storage: gitlfs -- path: data/scratch/ - type: data - storage: gitignore + - path: code/ + type: code + storage: git + - path: models/ + type: models + storage: gitlfs + - path: data/ + type: data + storage: gitlfs + - path: data/scratch/ + type: data + storage: gitignore environment: - base: - registry: nvcr.io - image: nvidia/ai-workbench/pytorch:1.0.2 - build_timestamp: "20231102150513" - name: PyTorch - supported_architectures: [] - cuda_version: "12.2" - description: A Pytorch 2.1 Base with CUDA 12.2 - entrypoint_script: "" - labels: - - cuda12.2 - - pytorch2.1 - apps: - - name: chat - type: custom - class: webapp - start_command: cd /project/code/ && PROXY_PREFIX=$PROXY_PREFIX python3 -m chatui - health_check_command: curl -f "http://localhost:8080/" - stop_command: pkill -f "^python3 -m chatui" - user_msg: "" - logfile_path: "" - timeout_seconds: 60 - icon_url: "" - webapp_options: - autolaunch: true - port: "8080" - proxy: - trim_prefix: true - url: http://localhost:8080/ - - name: jupyterlab - type: jupyterlab - class: webapp - start_command: jupyter lab --allow-root --port 8888 --ip 0.0.0.0 --no-browser - --NotebookApp.base_url=\$PROXY_PREFIX --NotebookApp.default_url=/lab --NotebookApp.allow_origin='*' - health_check_command: '[ \$(echo url=\$(jupyter lab list | head -n 2 | tail - -n 1 | cut -f1 -d'' '' | grep -v ''Currently'' | sed "s@/?@/lab?@g") | curl - -o /dev/null -s -w ''%{http_code}'' --config -) == ''200'' ]' - stop_command: jupyter lab stop 8888 - user_msg: "" - logfile_path: "" - timeout_seconds: 60 - icon_url: "" - webapp_options: - autolaunch: true - port: "8888" - proxy: - trim_prefix: false - url_command: jupyter lab list | head -n 2 | tail -n 1 | cut -f1 -d' ' | grep - -v 'Currently' - - name: tensorboard - type: tensorboard - class: webapp - start_command: tensorboard --logdir \$TENSORBOARD_LOGS_DIRECTORY --path_prefix=\$PROXY_PREFIX - 
--bind_all - health_check_command: '[ \$(curl -o /dev/null -s -w ''%{http_code}'' http://localhost:\$TENSORBOARD_PORT\$PROXY_PREFIX/) - == ''200'' ]' - stop_command: pkill tensorboard - user_msg: "" - logfile_path: "" - timeout_seconds: 60 - icon_url: "" - webapp_options: - autolaunch: true - port: "6006" - proxy: - trim_prefix: false - url: http://localhost:6006 - programming_languages: - - python3 - icon_url: "" - image_version: 1.0.2 - os: linux - os_distro: ubuntu - os_distro_release: "22.04" - schema_version: v2 - user_info: - uid: "" - gid: "" - username: "" - package_managers: - - name: apt - binary_path: /usr/bin/apt - installed_packages: - - curl - - git - - git-lfs - - vim - - name: pip - binary_path: /usr/local/bin/pip - installed_packages: - - jupyterlab==4.0.7 - package_manager_environment: - name: "" - target: "" + base: + registry: nvcr.io + image: nvidia/ai-workbench/pytorch:1.0.2 + build_timestamp: "20231102150513" + name: PyTorch + supported_architectures: [] + cuda_version: "12.2" + description: A Pytorch 2.1 Base with CUDA 12.2 + entrypoint_script: "" + labels: + - cuda12.2 + - pytorch2.1 + apps: + - name: chat + type: custom + class: webapp + start_command: cd /project/code/ && PROXY_PREFIX=$PROXY_PREFIX python3 -m chatui + health_check_command: curl -f "http://localhost:8080/" + stop_command: pkill -f "^python3 -m chatui" + user_msg: "" + logfile_path: "" + timeout_seconds: 60 + icon_url: "" + webapp_options: + autolaunch: true + port: "8080" + proxy: + trim_prefix: true + url: http://localhost:8080/ + - name: jupyterlab + type: jupyterlab + class: webapp + start_command: jupyter lab --allow-root --port 8888 --ip 0.0.0.0 --no-browser --NotebookApp.base_url=\$PROXY_PREFIX --NotebookApp.default_url=/lab --NotebookApp.allow_origin='*' + health_check_command: '[ \$(echo url=\$(jupyter lab list | head -n 2 | tail -n 1 | cut -f1 -d'' '' | grep -v ''Currently'' | sed "s@/?@/lab?@g") | curl -o /dev/null -s -w ''%{http_code}'' --config -) == ''200'' ]' + stop_command: jupyter lab stop 8888 + user_msg: "" + logfile_path: "" + timeout_seconds: 60 + icon_url: "" + webapp_options: + autolaunch: true + port: "8888" + proxy: + trim_prefix: false + url_command: jupyter lab list | head -n 2 | tail -n 1 | cut -f1 -d' ' | grep -v 'Currently' + - name: tensorboard + type: tensorboard + class: webapp + start_command: tensorboard --logdir \$TENSORBOARD_LOGS_DIRECTORY --path_prefix=\$PROXY_PREFIX --bind_all + health_check_command: '[ \$(curl -o /dev/null -s -w ''%{http_code}'' http://localhost:\$TENSORBOARD_PORT\$PROXY_PREFIX/) == ''200'' ]' + stop_command: pkill tensorboard + user_msg: "" + logfile_path: "" + timeout_seconds: 60 + icon_url: "" + webapp_options: + autolaunch: true + port: "6006" + proxy: + trim_prefix: false + url: http://localhost:6006 + programming_languages: + - python3 + icon_url: "" + image_version: 1.0.2 + os: linux + os_distro: ubuntu + os_distro_release: "22.04" + schema_version: v2 + user_info: + uid: "" + gid: "" + username: "" + package_managers: + - name: apt + binary_path: /usr/bin/apt + installed_packages: + - curl + - git + - git-lfs + - vim + - name: pip + binary_path: /usr/local/bin/pip + installed_packages: + - jupyterlab==4.0.7 + package_manager_environment: + name: "" + target: "" execution: - apps: [] - resources: - gpu: - requested: 1 - sharedMemoryMB: 1024 - secrets: - - variable: NVIDIA_API_KEY - description: NVIDIA API Key for accessing the API catalog - - variable: TAVILY_API_KEY - description: Tavily Search API Key - mounts: - - type: project - 
target: /project/ - description: Project directory - options: rw - - type: volume - target: /data/tensorboard/logs/ - description: Tensorboard Log Files - options: volumeName=tensorboard-logs-volume + apps: + - name: Visual Studio Code + type: vs-code + class: native + start_command: "" + health_check_command: '[ \$(ps aux | grep ".vscode-server" | grep -v grep | wc -l ) -gt 4 ] && [ \$(ps aux | grep "/.vscode-server/bin/.*/node .* net.createConnection" | grep -v grep | wc -l) -gt 0 ]' + stop_command: "" + user_msg: "" + logfile_path: "" + timeout_seconds: 120 + icon_url: "" + resources: + gpu: + requested: 0 + sharedMemoryMB: 1024 + secrets: + - variable: NVIDIA_API_KEY + description: NVIDIA API Key for accessing the API catalog + - variable: TAVILY_API_KEY + description: Tavily Search API Key + mounts: + - type: project + target: /project/ + description: Project directory + options: rw + - type: volume + target: /data/tensorboard/logs/ + description: Tensorboard Log Files + options: volumeName=tensorboard-logs-volume diff --git a/code/chatui/utils/database.py b/code/chatui/utils/database.py index c018276..8bc6f4d 100644 --- a/code/chatui/utils/database.py +++ b/code/chatui/utils/database.py @@ -19,10 +19,12 @@ from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings from typing import Any, Dict, List, Tuple, Union + import nltk nltk.download("punkt") nltk.download("averaged_perceptron_tagger") + def upload(urls: List[str]): """ This is a helper function for parsing the user inputted URLs and uploading them into the vector store. """ docs = [WebBaseLoader(url).load() for url in urls] diff --git a/code/output.log b/code/output.log index e69de29..731d089 100644 --- a/code/output.log +++ b/code/output.log @@ -0,0 +1,8 @@ +http://localhost:8000 +Running on local URL: http://0.0.0.0:8080 + +To create a public link, set `share=True` in `launch()`. +IMPORTANT: You are using gradio version 4.15.0, however version 4.44.1 is available, please upgrade. +-------- +---ROUTE QUESTION--- +hi From 7be40f5608382ba29ac960cca2cc8dac5ffe3806 Mon Sep 17 00:00:00 2001 From: Tyler Whitehouse Date: Sat, 1 Mar 2025 14:10:56 -0500 Subject: [PATCH 02/41] Add model identifier constants and refactor model selection logic - Modified files: code/chatui/pages/converse.py, code/chatui/utils/database.py, variables.env Added LLAMA and MISTRAL constants at top of file to centralize model identifiers - Updated model_list to use the new constants - Kept match/case pattern for maintainable model selection logic - Ensures consistent model identifier usage with API_PREFIX throughout code This change improves maintainability by centralizing model identifiers and makes it easier to add new models in the future. 
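For reference, a minimal sketch of the pattern this commit is aiming for. This is illustrative only and is not part of the diff below: the prompt strings are placeholders, and the real converse.py keeps per-component prompt modules and a match/case dispatch.

```
import os

# Optional prefix read from the environment (set via variables.env).
API_PREFIX = os.getenv("API_PREFIX", "")

# Centralized model identifiers, built once and reused everywhere.
LLAMA = f"{API_PREFIX}meta/llama3-70b-instruct"
MISTRAL = f"{API_PREFIX}mistralai/mixtral-8x22b-instruct-v0.1"

model_list = [LLAMA, MISTRAL]

def router_prompt_for(selected_model: str) -> str:
    """Return the router prompt for the selected model (placeholder text here)."""
    # Note: inside a match statement a bare name such as `case LLAMA:` acts as
    # a capture pattern, not a value comparison, so a dict lookup (or a guard
    # clause) is one way to compare against the centralized constants.
    prompts = {
        LLAMA: "llama router prompt",      # placeholder, not the project prompt
        MISTRAL: "mistral router prompt",  # placeholder, not the project prompt
    }
    return prompts.get(selected_model, prompts[LLAMA])

if __name__ == "__main__":
    print(model_list)
    print(router_prompt_for(MISTRAL))
```

With API_PREFIX unset, the identifiers match the public catalog names; setting it simply prepends the prefix to every model string.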
--- code/chatui/pages/converse.py | 29 +++++++++++++++++------------ code/chatui/utils/database.py | 18 ++++++++++++++---- variables.env | 2 ++ 3 files changed, 33 insertions(+), 16 deletions(-) diff --git a/code/chatui/pages/converse.py b/code/chatui/pages/converse.py index 0c4a3fa..e3c2501 100644 --- a/code/chatui/pages/converse.py +++ b/code/chatui/pages/converse.py @@ -25,6 +25,8 @@ import time import sys +API_PREFIX = os.getenv('API_PREFIX', '') + from chatui import assets, chat_client from chatui.prompts import prompts_llama3, prompts_mistral from chatui.utils import compile, database, logger @@ -36,6 +38,10 @@ OUTPUT_TOKENS = 250 MAX_DOCS = 5 +# Model identifiers with prefix +LLAMA = f"{API_PREFIX}meta/llama3-70b-instruct" +MISTRAL = f"{API_PREFIX}mistralai/mixtral-8x22b-instruct-v0.1" + ### Load in CSS here for components that need custom styling. ### _LOCAL_CSS = """ @@ -93,8 +99,7 @@ def build_page(client: chat_client.ChatClient) -> gr.Blocks: """ List of currently supported models. """ - model_list = ["meta/llama3-70b-instruct", - "mistralai/mixtral-8x22b-instruct-v0.1"] + model_list = [LLAMA, MISTRAL] with gr.Blocks(title=TITLE, theme=kui_theme, css=kui_styles + _LOCAL_CSS) as page: gr.Markdown(f"# {TITLE}") @@ -631,45 +636,45 @@ def _toggle_model(btn: str): def _toggle_model_router(selected_model: str): match selected_model: - case "meta/llama3-70b-instruct": + case LLAMA: return gr.update(value=prompts_llama3.router_prompt) - case "mistralai/mixtral-8x22b-instruct-v0.1": + case MISTRAL: return gr.update(value=prompts_mistral.router_prompt) case _: return gr.update(value=prompts_llama3.router_prompt) def _toggle_model_retrieval(selected_model: str): match selected_model: - case "meta/llama3-70b-instruct": + case LLAMA: return gr.update(value=prompts_llama3.retrieval_prompt) - case "mistralai/mixtral-8x22b-instruct-v0.1": + case MISTRAL: return gr.update(value=prompts_mistral.retrieval_prompt) case _: return gr.update(value=prompts_llama3.retrieval_prompt) def _toggle_model_generator(selected_model: str): match selected_model: - case "meta/llama3-70b-instruct": + case LLAMA: return gr.update(value=prompts_llama3.generator_prompt) - case "mistralai/mixtral-8x22b-instruct-v0.1": + case MISTRAL: return gr.update(value=prompts_mistral.generator_prompt) case _: return gr.update(value=prompts_llama3.generator_prompt) def _toggle_model_hallucination(selected_model: str): match selected_model: - case "meta/llama3-70b-instruct": + case LLAMA: return gr.update(value=prompts_llama3.hallucination_prompt) - case "mistralai/mixtral-8x22b-instruct-v0.1": + case MISTRAL: return gr.update(value=prompts_mistral.hallucination_prompt) case _: return gr.update(value=prompts_llama3.hallucination_prompt) def _toggle_model_answer(selected_model: str): match selected_model: - case "meta/llama3-70b-instruct": + case LLAMA: return gr.update(value=prompts_llama3.answer_prompt) - case "mistralai/mixtral-8x22b-instruct-v0.1": + case MISTRAL: return gr.update(value=prompts_mistral.answer_prompt) case _: return gr.update(value=prompts_llama3.answer_prompt) diff --git a/code/chatui/utils/database.py b/code/chatui/utils/database.py index 8bc6f4d..c8f1e6a 100644 --- a/code/chatui/utils/database.py +++ b/code/chatui/utils/database.py @@ -19,7 +19,17 @@ from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings from typing import Any, Dict, List, Tuple, Union +import os +API_PREFIX = os.getenv('API_PREFIX', '') + +# Set the embeddings model target +if API_PREFIX=='': + EMBEDDINGS_MODEL = 'NV-Embed-QA' +else: + 
EMBEDDINGS_MODEL = 'nvdev/nvidia/nv-embedqa-e5-v5' + +# Download nltk data import nltk nltk.download("punkt") nltk.download("averaged_perceptron_tagger") @@ -39,7 +49,7 @@ def upload(urls: List[str]): vectorstore = Chroma.from_documents( documents=doc_splits, collection_name="rag-chroma", - embedding=NVIDIAEmbeddings(model='NV-Embed-QA'), + embedding=NVIDIAEmbeddings(model='EMBEDDINGS_MODEL'), persist_directory="/project/data", ) return vectorstore @@ -58,7 +68,7 @@ def upload_pdf(documents: List[str]): vectorstore = Chroma.from_documents( documents=doc_splits, collection_name="rag-chroma", - embedding=NVIDIAEmbeddings(model='NV-Embed-QA'), + embedding=NVIDIAEmbeddings(model='EMBEDDINGS_MODEL'), persist_directory="/project/data", ) return vectorstore @@ -67,7 +77,7 @@ def clear(): """ This is a helper function for emptying the collection the vector store. """ vectorstore = Chroma( collection_name="rag-chroma", - embedding_function=NVIDIAEmbeddings(model='NV-Embed-QA'), + embedding_function=NVIDIAEmbeddings(model='EMBEDDINGS_MODEL'), persist_directory="/project/data", ) @@ -78,7 +88,7 @@ def get_retriever(): """ This is a helper function for returning the retriever object of the vector store. """ vectorstore = Chroma( collection_name="rag-chroma", - embedding_function=NVIDIAEmbeddings(model='NV-Embed-QA'), + embedding_function=NVIDIAEmbeddings(model='EMBEDDINGS_MODEL'), persist_directory="/project/data", ) retriever = vectorstore.as_retriever() diff --git a/variables.env b/variables.env index e436bde..82a008f 100644 --- a/variables.env +++ b/variables.env @@ -3,3 +3,5 @@ # NOTE: If you change this file while the project is running, you must restart the project container for changes to take effect. TENSORBOARD_LOGS_DIRECTORY=/data/tensorboard/logs/ +#API_PREVIX---Let's you change the prefix on the endpoints. +API_PREVIX=nvdev/ From d5214f7b40d61c6630a052e29b0db6be29112e71 Mon Sep 17 00:00:00 2001 From: Tyler Whitehouse Date: Sat, 1 Mar 2025 15:09:08 -0500 Subject: [PATCH 03/41] Finished getting env variable for changing api endpoint prefix implemented. - Modified files: code/chatui/pages/converse.py, code/output.log, variables.env - Fixed API_PREFIX to properly handle trailing slashes with rstrip() - Updated pattern matching to use guard clauses for model comparison - Changed from direct variable matching to str() type with equality guards APPLICATION SUCCESSFULLY STARTED. --- code/chatui/pages/converse.py | 30 +++++++++++++++--------------- code/output.log | 2 -- variables.env | 4 ++-- 3 files changed, 17 insertions(+), 19 deletions(-) diff --git a/code/chatui/pages/converse.py b/code/chatui/pages/converse.py index e3c2501..cecea84 100644 --- a/code/chatui/pages/converse.py +++ b/code/chatui/pages/converse.py @@ -25,7 +25,11 @@ import time import sys -API_PREFIX = os.getenv('API_PREFIX', '') +API_PREFIX = os.getenv('API_PREFIX', '').rstrip('/') + +# Model identifiers with prefix +LLAMA = f"{API_PREFIX}/meta/llama3-70b-instruct" +MISTRAL = f"{API_PREFIX}/mistralai/mixtral-8x22b-instruct-v0.1" from chatui import assets, chat_client from chatui.prompts import prompts_llama3, prompts_mistral @@ -38,10 +42,6 @@ OUTPUT_TOKENS = 250 MAX_DOCS = 5 -# Model identifiers with prefix -LLAMA = f"{API_PREFIX}meta/llama3-70b-instruct" -MISTRAL = f"{API_PREFIX}mistralai/mixtral-8x22b-instruct-v0.1" - ### Load in CSS here for components that need custom styling. 
### _LOCAL_CSS = """ @@ -636,45 +636,45 @@ def _toggle_model(btn: str): def _toggle_model_router(selected_model: str): match selected_model: - case LLAMA: + case str() if selected_model == LLAMA: return gr.update(value=prompts_llama3.router_prompt) - case MISTRAL: + case str() if selected_model == MISTRAL: return gr.update(value=prompts_mistral.router_prompt) case _: return gr.update(value=prompts_llama3.router_prompt) def _toggle_model_retrieval(selected_model: str): match selected_model: - case LLAMA: + case str() if selected_model == LLAMA: return gr.update(value=prompts_llama3.retrieval_prompt) - case MISTRAL: + case str() if selected_model == MISTRAL: return gr.update(value=prompts_mistral.retrieval_prompt) case _: return gr.update(value=prompts_llama3.retrieval_prompt) def _toggle_model_generator(selected_model: str): match selected_model: - case LLAMA: + case str() if selected_model == LLAMA: return gr.update(value=prompts_llama3.generator_prompt) - case MISTRAL: + case str() if selected_model == MISTRAL: return gr.update(value=prompts_mistral.generator_prompt) case _: return gr.update(value=prompts_llama3.generator_prompt) def _toggle_model_hallucination(selected_model: str): match selected_model: - case LLAMA: + case str() if selected_model == LLAMA: return gr.update(value=prompts_llama3.hallucination_prompt) - case MISTRAL: + case str() if selected_model == MISTRAL: return gr.update(value=prompts_mistral.hallucination_prompt) case _: return gr.update(value=prompts_llama3.hallucination_prompt) def _toggle_model_answer(selected_model: str): match selected_model: - case LLAMA: + case str() if selected_model == LLAMA: return gr.update(value=prompts_llama3.answer_prompt) - case MISTRAL: + case str() if selected_model == MISTRAL: return gr.update(value=prompts_mistral.answer_prompt) case _: return gr.update(value=prompts_llama3.answer_prompt) diff --git a/code/output.log b/code/output.log index 731d089..3af4e0c 100644 --- a/code/output.log +++ b/code/output.log @@ -4,5 +4,3 @@ Running on local URL: http://0.0.0.0:8080 To create a public link, set `share=True` in `launch()`. IMPORTANT: You are using gradio version 4.15.0, however version 4.44.1 is available, please upgrade. -------- ----ROUTE QUESTION--- -hi diff --git a/variables.env b/variables.env index 82a008f..7fe30c8 100644 --- a/variables.env +++ b/variables.env @@ -3,5 +3,5 @@ # NOTE: If you change this file while the project is running, you must restart the project container for changes to take effect. TENSORBOARD_LOGS_DIRECTORY=/data/tensorboard/logs/ -#API_PREVIX---Let's you change the prefix on the endpoints. -API_PREVIX=nvdev/ +#API_PREFIX---A prefix that you can add to the API endpoints. +API_PREFIX=nvdev From ae655b6147d5170af4910a6ef2dc59f3323469c7 Mon Sep 17 00:00:00 2001 From: Tyler Whitehouse Date: Sat, 1 Mar 2025 15:55:26 -0500 Subject: [PATCH 04/41] Refactor model endpoints and internal API handling - Modified files: code/chatui/pages/converse.py, code/chatui/utils/database.py, code/output.log, variables.env - Fix incorrect Llama model endpoint string (llama3-70b-instruct -> llama-3.1-70b-instruct) - Simplify API prefix logic by using INTERNAL_API environment variable - Move model identifiers to top-level constants for better maintainability - Update model string construction to handle internal API paths more cleanly This change fixes endpoint resolution issues and improves code organization around API path handling. 
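A minimal standalone sketch of the INTERNAL_API switch described above (assumed single module for illustration; the project splits this across converse.py and database.py). One detail worth noting: for the switch to take effect on embeddings, NVIDIAEmbeddings should receive the variable EMBEDDINGS_MODEL rather than the quoted string 'EMBEDDINGS_MODEL'.

```
import os

# Empty means "use the public build.nvidia.com identifiers"; a non-empty value
# such as "nvdev" switches to the internal endpoints.
INTERNAL_API = os.getenv("INTERNAL_API", "")

# Public identifiers (defaults).
LLAMA = "meta/llama3-70b-instruct"
MISTRAL = "mistralai/mixtral-8x22b-instruct-v0.1"
EMBEDDINGS_MODEL = "NV-Embed-QA"

if INTERNAL_API != "":
    # Internal identifiers, mirroring the values used in this patch.
    LLAMA = f"{INTERNAL_API}/meta/llama-3.1-70b-instruct"
    MISTRAL = f"{INTERNAL_API}/mistralai/mixtral-8x22b-instruct-v0.1"
    EMBEDDINGS_MODEL = "nvdev/nvidia/nv-embedqa-e5-v5"

# Intended use in database.py (pass the variable, not the quoted name):
#   NVIDIAEmbeddings(model=EMBEDDINGS_MODEL)

if __name__ == "__main__":
    print(LLAMA, MISTRAL, EMBEDDINGS_MODEL, sep="\n")
```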
--- code/chatui/pages/converse.py | 14 +++++-- code/chatui/utils/database.py | 10 ++--- code/output.log | 74 ++++++++++++++++++++++++++++++++++- variables.env | 4 +- 4 files changed, 90 insertions(+), 12 deletions(-) diff --git a/code/chatui/pages/converse.py b/code/chatui/pages/converse.py index cecea84..68dcbe5 100644 --- a/code/chatui/pages/converse.py +++ b/code/chatui/pages/converse.py @@ -25,11 +25,19 @@ import time import sys -API_PREFIX = os.getenv('API_PREFIX', '').rstrip('/') +INTERNAL_API = os.getenv('INTERNAL_API', '') # Model identifiers with prefix -LLAMA = f"{API_PREFIX}/meta/llama3-70b-instruct" -MISTRAL = f"{API_PREFIX}/mistralai/mixtral-8x22b-instruct-v0.1" +LLAMA = "meta/llama3-70b-instruct" +MISTRAL = "mistralai/mixtral-8x22b-instruct-v0.1" + + + +if INTERNAL_API != '': + LLAMA = f'{INTERNAL_API}/meta/llama-3.1-70b-instruct' + MISTRAL = f'{INTERNAL_API}/mistralai/mixtral-8x22b-instruct-v0.1' + + from chatui import assets, chat_client from chatui.prompts import prompts_llama3, prompts_mistral diff --git a/code/chatui/utils/database.py b/code/chatui/utils/database.py index c8f1e6a..f251570 100644 --- a/code/chatui/utils/database.py +++ b/code/chatui/utils/database.py @@ -20,13 +20,13 @@ from typing import Any, Dict, List, Tuple, Union import os -API_PREFIX = os.getenv('API_PREFIX', '') - +INTERNAL_API = os.getenv('INTERNAL_API', '') # Set the embeddings model target -if API_PREFIX=='': - EMBEDDINGS_MODEL = 'NV-Embed-QA' -else: +EMBEDDINGS_MODEL = 'NV-Embed-QA' + +# +if INTERNAL_API != '': EMBEDDINGS_MODEL = 'nvdev/nvidia/nv-embedqa-e5-v5' # Download nltk data diff --git a/code/output.log b/code/output.log index 3af4e0c..034698d 100644 --- a/code/output.log +++ b/code/output.log @@ -1,6 +1,76 @@ http://localhost:8000 Running on local URL: http://0.0.0.0:8080 - -To create a public link, set `share=True` in `launch()`. IMPORTANT: You are using gradio version 4.15.0, however version 4.44.1 is available, please upgrade. -------- + +To create a public link, set `share=True` in `launch()`. +---ROUTE QUESTION--- +hi +{'datasource': 'web_search'} +---ROUTE QUESTION TO WEB SEARCH--- +---WEB SEARCH--- +---GENERATE--- +---CHECK HALLUCINATIONS--- +---DECISION: GENERATION IS GROUNDED IN DOCUMENTS--- +---GRADE GENERATION vs QUESTION--- +---DECISION: GENERATION ADDRESSES QUESTION--- +---ROUTE QUESTION--- +what about now? 
+{'datasource': 'web_search'} +---ROUTE QUESTION TO WEB SEARCH--- +---WEB SEARCH--- +---GENERATE--- +---CHECK HALLUCINATIONS--- +---DECISION: GENERATION IS GROUNDED IN DOCUMENTS--- +---GRADE GENERATION vs QUESTION--- +---DECISION: GENERATION DOES NOT ADDRESS QUESTION--- +---WEB SEARCH--- +---GENERATE--- +---CHECK HALLUCINATIONS--- +---DECISION: GENERATION IS GROUNDED IN DOCUMENTS--- +---GRADE GENERATION vs QUESTION--- +---DECISION: GENERATION DOES NOT ADDRESS QUESTION--- +---WEB SEARCH--- +---GENERATE--- +---CHECK HALLUCINATIONS--- +---DECISION: GENERATION IS GROUNDED IN DOCUMENTS--- +---GRADE GENERATION vs QUESTION--- +---DECISION: GENERATION DOES NOT ADDRESS QUESTION--- +---WEB SEARCH--- +---GENERATE--- +---CHECK HALLUCINATIONS--- +---DECISION: GENERATION IS GROUNDED IN DOCUMENTS--- +---GRADE GENERATION vs QUESTION--- +---DECISION: GENERATION DOES NOT ADDRESS QUESTION--- +---WEB SEARCH--- +---GENERATE--- +---CHECK HALLUCINATIONS--- +---DECISION: GENERATION IS GROUNDED IN DOCUMENTS--- +---GRADE GENERATION vs QUESTION--- +---DECISION: GENERATION DOES NOT ADDRESS QUESTION--- +---WEB SEARCH--- +---GENERATE--- +---CHECK HALLUCINATIONS--- +---DECISION: GENERATION IS GROUNDED IN DOCUMENTS--- +---GRADE GENERATION vs QUESTION--- +---DECISION: GENERATION DOES NOT ADDRESS QUESTION--- +---ROUTE QUESTION--- +i'm trying to debug something +{'datasource': 'web_search'} +---ROUTE QUESTION TO WEB SEARCH--- +---WEB SEARCH--- +---GENERATE--- +---CHECK HALLUCINATIONS--- +---DECISION: GENERATION IS GROUNDED IN DOCUMENTS--- +---GRADE GENERATION vs QUESTION--- +---DECISION: GENERATION ADDRESSES QUESTION--- +---ROUTE QUESTION--- +i just tried to change the endpoint for this app. +{'datasource': 'web_search'} +---ROUTE QUESTION TO WEB SEARCH--- +---WEB SEARCH--- +---GENERATE--- +---CHECK HALLUCINATIONS--- +---DECISION: GENERATION IS GROUNDED IN DOCUMENTS--- +---GRADE GENERATION vs QUESTION--- +---DECISION: GENERATION ADDRESSES QUESTION--- diff --git a/variables.env b/variables.env index 7fe30c8..e68210e 100644 --- a/variables.env +++ b/variables.env @@ -3,5 +3,5 @@ # NOTE: If you change this file while the project is running, you must restart the project container for changes to take effect. TENSORBOARD_LOGS_DIRECTORY=/data/tensorboard/logs/ -#API_PREFIX---A prefix that you can add to the API endpoints. -API_PREFIX=nvdev +#INTERNAL_API---Value should be either blank to work with public endpoints or the appropriate prefix to move to internal ones. +INTERNAL_API=nvdev From 20ca7d6ca806536b9bbf036a81843ef7be833125 Mon Sep 17 00:00:00 2001 From: Tyler Whitehouse Date: Sat, 1 Mar 2025 16:02:54 -0500 Subject: [PATCH 05/41] Ran queries to demonstrate the changes worked. - Modified files: code/output.log --- code/output.log | 32 ++++++-------------------------- 1 file changed, 6 insertions(+), 26 deletions(-) diff --git a/code/output.log b/code/output.log index 034698d..039865b 100644 --- a/code/output.log +++ b/code/output.log @@ -1,11 +1,11 @@ http://localhost:8000 Running on local URL: http://0.0.0.0:8080 -IMPORTANT: You are using gradio version 4.15.0, however version 4.44.1 is available, please upgrade. --------- To create a public link, set `share=True` in `launch()`. +IMPORTANT: You are using gradio version 4.15.0, however version 4.44.1 is available, please upgrade. 
+-------- ---ROUTE QUESTION--- -hi +hello {'datasource': 'web_search'} ---ROUTE QUESTION TO WEB SEARCH--- ---WEB SEARCH--- @@ -15,7 +15,7 @@ hi ---GRADE GENERATION vs QUESTION--- ---DECISION: GENERATION ADDRESSES QUESTION--- ---ROUTE QUESTION--- -what about now? +i changed the end point. {'datasource': 'web_search'} ---ROUTE QUESTION TO WEB SEARCH--- ---WEB SEARCH--- @@ -45,30 +45,10 @@ what about now? ---WEB SEARCH--- ---GENERATE--- ---CHECK HALLUCINATIONS--- ----DECISION: GENERATION IS GROUNDED IN DOCUMENTS--- ----GRADE GENERATION vs QUESTION--- ----DECISION: GENERATION DOES NOT ADDRESS QUESTION--- ----WEB SEARCH--- ----GENERATE--- ----CHECK HALLUCINATIONS--- ----DECISION: GENERATION IS GROUNDED IN DOCUMENTS--- ----GRADE GENERATION vs QUESTION--- ----DECISION: GENERATION DOES NOT ADDRESS QUESTION--- ----ROUTE QUESTION--- -i'm trying to debug something -{'datasource': 'web_search'} ----ROUTE QUESTION TO WEB SEARCH--- ----WEB SEARCH--- +---DECISION: GENERATION IS NOT GROUNDED IN DOCUMENTS, RE-TRY--- ---GENERATE--- ---CHECK HALLUCINATIONS--- ----DECISION: GENERATION IS GROUNDED IN DOCUMENTS--- ----GRADE GENERATION vs QUESTION--- ----DECISION: GENERATION ADDRESSES QUESTION--- ----ROUTE QUESTION--- -i just tried to change the endpoint for this app. -{'datasource': 'web_search'} ----ROUTE QUESTION TO WEB SEARCH--- ----WEB SEARCH--- +---DECISION: GENERATION IS NOT GROUNDED IN DOCUMENTS, RE-TRY--- ---GENERATE--- ---CHECK HALLUCINATIONS--- ---DECISION: GENERATION IS GROUNDED IN DOCUMENTS--- From 0fea6d3d7164bdd1b831e510db4c3e6d2694cc49 Mon Sep 17 00:00:00 2001 From: Tyler Whitehouse Date: Sat, 1 Mar 2025 17:53:59 -0500 Subject: [PATCH 06/41] Simplified README - Modified files: README.md --- README.md | 151 ++++++++++++++++++++++-------------------------------- 1 file changed, 62 insertions(+), 89 deletions(-) diff --git a/README.md b/README.md index 415b29e..600ef39 100644 --- a/README.md +++ b/README.md @@ -1,15 +1,13 @@ # Table of Contents * [Introduction](#nvidia-ai-workbench-introduction) - * [Project Description](#project-description) + * [Overview](#overview) * [Project Deep Dive](#project-deep-dive) * [Sizing Guide](#sizing-guide) -* [Quickstart](#quickstart) +* [Get Started](#get-started) * [Prerequisites](#prerequisites) * [Tutorial (Desktop App)](#tutorial-desktop-app) - * [Tutorial (CLI-Only)](#tutorial-cli-only) * [License](#license) -# NVIDIA AI Workbench: Introduction [![Open In AI Workbench](https://img.shields.io/badge/Open_In-AI_Workbench-76B900)](https://ngc.nvidia.com/open-ai-workbench/aHR0cHM6Ly9naXRodWIuY29tL05WSURJQS93b3JrYmVuY2gtZXhhbXBsZS1hZ2VudGljLXJhZw==) @@ -22,22 +20,33 @@ :rotating_light: Facing Issues? Let Us Know!

-## Project Description -This is an [NVIDIA AI Workbench](https://www.nvidia.com/en-us/deep-learning-ai/solutions/data-science/workbench/) project for developing a websearch-based [Retrieval Augmented Generation](https://blogs.nvidia.com/blog/what-is-retrieval-augmented-generation/) application with a customizable Gradio Chat app. It lets you: -* Embed your documents in the form of webpages or PDFs into a locally running Chroma vector database. -* Run inference using remotely running endpoints and microservices. - * Cloud endpoints using the [NVIDIA API Catalog](https://build.nvidia.com/explore/discover) - * Self-hosted endpoints using [NVIDIA Inference Microservices (NIMs)](https://catalog.ngc.nvidia.com/orgs/nim/teams/meta/containers/llama3-8b-instruct/tags) - * Third party self-hosted microservices like Ollama. +## Overview +This is an [NVIDIA AI Workbench Project](https://docs.nvidia.com/ai-workbench/user-guide/latest/projects/projects.html#projects-structure) with a customizable search-based [Retrieval Augmented Generation](https://blogs.nvidia.com/blog/what-is-retrieval-augmented-generation/) application with Gradio front end. The application uses an agentic approach to significantly improve results. + + +* First, an LLM evaluates your query for relevance to the index and then appropriately routes it (to the DB or to search by [Tavily](https://tavily.com/)) + * Index relevant queries trigger a retrieval step followed by a grading step, followed by the generation step. + * Index irrelevant questions go to web search which is then fed into the generation step. +* All generated answers are evaluated for hallucination and for responsiveness, with "failing" answers (i.e. hallucinations or immaterial responses) run through the process again. + +The diagram **below** shows this agentic flow. -This project uses an agentic workflow depicted in the above diagram to improve response quality in RAG. Using LangGraph, user queries will first be sorted under a RAG or Websearch pipeline depending on an LLM evaluation of the query topic. Depending on its user-configurable prompt, the router LLM can narrow its focus on turning a specific subject or topic routable to the RAG Pipeline. + +This agentic-RAG application is **configurable**. You can: +* Change the prompts for the different components, e.g. the hallucination grader, directly within the front end. +* Change the webpages and pdfs you want to use for the context in the RAG. +* Use different remote endpoints or self-hosted microservices for the inference components. + * Cloud endpoints using endpoints from [build.nvidia.com](https://build.nvidia.com/explore/discover) + * Self-hosted endpoints using [NVIDIA Inference Microservices (NIMs)](https://catalog.ngc.nvidia.com/orgs/nim/teams/meta/containers/llama3-8b-instruct/tags) + * Third party self-hosted microservices like Ollama. +
-Expand this section for a description of RAG Pipeline. + RAG Pipeline Description Under the retrieval pipeline, the user query is first compared to documents in the vector database and the most relevant documents are retrieved. @@ -50,7 +59,7 @@ After generation, another set of LLMs calls evaluate the response for hallucinat
-Expand this section for a description of Websearch Pipeline. + Websearch Pipeline Description Under the web search pipeline, the user query is inputted onto the web and the search results are retrieved. Using these results, a response is generated. @@ -64,6 +73,44 @@ After generation, a set of LLMs calls evaluate the response for hallucinations a | :---------------------------| | This project is meant as an example workflow and a starting point; you are free to add new models, rearrange the interface, or edit the source code as you see fit for your particular use case! | + + + + +## Get Started + +#### Prerequisites + +You need NVIDIA AI Workbench installed. [See here for how to install it.](https://docs.nvidia.com/ai-workbench/user-guide/latest/installation/overview.html) + +You need the following API keys on hand to set the environment variables and make sure you have access to the necessary endpoints. + +An NVIDIA API Key. You can generate one under ``Get API Key`` on any API Catalog [model card](https://build.nvidia.com/mistralai/mistral-7b-instruct-v2) + +A Tavily Search API Key. You can generate one under a free account (1000 searches/month) [here](https://app.tavily.com/home). + +#### Steps to Open the Chat UI + +1. Fork this Project to your own GitHub namespace and copy the link + + ``` + https://github.com/[your_namespace]/ + ``` + +2. Open NVIDIA AI Workbench. Select a [location to work in](https://docs.nvidia.com/ai-workbench/user-guide/latest/locations/locations.html). + +3. Clone this Project (in Workbench, not with Git), and wait for the project container to build. + +4. When the build completes, set the following configurations. + + * `Environment` → `Secrets` → `Configure`. Specify the NVIDIA API Key and Tavily Search Key as project secrets. + +6. Open the **Chat** from Workbench and the chat UI should automatically open in a new browser tab. Happy chatting! + +7. **Note:** When doing RAG, make sure you (1) upload the document AND (2) Change the Router prompt to focus on the topic of your uploaded documents. Both changes are required for successful RAG! + + + ### Project Deep Dive
@@ -73,7 +120,7 @@ After generation, a set of LLMs calls evaluate the response for hallucinations a -When the user lands on the Chat UI application in the browser, they will see several components. On the left hand side is a standard chatbot user interface with a user input for queries (submittable with ``[ENTER]``) and a clear history button. Above this chatbot is a diagram of the agentic RAG pipeline which doubles as a progress bar indicator for any nontrivial user actions a user might take, like uploading a document. +When the user lands on the Chat UI application in the browser, they will see several components. On the left hand side is a standard chatbot user interface with an input box for queries (submittable with ``[ENTER]``) and a clear history button. Above this chatbot is a diagram of the agentic RAG pipeline which doubles as a progress bar indicator for any nontrivial user actions a user might take, like uploading a document. On the right hand side, users will see a collapsable settings panel with several tabs they may choose to navigate to and configure. @@ -141,83 +188,9 @@ This tab holds the agentic RAG monitoring tools built into this application. | 80 GB | A100-80GB | Y | | >80 GB | 8x A100-80GB | Y | -# Quickstart - -## Prerequisites -AI Workbench will prompt you to provide a few pieces of information before running any apps in this project. Ensure you have this information ready. - - * An NVIDIA API Key. You can generate one under ``Get API Key`` on any API Catalog [model card](https://build.nvidia.com/mistralai/mistral-7b-instruct-v2) - * A Tavily Search API Key. You can generate one under a free account (1000 searches/month) [here](https://app.tavily.com/home). - -## Tutorial (Desktop App) - -If you do not NVIDIA AI Workbench installed, first complete the installation for AI Workbench [here](https://www.nvidia.com/en-us/deep-learning-ai/solutions/data-science/workbench/). Then, -1. Fork this Project to your own GitHub namespace and copy the link - - ``` - https://github.com/[your_namespace]/ - ``` - -2. Open NVIDIA AI Workbench. Select a location to work in. - -3. Clone this Project onto your desired machine by selecting **Clone Project** and providing the GitHub link. - -4. Wait for the project to build. You can expand the bottom **Building** indicator to view real-time build logs. - -5. When the build completes, set the following configurations. - - * `Environment` → `Secrets` → `Configure`. Specify the NVIDIA API Key and Tavily Search Key as project secrets. - -6. On the top right of the window, select **Chat**. A frontend user interface should automatically open in a new browser tab. Happy chatting! - -7. **Note:** When doing RAG, make sure you (1) upload the document AND (2) Change the Router prompt to focus on the topic of your uploaded documents. Both changes are required for successful RAG! - -## Tutorial (CLI-Only) -Some users may choose to use the **CLI tool only** instead of the Desktop App. If you do not NVIDIA AI Workbench installed, first complete the installation for AI Workbench [here](https://www.nvidia.com/en-us/deep-learning-ai/solutions/data-science/workbench/). Then, -1. Fork this Project to your own GitHub namespace and copying the link - - ``` - https://github.com/[your_namespace]/ - ``` - -2. Open a shell and activating the Context you want to clone into by - ``` - $ nvwb list contexts - - $ nvwb activate - ``` - - | :bulb: Tip | - | :---------------------------| - | Use ```nvwb help``` to see a full list of AI Workbench commands. | - -3. 
Clone this Project onto your desired machine by running - - ``` - $ nvwb clone project - ``` - -4. Open the Project by - - ``` - $ nvwb list projects - - $ nvwb open - ``` - -5. Start **Chat** by - - ``` - $ nvwb start chat - ``` - - * Specify the NVIDIA API Key and Tavily Search Key as project secrets. -6. A frontend user interface should automatically open in a new browser tab. Happy chatting! - -7. **Note:** When doing RAG, make sure you (1) upload the document AND (2) Change the Router prompt to focus on the topic of your uploaded documents. Both changes are required for successful RAG! # License This NVIDIA AI Workbench example project is under the [Apache 2.0 License](https://github.com/NVIDIA/workbench-example-agentic-rag/blob/main/LICENSE.txt) From 42f03217be12601e81a507c5f3e4dc0b5d2922ee Mon Sep 17 00:00:00 2001 From: Tyler Whitehouse Date: Sat, 1 Mar 2025 17:55:24 -0500 Subject: [PATCH 07/41] Deleted image from readme - Modified files: README.md --- README.md | 3 --- 1 file changed, 3 deletions(-) diff --git a/README.md b/README.md index 600ef39..2c3f51a 100644 --- a/README.md +++ b/README.md @@ -9,9 +9,6 @@ * [License](#license) - - -

:arrow_down: Download AI Workbench • From eeba25ca776d8cf6b1e991e536a071226a6575dd Mon Sep 17 00:00:00 2001 From: Tyler Whitehouse Date: Mon, 3 Mar 2025 09:39:59 -0500 Subject: [PATCH 08/41] Continued refactoring of README - Modified files: README.md Update README structure and content - Reorder workflow description for logical flow - Adjust agentic workflow explanation for clarity - Add details about prompt configuration and pipeline routing - Update router LLM evaluation process description --- README.md | 91 +++++++++++++++++++++++++++---------------------------- 1 file changed, 45 insertions(+), 46 deletions(-) diff --git a/README.md b/README.md index 2c3f51a..db53bcf 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,10 @@ -# Table of Contents -* [Introduction](#nvidia-ai-workbench-introduction) - * [Overview](#overview) - * [Project Deep Dive](#project-deep-dive) - * [Sizing Guide](#sizing-guide) +# Overview +* [Introduction](#introduction) * [Get Started](#get-started) - * [Prerequisites](#prerequisites) - * [Tutorial (Desktop App)](#tutorial-desktop-app) + * [Prerequisites](#prerequisites) + * [Opening the Chat](#opening-the-chat) +* [Deep Dive](#deep-dive) + * [Sizing Guide](#sizing-guide) * [License](#license) @@ -17,14 +16,14 @@ :rotating_light: Facing Issues? Let Us Know!

-## Overview +## Introduction This is an [NVIDIA AI Workbench Project](https://docs.nvidia.com/ai-workbench/user-guide/latest/projects/projects.html#projects-structure) with a customizable search-based [Retrieval Augmented Generation](https://blogs.nvidia.com/blog/what-is-retrieval-augmented-generation/) application with Gradio front end. The application uses an agentic approach to significantly improve results. * First, an LLM evaluates your query for relevance to the index and then appropriately routes it (to the DB or to search by [Tavily](https://tavily.com/)) * Index relevant queries trigger a retrieval step followed by a grading step, followed by the generation step. * Index irrelevant questions go to web search which is then fed into the generation step. -* All generated answers are evaluated for hallucination and for responsiveness, with "failing" answers (i.e. hallucinations or immaterial responses) run through the process again. +* Second, all generated answers are checked for hallucination and relevance, with "failing" answers (i.e. hallucinations or immaterial responses) run through the process again. The diagram **below** shows this agentic flow. @@ -40,53 +39,26 @@ This agentic-RAG application is **configurable**. You can: * Third party self-hosted microservices like Ollama. -
-
- - RAG Pipeline Description - - -Under the retrieval pipeline, the user query is first compared to documents in the vector database and the most relevant documents are retrieved. - -Another LLM call evaluates the quality of the documents. If satisfactory, it proceeds to the generation phase to produce an response augmented by this relevant context. If the agent decides the best documents are irrelevant to the query, it redirects the user query to the websearch pipeline for a better quality response (see below section). - -After generation, another set of LLMs calls evaluate the response for hallucinations and accuracy. If the generation is both faithful to the retrieved context and answers the user's query in a satisfactory manner, the response is forwarded to the user and displayed. Otherwise, the agent will either regenerate the response, or redirect the query to a web search. - -
- -
- - Websearch Pipeline Description - - -Under the web search pipeline, the user query is inputted onto the web and the search results are retrieved. Using these results, a response is generated. - -After generation, a set of LLMs calls evaluate the response for hallucinations and accuracy. If the generation is both faithful to the retrieved context and answers the user's query in a satisfactory manner, the response is forwarded to the user and displayed. Otherwise, the agent will either regenerate the response, or redirect the query to another web search. - -
-
- | :memo: Remember | | :---------------------------| -| This project is meant as an example workflow and a starting point; you are free to add new models, rearrange the interface, or edit the source code as you see fit for your particular use case! | - - - +| This project is an **example** that you can **modify**. In addition to changing the prompts in the Gradio UI, you can edit the code + and do whatever you want like adding new models, changing the Gradio interface, or even changing the logic. | ## Get Started #### Prerequisites -You need NVIDIA AI Workbench installed. [See here for how to install it.](https://docs.nvidia.com/ai-workbench/user-guide/latest/installation/overview.html) +1. You need NVIDIA AI Workbench installed. [See here for how to install it.](https://docs.nvidia.com/ai-workbench/user-guide/latest/installation/overview.html) -You need the following API keys on hand to set the environment variables and make sure you have access to the necessary endpoints. - -An NVIDIA API Key. You can generate one under ``Get API Key`` on any API Catalog [model card](https://build.nvidia.com/mistralai/mistral-7b-instruct-v2) +2. You need an NVIDIA Personal key to use build.nvidia.com endpoints, as well as a [Tavily](https://tavily.com/) API key to use their web search. + +You can get an NVIDIA ``Get API Key`` on any model card, [e.g. Llama-3.3-70B-instruct](https://build.nvidia.com/meta/llama-3_3-70b-instruct?signin=true&api_key=true) -A Tavily Search API Key. You can generate one under a free account (1000 searches/month) [here](https://app.tavily.com/home). +You can get a Tavily Search API Key with a free account (1000 searches/month) [here](https://tavily.com/). -#### Steps to Open the Chat UI + +#### Opening the Chat 1. Fork this Project to your own GitHub namespace and copy the link @@ -108,7 +80,34 @@ A Tavily Search API Key. You can generate one under a free account (1000 searche -### Project Deep Dive +# Deep Dive + +
+
+ + RAG Pipeline Description + + +Under the retrieval pipeline, the user query is first compared to documents in the vector database and the most relevant documents are retrieved. + +Another LLM call evaluates the quality of the documents. If satisfactory, it proceeds to the generation phase to produce an response augmented by this relevant context. If the agent decides the best documents are irrelevant to the query, it redirects the user query to the websearch pipeline for a better quality response (see below section). + +After generation, another set of LLMs calls evaluate the response for hallucinations and accuracy. If the generation is both faithful to the retrieved context and answers the user's query in a satisfactory manner, the response is forwarded to the user and displayed. Otherwise, the agent will either regenerate the response, or redirect the query to a web search. + +
+ +
+ + Websearch Pipeline Description + + +Under the web search pipeline, the user query is inputted onto the web and the search results are retrieved. Using these results, a response is generated. + +After generation, a set of LLMs calls evaluate the response for hallucinations and accuracy. If the generation is both faithful to the retrieved context and answers the user's query in a satisfactory manner, the response is forwarded to the user and displayed. Otherwise, the agent will either regenerate the response, or redirect the query to another web search. + +
+
+
From 988357fe16edc9a391392226f2891926c837abac Mon Sep 17 00:00:00 2001 From: Tyler Whitehouse Date: Mon, 3 Mar 2025 09:47:50 -0500 Subject: [PATCH 09/41] Eliminated the hierarchy of links at top of README - Modified files: README.md --- README.md | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index db53bcf..3e7abec 100644 --- a/README.md +++ b/README.md @@ -1,24 +1,22 @@ # Overview -* [Introduction](#introduction) -* [Get Started](#get-started) - * [Prerequisites](#prerequisites) - * [Opening the Chat](#opening-the-chat) -* [Deep Dive](#deep-dive) - * [Sizing Guide](#sizing-guide) -* [License](#license) +This is a customizable search-based [Retrieval Augmented Generation](https://blogs.nvidia.com/blog/what-is-retrieval-augmented-generation/) application with Gradio front end. The application uses an agentic approach to significantly improve results. + +It is implemented within an [NVIDIA AI Workbench Project](https://docs.nvidia.com/ai-workbench/user-guide/latest/projects/projects.html#projects-structure) that you can clone and run locally in AI Workbench. + + + +**Navigation**: [Application Overview](#application-overview) | [Get Started](#get-started) | [Deep Dive](#deep-dive) | [Sizing Guide](#sizing-guide) | [License](#license) -

+

:arrow_down: Download AI Workbench • - :book: Read the Docs • - :open_file_folder: Explore Example Projects • - :rotating_light: Facing Issues? Let Us Know! + :book: User Guide • + :open_file_folder: Other Projects • + :rotating_light: User Forum

-## Introduction -This is an [NVIDIA AI Workbench Project](https://docs.nvidia.com/ai-workbench/user-guide/latest/projects/projects.html#projects-structure) with a customizable search-based [Retrieval Augmented Generation](https://blogs.nvidia.com/blog/what-is-retrieval-augmented-generation/) application with Gradio front end. The application uses an agentic approach to significantly improve results. - +## Application Overview * First, an LLM evaluates your query for relevance to the index and then appropriately routes it (to the DB or to search by [Tavily](https://tavily.com/)) * Index relevant queries trigger a retrieval step followed by a grading step, followed by the generation step. @@ -29,7 +27,6 @@ The diagram **below** shows this agentic flow. - This agentic-RAG application is **configurable**. You can: * Change the prompts for the different components, e.g. the hallucination grader, directly within the front end. * Change the webpages and pdfs you want to use for the context in the RAG. @@ -79,7 +76,6 @@ You can get a Tavily Search API Key with a free account (1000 searches/month) [h 7. **Note:** When doing RAG, make sure you (1) upload the document AND (2) Change the Router prompt to focus on the topic of your uploaded documents. Both changes are required for successful RAG! - # Deep Dive
From bacd69fa9592522bbecbe70664541a4c2239a1ef Mon Sep 17 00:00:00 2001 From: Tyler Whitehouse Date: Mon, 3 Mar 2025 10:00:21 -0500 Subject: [PATCH 10/41] Minor changes to README - Modified files: README.md --- README.md | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 3e7abec..9419c65 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,11 @@ # Overview -This is a customizable search-based [Retrieval Augmented Generation](https://blogs.nvidia.com/blog/what-is-retrieval-augmented-generation/) application with Gradio front end. The application uses an agentic approach to significantly improve results. +This is a [Retrieval Augmented Generation](https://blogs.nvidia.com/blog/what-is-retrieval-augmented-generation/) application that uses an agentic approach to combine search with RAG, as well as hallucination and accuracy checks. It has a Gradio front end and the entire thing is customizable. -It is implemented within an [NVIDIA AI Workbench Project](https://docs.nvidia.com/ai-workbench/user-guide/latest/projects/projects.html#projects-structure) that you can clone and run locally in AI Workbench. +You can clone and use this application using [AI Workbench](https://www.nvidia.com/en-us/deep-learning-ai/solutions/data-science/workbench/) with minimal setup and **no need to do anything in a terminal**. - -**Navigation**: [Application Overview](#application-overview) | [Get Started](#get-started) | [Deep Dive](#deep-dive) | [Sizing Guide](#sizing-guide) | [License](#license) - +#### Navigating the README and Other Resources: +[Application Overview](#the-agentic-rag-application) | [Get Started](#get-started) | [Deep Dive](#deep-dive) | [Sizing Guide](#sizing-guide) | [License](#license)

@@ -16,9 +15,11 @@ It is implemented within an [NVIDIA AI Workbench Project](https://docs.nvidia.co :rotating_light: User Forum

-## Application Overview +## The Agentic RAG Application + +This application uses an agentic approach to add a few extra steps to the typical RAG pipeline. -* First, an LLM evaluates your query for relevance to the index and then appropriately routes it (to the DB or to search by [Tavily](https://tavily.com/)) +* First, an LLM evaluates your query for relevance to the index and then routes it to the DB or to search by [Tavily](https://tavily.com/). * Index relevant queries trigger a retrieval step followed by a grading step, followed by the generation step. * Index irrelevant questions go to web search which is then fed into the generation step. * Second, all generated answers are checked for hallucination and relevance, with "failing" answers (i.e. hallucinations or immaterial responses) run through the process again. @@ -36,13 +37,8 @@ This agentic-RAG application is **configurable**. You can: * Third party self-hosted microservices like Ollama. -| :memo: Remember | -| :---------------------------| -| This project is an **example** that you can **modify**. In addition to changing the prompts in the Gradio UI, you can edit the code - and do whatever you want like adding new models, changing the Gradio interface, or even changing the logic. | - - ## Get Started +This RAG is implemented within an [NVIDIA AI Workbench Project](https://docs.nvidia.com/ai-workbench/user-guide/latest/projects/projects.html#projects-structure) that you can clone (or fork and clone) to run locally in AI Workbench. #### Prerequisites @@ -75,6 +71,9 @@ You can get a Tavily Search API Key with a free account (1000 searches/month) [h 7. **Note:** When doing RAG, make sure you (1) upload the document AND (2) Change the Router prompt to focus on the topic of your uploaded documents. Both changes are required for successful RAG! +| :memo: Remember | +| :---------------------------| +| This project is a **developerment environment** that you can **modify**. In addition to changing the prompts in the Gradio UI, you can edit the code and do whatever you want like adding new models, changing the Gradio interface, or even changing the logic. | # Deep Dive From 183596eba9a1467a023a6cecd49a7ad836426569 Mon Sep 17 00:00:00 2001 From: Tyler Whitehouse Date: Mon, 3 Mar 2025 10:37:38 -0500 Subject: [PATCH 11/41] Modified README.md - Modified files: README.md --- README.md | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 9419c65..700b326 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,13 @@ # Overview -This is a [Retrieval Augmented Generation](https://blogs.nvidia.com/blog/what-is-retrieval-augmented-generation/) application that uses an agentic approach to combine search with RAG, as well as hallucination and accuracy checks. It has a Gradio front end and the entire thing is customizable. +This is a [Retrieval Augmented Generation](https://blogs.nvidia.com/blog/what-is-retrieval-augmented-generation/) application that uses an agentic approach to combine search, hallucination controls and accuracy checks with RAG. It has a Gradio front end and the entire thing is customizable. You can clone and use this application using [AI Workbench](https://www.nvidia.com/en-us/deep-learning-ai/solutions/data-science/workbench/) with minimal setup and **no need to do anything in a terminal**. 
-#### Navigating the README and Other Resources: -[Application Overview](#the-agentic-rag-application) | [Get Started](#get-started) | [Deep Dive](#deep-dive) | [Sizing Guide](#sizing-guide) | [License](#license) +**Navigating the README:** [Application Overview](#the-agentic-rag-application) | [Get Started](#get-started) | [Deep Dive](#deep-dive) | [Sizing Guide](#sizing-guide) | [License](#license) +**Other Resources:**

:arrow_down: Download AI Workbench:book: User Guide • @@ -17,8 +17,6 @@ You can clone and use this application using [AI Workbench](https://www.nvidia.c ## The Agentic RAG Application -This application uses an agentic approach to add a few extra steps to the typical RAG pipeline. - * First, an LLM evaluates your query for relevance to the index and then routes it to the DB or to search by [Tavily](https://tavily.com/). * Index relevant queries trigger a retrieval step followed by a grading step, followed by the generation step. * Index irrelevant questions go to web search which is then fed into the generation step. @@ -28,7 +26,7 @@ The diagram **below** shows this agentic flow. -This agentic-RAG application is **configurable**. You can: +This application is **configurable** from within the Gradio UI. You can: * Change the prompts for the different components, e.g. the hallucination grader, directly within the front end. * Change the webpages and pdfs you want to use for the context in the RAG. * Use different remote endpoints or self-hosted microservices for the inference components. From aace9cbab4928c8ad050d35da5609c46dfe4ed78 Mon Sep 17 00:00:00 2001 From: Tyler Whitehouse Date: Mon, 3 Mar 2025 10:40:40 -0500 Subject: [PATCH 12/41] Modified README.md - Modified files: README.md --- README.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 700b326..7f55409 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,9 @@ The diagram **below** shows this agentic flow. -This application is **configurable** from within the Gradio UI. You can: +This application is **configurable** from within the Gradio UI. + +You can: * Change the prompts for the different components, e.g. the hallucination grader, directly within the front end. * Change the webpages and pdfs you want to use for the context in the RAG. * Use different remote endpoints or self-hosted microservices for the inference components. @@ -34,6 +36,9 @@ This application is **configurable** from within the Gradio UI. You can: * Self-hosted endpoints using [NVIDIA Inference Microservices (NIMs)](https://catalog.ngc.nvidia.com/orgs/nim/teams/meta/containers/llama3-8b-instruct/tags) * Third party self-hosted microservices like Ollama. +| :memo: Modify this Application!!! | +| :---------------------------| +| This project is a development environment. You can edit the code to change things, like adding new models, changing the Gradio interface, or even changing the logic. | ## Get Started This RAG is implemented within an [NVIDIA AI Workbench Project](https://docs.nvidia.com/ai-workbench/user-guide/latest/projects/projects.html#projects-structure) that you can clone (or fork and clone) to run locally in AI Workbench. @@ -69,9 +74,6 @@ You can get a Tavily Search API Key with a free account (1000 searches/month) [h 7. **Note:** When doing RAG, make sure you (1) upload the document AND (2) Change the Router prompt to focus on the topic of your uploaded documents. Both changes are required for successful RAG! -| :memo: Remember | -| :---------------------------| -| This project is a **developerment environment** that you can **modify**. In addition to changing the prompts in the Gradio UI, you can edit the code and do whatever you want like adding new models, changing the Gradio interface, or even changing the logic. 
| # Deep Dive From dd751d011e7e3f632d5eceb2f920233a5cdaa227 Mon Sep 17 00:00:00 2001 From: Tyler Whitehouse Date: Mon, 3 Mar 2025 10:43:09 -0500 Subject: [PATCH 13/41] Modified README.md - Modified files: README.md --- README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 7f55409..c9898ec 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,10 @@ # Overview -This is a [Retrieval Augmented Generation](https://blogs.nvidia.com/blog/what-is-retrieval-augmented-generation/) application that uses an agentic approach to combine search, hallucination controls and accuracy checks with RAG. It has a Gradio front end and the entire thing is customizable. +This is a [Retrieval Augmented Generation](https://blogs.nvidia.com/blog/what-is-retrieval-augmented-generation/) application that uses an agentic approach to combine web search, hallucination controls and accuracy checks with RAG. It has a Gradio front end and the entire thing is customizable. -You can clone and use this application using [AI Workbench](https://www.nvidia.com/en-us/deep-learning-ai/solutions/data-science/workbench/) with minimal setup and **no need to do anything in a terminal**. +You can clone and use this application using [AI Workbench](https://www.nvidia.com/en-us/deep-learning-ai/solutions/data-science/workbench/) with minimal setup and **no** terminal steps. -**Navigating the README:** [Application Overview](#the-agentic-rag-application) | [Get Started](#get-started) | [Deep Dive](#deep-dive) | [Sizing Guide](#sizing-guide) | [License](#license) +**Navigating the README:** [Application Overview](#the-agentic-rag-application) | [Get Started](#get-started) | [Deep Dive](#deep-dive) | [Self-Hosted Sizing Guide](#self-hosted-sizing-guide) | [License](#license) **Other Resources:** @@ -38,7 +38,7 @@ You can: | :memo: Modify this Application!!! | | :---------------------------| -| This project is a development environment. You can edit the code to change things, like adding new models, changing the Gradio interface, or even changing the logic. | +| This isn't just an application. It's a development environment. You can edit the code to change things, like adding new models, changing the Gradio interface, or even changing the logic. | ## Get Started This RAG is implemented within an [NVIDIA AI Workbench Project](https://docs.nvidia.com/ai-workbench/user-guide/latest/projects/projects.html#projects-structure) that you can clone (or fork and clone) to run locally in AI Workbench. @@ -166,7 +166,7 @@ This tab holds the agentic RAG monitoring tools built into this application.

-## Sizing Guide +## Self-Hosted Sizing Guide | GPU VRAM | Example Hardware | Compatible? | | -------- | ------- | ------- | From bae8f2e55532abe5b7c41f2452a7dfe2359fe5e5 Mon Sep 17 00:00:00 2001 From: Tyler Whitehouse Date: Mon, 3 Mar 2025 10:54:03 -0500 Subject: [PATCH 14/41] Modified urls in README to open in in new tab. - Modified files: README.md --- README.md | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index c9898ec..3b9fc7d 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # Overview -This is a [Retrieval Augmented Generation](https://blogs.nvidia.com/blog/what-is-retrieval-augmented-generation/) application that uses an agentic approach to combine web search, hallucination controls and accuracy checks with RAG. It has a Gradio front end and the entire thing is customizable. +This is a [Retrieval Augmented Generation](https://blogs.nvidia.com/blog/what-is-retrieval-augmented-generation/) application. It uses an agentic approach to combine web search, hallucination controls and accuracy checks with RAG. It has a Gradio front end. -You can clone and use this application using [AI Workbench](https://www.nvidia.com/en-us/deep-learning-ai/solutions/data-science/workbench/) with minimal setup and **no** terminal steps. +You can clone, use and customize this application using [AI Workbench](https://www.nvidia.com/en-us/deep-learning-ai/solutions/data-science/workbench/) with minimal setup and **no** terminal steps. **Navigating the README:** [Application Overview](#the-agentic-rag-application) | [Get Started](#get-started) | [Deep Dive](#deep-dive) | [Self-Hosted Sizing Guide](#self-hosted-sizing-guide) | [License](#license) @@ -9,7 +9,7 @@ You can clone and use this application using [AI Workbench](https://www.nvidia.c **Other Resources:**
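The sizing guide above keys self-hosting compatibility off GPU VRAM. A quick way to check what a machine actually reports is a standard `nvidia-smi` query (nothing project-specific), wrapped here in Python:

```python
# Quick check of GPU name and total VRAM via a standard nvidia-smi query.
# Nothing here is specific to this project.
import subprocess

result = subprocess.run(
    ["nvidia-smi", "--query-gpu=name,memory.total", "--format=csv,noheader"],
    capture_output=True, text=True, check=True,
)
print(result.stdout)   # e.g. "NVIDIA A10G, 23028 MiB"
```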

- :arrow_down: Download AI Workbench • + :arrow_down: Download AI Workbench:book: User Guide:open_file_folder: Other Projects:rotating_light: User Forum @@ -17,28 +17,25 @@ You can clone and use this application using [AI Workbench](https://www.nvidia.c ## The Agentic RAG Application -* First, an LLM evaluates your query for relevance to the index and then routes it to the DB or to search by [Tavily](https://tavily.com/). - * Index relevant queries trigger a retrieval step followed by a grading step, followed by the generation step. - * Index irrelevant questions go to web search which is then fed into the generation step. -* Second, all generated answers are checked for hallucination and relevance, with "failing" answers (i.e. hallucinations or immaterial responses) run through the process again. +1. You embed your documents, pdfs or webpages, to the vector database. +2. You configure the prompts for the different components, e.g. the router or retrieval grader. +3. You submit your query. +4. An LLM evaluates your query for relevance to the index and then routes it to the DB or to search by [Tavily](https://tavily.com/). +5. Answers are checked for hallucination and relevance. "Failing"" answers are run through the process again. The diagram **below** shows this agentic flow. -This application is **configurable** from within the Gradio UI. -You can: +#### Customize the Application * Change the prompts for the different components, e.g. the hallucination grader, directly within the front end. * Change the webpages and pdfs you want to use for the context in the RAG. * Use different remote endpoints or self-hosted microservices for the inference components. * Cloud endpoints using endpoints from [build.nvidia.com](https://build.nvidia.com/explore/discover) * Self-hosted endpoints using [NVIDIA Inference Microservices (NIMs)](https://catalog.ngc.nvidia.com/orgs/nim/teams/meta/containers/llama3-8b-instruct/tags) * Third party self-hosted microservices like Ollama. - -| :memo: Modify this Application!!! | -| :---------------------------| -| This isn't just an application. It's a development environment. You can edit the code to change things, like adding new models, changing the Gradio interface, or even changing the logic. | +* Modify the application code to add new models, change the Gradio interface, or even change the logic. ## Get Started This RAG is implemented within an [NVIDIA AI Workbench Project](https://docs.nvidia.com/ai-workbench/user-guide/latest/projects/projects.html#projects-structure) that you can clone (or fork and clone) to run locally in AI Workbench. From 6898dd78b711428467881e1e162c2777f4651bd0 Mon Sep 17 00:00:00 2001 From: Tyler Whitehouse Date: Mon, 3 Mar 2025 10:55:37 -0500 Subject: [PATCH 15/41] Trying links again. - Modified files: README.md --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 3b9fc7d..3ba2820 100644 --- a/README.md +++ b/README.md @@ -10,9 +10,9 @@ You can clone, use and customize this application using [AI Workbench](https://w **Other Resources:**

:arrow_down: Download AI Workbench • - :book: User Guide • - :open_file_folder: Other Projects • - :rotating_light: User Forum + :book: User Guide • + :open_file_folder: Other Projects • + :rotating_light: User Forum

## The Agentic RAG Application From 5ee7422a10584a4d420b2b5623408df48ad1a533 Mon Sep 17 00:00:00 2001 From: Tyler Whitehouse Date: Mon, 3 Mar 2025 10:57:47 -0500 Subject: [PATCH 16/41] Modified README.md - Modified files: README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3ba2820..35b4693 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ You can clone, use and customize this application using [AI Workbench](https://w **Navigating the README:** [Application Overview](#the-agentic-rag-application) | [Get Started](#get-started) | [Deep Dive](#deep-dive) | [Self-Hosted Sizing Guide](#self-hosted-sizing-guide) | [License](#license) -**Other Resources:** +**Other Resources:** Right click to open in new tab

:arrow_down: Download AI Workbench:book: User Guide • From 711a4b4ee7453c35130ad426ef77a70d76e70723 Mon Sep 17 00:00:00 2001 From: Tyler Whitehouse Date: Mon, 3 Mar 2025 12:04:26 -0500 Subject: [PATCH 17/41] Modified README.md - Modified files: README.md --- README.md | 47 +++++++++++++++++------------------------------ 1 file changed, 17 insertions(+), 30 deletions(-) diff --git a/README.md b/README.md index 35b4693..3055175 100644 --- a/README.md +++ b/README.md @@ -1,22 +1,14 @@ # Overview -This is a [Retrieval Augmented Generation](https://blogs.nvidia.com/blog/what-is-retrieval-augmented-generation/) application. It uses an agentic approach to combine web search, hallucination controls and accuracy checks with RAG. It has a Gradio front end. +This is a [Retrieval Augmented Generation](https://blogs.nvidia.com/blog/what-is-retrieval-augmented-generation/) application. It uses an agentic approach to combine web search, hallucination controls and accuracy checks with RAG. It has a Gradio front end. You can clone, use and customize this application using [AI Workbench](https://www.nvidia.com/en-us/deep-learning-ai/solutions/data-science/workbench/) with minimal setup and **no** terminal steps. -You can clone, use and customize this application using [AI Workbench](https://www.nvidia.com/en-us/deep-learning-ai/solutions/data-science/workbench/) with minimal setup and **no** terminal steps. - -**Navigating the README:** [Application Overview](#the-agentic-rag-application) | [Get Started](#get-started) | [Deep Dive](#deep-dive) | [Self-Hosted Sizing Guide](#self-hosted-sizing-guide) | [License](#license) +*Navigating the README:* [Application Overview](#the-agentic-rag-application) | [Get Started](#get-started) | [Deep Dive](#deep-dive) | [Self-Hosted Sizing Guide](#self-hosted-sizing-guide) | [License](#license) -**Other Resources:** Right click to open in new tab -

- :arrow_down: Download AI Workbench • - :book: User Guide • - :open_file_folder: Other Projects • - :rotating_light: User Forum -

+*Other Resources:* [:arrow_down: Download AI Workbench](https://www.nvidia.com/en-us/deep-learning-ai/solutions/data-science/workbench/) | [:book: User Guide](https://docs.nvidia.com/ai-workbench/) |[:open_file_folder: Other Projects](https://docs.nvidia.com/ai-workbench/user-guide/latest/quickstart/example-projects.html) | [:rotating_light: User Forum](https://forums.developer.nvidia.com/t/support-workbench-example-project-agentic-rag/303414) ## The Agentic RAG Application - +#### Using the Application 1. You embed your documents, pdfs or webpages, to the vector database. 2. You configure the prompts for the different components, e.g. the router or retrieval grader. 3. You submit your query. @@ -28,7 +20,7 @@ The diagram **below** shows this agentic flow. -#### Customize the Application +#### Customizing the Application * Change the prompts for the different components, e.g. the hallucination grader, directly within the front end. * Change the webpages and pdfs you want to use for the context in the RAG. * Use different remote endpoints or self-hosted microservices for the inference components. @@ -40,40 +32,36 @@ The diagram **below** shows this agentic flow. ## Get Started This RAG is implemented within an [NVIDIA AI Workbench Project](https://docs.nvidia.com/ai-workbench/user-guide/latest/projects/projects.html#projects-structure) that you can clone (or fork and clone) to run locally in AI Workbench. -#### Prerequisites +#### Quickstart Prerequisites 1. You need NVIDIA AI Workbench installed. [See here for how to install it.](https://docs.nvidia.com/ai-workbench/user-guide/latest/installation/overview.html) 2. You need an NVIDIA Personal key to use build.nvidia.com endpoints, as well as a [Tavily](https://tavily.com/) API key to use their web search. + * You can get an NVIDIA Personal API Key on any model card, [e.g. Llama-3.3-70B-instruct](https://build.nvidia.com/meta/llama-3_3-70b-instruct?signin=true&api_key=true) + * You can get a Tavily Search API Key with a free account (1000 searches/month) [here](https://tavily.com/). -You can get an NVIDIA ``Get API Key`` on any model card, [e.g. Llama-3.3-70B-instruct](https://build.nvidia.com/meta/llama-3_3-70b-instruct?signin=true&api_key=true) - -You can get a Tavily Search API Key with a free account (1000 searches/month) [here](https://tavily.com/). +3. You should have some pdfs or webpages to use for the RAG. +4. You should fork this project to your own GitHub namespace and **copy the link** to the forked repository. #### Opening the Chat - -1. Fork this Project to your own GitHub namespace and copy the link - - ``` - https://github.com/[your_namespace]/ - ``` -2. Open NVIDIA AI Workbench. Select a [location to work in](https://docs.nvidia.com/ai-workbench/user-guide/latest/locations/locations.html). +1. Open NVIDIA AI Workbench. Select a [location to work in](https://docs.nvidia.com/ai-workbench/user-guide/latest/locations/locations.html). -3. Clone this Project (in Workbench, not with Git), and wait for the project container to build. +2. Clone this project with the link you copied in step 4 (in Workbench, not with Git), and wait for the project container to build. -4. When the build completes, set the following configurations. +3. When the build completes, set the following configurations. * `Environment` → `Secrets` → `Configure`. Specify the NVIDIA API Key and Tavily Search Key as project secrets. -6. Open the **Chat** from Workbench and the chat UI should automatically open in a new browser tab. Happy chatting! +4. 
Open the **Chat** from Workbench. It should automatically open in a new browser tab. -7. **Note:** When doing RAG, make sure you (1) upload the document AND (2) Change the Router prompt to focus on the topic of your uploaded documents. Both changes are required for successful RAG! +5. Upload your documents and change the Router prompt to focus on your uploaded documents. +6. Chat with your documents! # Deep Dive - +If you want to learn more about the RAG pipeline, expand the sections below.
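The steps above cover day-to-day use; the control flow behind them is small enough to sketch. Below is a minimal illustration of the loop the README describes (route, retrieve, grade, generate, check for hallucination, check the answer, retry or fall back to web search). Every helper is a hypothetical stand-in for one of the LLM components; the project's real graph lives under `code/chatui` and is more involved than this.

```python
# Minimal sketch of the agentic decision loop described above.
# All helpers are illustrative stubs, not the project's actual API.

def route_query(q): return "vectorstore"         # router: "vectorstore" | "web_search"
def retrieve(q): return ["doc about agents"]     # similarity search against the vector DB
def web_search(q): return ["web search result"]  # Tavily web search
def grade_document(q, d): return True            # retrieval grader
def generate(q, docs): return "an answer"        # generator
def is_grounded(ans, docs): return True          # hallucination grader
def answers_question(q, ans): return True        # answer grader

def answer(query: str, max_retries: int = 3) -> str:
    if route_query(query) == "vectorstore":
        docs = [d for d in retrieve(query) if grade_document(query, d)]
        if not docs:                    # nothing relevant: fall back to web search
            docs = web_search(query)
    else:
        docs = web_search(query)

    generation = ""
    for _ in range(max_retries):
        generation = generate(query, docs)
        if not is_grounded(generation, docs):      # hallucination: regenerate
            continue
        if answers_question(query, generation):    # accuracy check passed
            return generation
        docs = web_search(query)                   # off-topic answer: try fresh context
    return generation

print(answer("tell me about agents"))
```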
@@ -100,7 +88,6 @@ After generation, a set of LLMs calls evaluate the response for hallucinations a
-
Expand this section for a full guide of the user-configurable project settings From cad55a1a7fd3e6c38c54389211c41983cb6d045a Mon Sep 17 00:00:00 2001 From: JT Casablanca Date: Thu, 6 Mar 2025 08:30:09 -0500 Subject: [PATCH 18/41] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3055175..9d3a8d2 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,7 @@ The diagram **below** shows this agentic flow. ## Get Started This RAG is implemented within an [NVIDIA AI Workbench Project](https://docs.nvidia.com/ai-workbench/user-guide/latest/projects/projects.html#projects-structure) that you can clone (or fork and clone) to run locally in AI Workbench. -#### Quickstart Prerequisites +#### Prerequisites 1. You need NVIDIA AI Workbench installed. [See here for how to install it.](https://docs.nvidia.com/ai-workbench/user-guide/latest/installation/overview.html) From d08deb730d732e56ed5aba02b70dc7e7dfa8c82b Mon Sep 17 00:00:00 2001 From: Tyler Whitehouse Date: Sat, 8 Mar 2025 11:40:15 -0500 Subject: [PATCH 19/41] Fixed embedding string issues in database and added data to gitignore. - Updated storage in layout 'data/' - Deleted layout 'data/scratch/' - Modified files: .gitignore, code/chatui/utils/database.py, code/output.log --- .gitignore | 3 ++- .project/spec.yaml | 4 +-- code/chatui/utils/database.py | 8 +++--- code/output.log | 50 +++++------------------------------ 4 files changed, 14 insertions(+), 51 deletions(-) diff --git a/.gitignore b/.gitignore index 437cb38..0f5e848 100644 --- a/.gitignore +++ b/.gitignore @@ -54,4 +54,5 @@ cover/ # Workbench Project Layout data/scratch/* -!data/scratch/.gitkeep \ No newline at end of file +!data/scratch/.gitkeep +data/chroma.sqlite3 \ No newline at end of file diff --git a/.project/spec.yaml b/.project/spec.yaml index 8aa0d8b..df8c308 100644 --- a/.project/spec.yaml +++ b/.project/spec.yaml @@ -15,9 +15,6 @@ layout: type: models storage: gitlfs - path: data/ - type: data - storage: gitlfs - - path: data/scratch/ type: data storage: gitignore environment: @@ -109,6 +106,7 @@ environment: package_manager_environment: name: "" target: "" + compose_file_path: "" execution: apps: - name: Visual Studio Code diff --git a/code/chatui/utils/database.py b/code/chatui/utils/database.py index f251570..7260db2 100644 --- a/code/chatui/utils/database.py +++ b/code/chatui/utils/database.py @@ -49,7 +49,7 @@ def upload(urls: List[str]): vectorstore = Chroma.from_documents( documents=doc_splits, collection_name="rag-chroma", - embedding=NVIDIAEmbeddings(model='EMBEDDINGS_MODEL'), + embedding=NVIDIAEmbeddings(model=EMBEDDINGS_MODEL), persist_directory="/project/data", ) return vectorstore @@ -68,7 +68,7 @@ def upload_pdf(documents: List[str]): vectorstore = Chroma.from_documents( documents=doc_splits, collection_name="rag-chroma", - embedding=NVIDIAEmbeddings(model='EMBEDDINGS_MODEL'), + embedding=NVIDIAEmbeddings(model=EMBEDDINGS_MODEL), persist_directory="/project/data", ) return vectorstore @@ -77,7 +77,7 @@ def clear(): """ This is a helper function for emptying the collection the vector store. """ vectorstore = Chroma( collection_name="rag-chroma", - embedding_function=NVIDIAEmbeddings(model='EMBEDDINGS_MODEL'), + embedding_function=NVIDIAEmbeddings(model=EMBEDDINGS_MODEL), persist_directory="/project/data", ) @@ -88,7 +88,7 @@ def get_retriever(): """ This is a helper function for returning the retriever object of the vector store. 
""" vectorstore = Chroma( collection_name="rag-chroma", - embedding_function=NVIDIAEmbeddings(model='EMBEDDINGS_MODEL'), + embedding_function=NVIDIAEmbeddings(model=EMBEDDINGS_MODEL), persist_directory="/project/data", ) retriever = vectorstore.as_retriever() diff --git a/code/output.log b/code/output.log index 039865b..5812432 100644 --- a/code/output.log +++ b/code/output.log @@ -5,52 +5,16 @@ To create a public link, set `share=True` in `launch()`. IMPORTANT: You are using gradio version 4.15.0, however version 4.44.1 is available, please upgrade. -------- ---ROUTE QUESTION--- -hello -{'datasource': 'web_search'} ----ROUTE QUESTION TO WEB SEARCH--- +tell me about agents. +{'datasource': 'vectorstore'} +---ROUTE QUESTION TO RAG--- +---RETRIEVE--- +---CHECK DOCUMENT RELEVANCE TO QUESTION--- +---ASSESS GRADED DOCUMENTS--- +---DECISION: ALL DOCUMENTS ARE NOT RELEVANT TO QUESTION, INCLUDE WEB SEARCH--- ---WEB SEARCH--- ---GENERATE--- ---CHECK HALLUCINATIONS--- ---DECISION: GENERATION IS GROUNDED IN DOCUMENTS--- ---GRADE GENERATION vs QUESTION--- ---DECISION: GENERATION ADDRESSES QUESTION--- ----ROUTE QUESTION--- -i changed the end point. -{'datasource': 'web_search'} ----ROUTE QUESTION TO WEB SEARCH--- ----WEB SEARCH--- ----GENERATE--- ----CHECK HALLUCINATIONS--- ----DECISION: GENERATION IS GROUNDED IN DOCUMENTS--- ----GRADE GENERATION vs QUESTION--- ----DECISION: GENERATION DOES NOT ADDRESS QUESTION--- ----WEB SEARCH--- ----GENERATE--- ----CHECK HALLUCINATIONS--- ----DECISION: GENERATION IS GROUNDED IN DOCUMENTS--- ----GRADE GENERATION vs QUESTION--- ----DECISION: GENERATION DOES NOT ADDRESS QUESTION--- ----WEB SEARCH--- ----GENERATE--- ----CHECK HALLUCINATIONS--- ----DECISION: GENERATION IS GROUNDED IN DOCUMENTS--- ----GRADE GENERATION vs QUESTION--- ----DECISION: GENERATION DOES NOT ADDRESS QUESTION--- ----WEB SEARCH--- ----GENERATE--- ----CHECK HALLUCINATIONS--- ----DECISION: GENERATION IS GROUNDED IN DOCUMENTS--- ----GRADE GENERATION vs QUESTION--- ----DECISION: GENERATION DOES NOT ADDRESS QUESTION--- ----WEB SEARCH--- ----GENERATE--- ----CHECK HALLUCINATIONS--- ----DECISION: GENERATION IS NOT GROUNDED IN DOCUMENTS, RE-TRY--- ----GENERATE--- ----CHECK HALLUCINATIONS--- ----DECISION: GENERATION IS NOT GROUNDED IN DOCUMENTS, RE-TRY--- ----GENERATE--- ----CHECK HALLUCINATIONS--- ----DECISION: GENERATION IS GROUNDED IN DOCUMENTS--- ----GRADE GENERATION vs QUESTION--- ----DECISION: GENERATION ADDRESSES QUESTION--- From 48a95540bef8f16b8b69cc724a83f9ffd924ce6c Mon Sep 17 00:00:00 2001 From: JT Casablanca Date: Sat, 8 Mar 2025 11:58:15 -0500 Subject: [PATCH 20/41] Revised README to fit with downloadable nim readme. still changes needed to deep dive. --- README.md | 60 ++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 37 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index 9d3a8d2..b7f37c3 100644 --- a/README.md +++ b/README.md @@ -1,15 +1,24 @@ -# Overview -This is a [Retrieval Augmented Generation](https://blogs.nvidia.com/blog/what-is-retrieval-augmented-generation/) application. It uses an agentic approach to combine web search, hallucination controls and accuracy checks with RAG. It has a Gradio front end. You can clone, use and customize this application using [AI Workbench](https://www.nvidia.com/en-us/deep-learning-ai/solutions/data-science/workbench/) with minimal setup and **no** terminal steps. 
+# Overview: An Easy Button for +This [Retrieval Augmented Generation](https://blogs.nvidia.com/blog/what-is-retrieval-augmented-generation/) application uses an agentic approach to combine web search, hallucination controls and accuracy checks with RAG. Built with a Gradio front end, you can run it with AI Workbench without any complex setup. +The only requirement is curiosity. You don't need to be a developer or an expert. *Navigating the README:* [Application Overview](#the-agentic-rag-application) | [Get Started](#get-started) | [Deep Dive](#deep-dive) | [Self-Hosted Sizing Guide](#self-hosted-sizing-guide) | [License](#license) *Other Resources:* [:arrow_down: Download AI Workbench](https://www.nvidia.com/en-us/deep-learning-ai/solutions/data-science/workbench/) | [:book: User Guide](https://docs.nvidia.com/ai-workbench/) |[:open_file_folder: Other Projects](https://docs.nvidia.com/ai-workbench/user-guide/latest/quickstart/example-projects.html) | [:rotating_light: User Forum](https://forums.developer.nvidia.com/t/support-workbench-example-project-agentic-rag/303414) +
+ + +> **Note:** +> You may want to [**fork**](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/fork-a-repo#forking-a-repository) this repository into your own account before proceeding. Otherwise you won't be able to fully retain any changes you make because this NVIDIA owned repository is **read-only**. + +
+ ## The Agentic RAG Application #### Using the Application -1. You embed your documents, pdfs or webpages, to the vector database. +1. You embed your documents (pdfs or webpages) to the vector database. 2. You configure the prompts for the different components, e.g. the router or retrieval grader. 3. You submit your query. 4. An LLM evaluates your query for relevance to the index and then routes it to the DB or to search by [Tavily](https://tavily.com/). @@ -20,45 +29,50 @@ The diagram **below** shows this agentic flow. -#### Customizing the Application -* Change the prompts for the different components, e.g. the hallucination grader, directly within the front end. -* Change the webpages and pdfs you want to use for the context in the RAG. -* Use different remote endpoints or self-hosted microservices for the inference components. +#### Modifying the Application + +* You can change the prompts for the different components, e.g. the hallucination grader, directly within the front end. +* You can change the webpages and pdfs you want to use for the context in the RAG. +* You can select different endpoints for the inference components. * Cloud endpoints using endpoints from [build.nvidia.com](https://build.nvidia.com/explore/discover) - * Self-hosted endpoints using [NVIDIA Inference Microservices (NIMs)](https://catalog.ngc.nvidia.com/orgs/nim/teams/meta/containers/llama3-8b-instruct/tags) - * Third party self-hosted microservices like Ollama. -* Modify the application code to add new models, change the Gradio interface, or even change the logic. + * *Advanced: Self-hosted endpoints using [NVIDIA Inference Microservices (NIMs)](https://catalog.ngc.nvidia.com/orgs/nim/teams/meta/containers/llama3-8b-instruct/tags)* + * *Advanced: Self-hosted endpoints with Ollama.* +* You can modify the application code to add new models, add your own endpoints, change the Gradio interface, or even change the logic. ## Get Started -This RAG is implemented within an [NVIDIA AI Workbench Project](https://docs.nvidia.com/ai-workbench/user-guide/latest/projects/projects.html#projects-structure) that you can clone (or fork and clone) to run locally in AI Workbench. +This app runs in [NVIDIA AI Workbench Project](https://docs.nvidia.com/ai-workbench/user-guide/latest/overview/introduction.html). -#### Prerequisites +#### Prerequisites for Using build.nvidia.com Endpoints -1. You need NVIDIA AI Workbench installed. [See here for how to install it.](https://docs.nvidia.com/ai-workbench/user-guide/latest/installation/overview.html) +1. Install [AI Workbench](https://docs.nvidia.com/ai-workbench/user-guide/latest/installation/overview.html). -2. You need an NVIDIA Personal key to use build.nvidia.com endpoints, as well as a [Tavily](https://tavily.com/) API key to use their web search. - * You can get an NVIDIA Personal API Key on any model card, [e.g. Llama-3.3-70B-instruct](https://build.nvidia.com/meta/llama-3_3-70b-instruct?signin=true&api_key=true) - * You can get a Tavily Search API Key with a free account (1000 searches/month) [here](https://tavily.com/). +2. Get an NVIDIA Developer Account and an API key. + * Go to [build.nvidia.com](https://build.nvidia.com/) and click `Login`. + * Create account, verify email. + * Make a Cloud Account. + * Click your initial > `API Keys`. + * Create and save your key. -3. You should have some pdfs or webpages to use for the RAG. +3. Get a Tavily account and an API key. + * Go to [Tavily](https://tavily.com/) and create an account. + * Create an API key on the overview page. 
+ +4. Have some pdfs or web pages to put in the RAG. -4. You should fork this project to your own GitHub namespace and **copy the link** to the forked repository. #### Opening the Chat 1. Open NVIDIA AI Workbench. Select a [location to work in](https://docs.nvidia.com/ai-workbench/user-guide/latest/locations/locations.html). -2. Clone this project with the link you copied in step 4 (in Workbench, not with Git), and wait for the project container to build. +2. Use the repository URL to clone this project with AI Workbench and wait for it to build. -3. When the build completes, set the following configurations. - - * `Environment` → `Secrets` → `Configure`. Specify the NVIDIA API Key and Tavily Search Key as project secrets. +3. Add your NVIDIA API key and the Tavily API key when prompted. 4. Open the **Chat** from Workbench. It should automatically open in a new browser tab. 5. Upload your documents and change the Router prompt to focus on your uploaded documents. -6. Chat with your documents! +6. Start chatting. # Deep Dive If you want to learn more about the RAG pipeline, expand the sections below. From 2f33c6f7a6e5c04d548c69a5e9bd58802999be4b Mon Sep 17 00:00:00 2001 From: Tyler Whitehouse Date: Sat, 8 Mar 2025 12:29:25 -0500 Subject: [PATCH 21/41] Modified README.md - Modified files: README.md --- README.md | 97 +++++-------------------------------------------------- 1 file changed, 8 insertions(+), 89 deletions(-) diff --git a/README.md b/README.md index b7f37c3..28d7cc9 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,11 @@ -# Overview: An Easy Button for +# Overview: An Easy Button for Agentic RAG This [Retrieval Augmented Generation](https://blogs.nvidia.com/blog/what-is-retrieval-augmented-generation/) application uses an agentic approach to combine web search, hallucination controls and accuracy checks with RAG. Built with a Gradio front end, you can run it with AI Workbench without any complex setup. -The only requirement is curiosity. You don't need to be a developer or an expert. +> **Note** +>This app runs in [NVIDIA AI Workbench](https://docs.nvidia.com/ai-workbench/user-guide/latest/overview/introduction.html). Workbench is a lightweight developer platform that you can run on your own systems for free. It's an easy way to get up and running with complex AI applications and workloads in a short amount of time. + +> The only requirement is curiosity. You don't need to be a developer or an expert. + *Navigating the README:* [Application Overview](#the-agentic-rag-application) | [Get Started](#get-started) | [Deep Dive](#deep-dive) | [Self-Hosted Sizing Guide](#self-hosted-sizing-guide) | [License](#license) @@ -40,7 +44,6 @@ The diagram **below** shows this agentic flow. * You can modify the application code to add new models, add your own endpoints, change the Gradio interface, or even change the logic. ## Get Started -This app runs in [NVIDIA AI Workbench Project](https://docs.nvidia.com/ai-workbench/user-guide/latest/overview/introduction.html). #### Prerequisites for Using build.nvidia.com Endpoints @@ -74,95 +77,11 @@ This app runs in [NVIDIA AI Workbench Project](https://docs.nvidia.com/ai-workbe 6. Start chatting. -# Deep Dive -If you want to learn more about the RAG pipeline, expand the sections below. -
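The prerequisite steps above assume the two keys end up visible inside the project container. A small sanity check you can run before opening the Chat app is sketched below; the environment variable names are assumptions here, so use whatever names the project maps its secrets to:

```python
# Optional sanity check that both project secrets are visible in the container.
# The variable names below are assumed for illustration.
import os

for var in ("NVIDIA_API_KEY", "TAVILY_API_KEY"):
    if not os.environ.get(var):
        print(f"{var} is missing - add it under Environment > Secrets in AI Workbench")
```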
-
- - RAG Pipeline Description - - -Under the retrieval pipeline, the user query is first compared to documents in the vector database and the most relevant documents are retrieved. - -Another LLM call evaluates the quality of the documents. If satisfactory, it proceeds to the generation phase to produce an response augmented by this relevant context. If the agent decides the best documents are irrelevant to the query, it redirects the user query to the websearch pipeline for a better quality response (see below section). - -After generation, another set of LLMs calls evaluate the response for hallucinations and accuracy. If the generation is both faithful to the retrieved context and answers the user's query in a satisfactory manner, the response is forwarded to the user and displayed. Otherwise, the agent will either regenerate the response, or redirect the query to a web search. - -
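The document-grading step described above is essentially one LLM call per retrieved document that returns a yes/no verdict. A minimal illustration of what such a grader can look like is below; the prompt wording and model id are assumptions, not the project's actual grader:

```python
# Illustrative relevance grader: one LLM call per document, yes/no answer.
# Model id and prompt text are placeholders.
from langchain_nvidia_ai_endpoints import ChatNVIDIA

grader = ChatNVIDIA(model="meta/llama-3.1-8b-instruct", temperature=0)

def grade_document(question: str, document: str) -> bool:
    prompt = (
        "You are grading whether a retrieved document is relevant to a question.\n"
        f"Question: {question}\nDocument: {document}\n"
        "Answer with a single word: yes or no."
    )
    return grader.invoke(prompt).content.strip().lower().startswith("yes")
```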
- -
- - Websearch Pipeline Description - - -Under the web search pipeline, the user query is inputted onto the web and the search results are retrieved. Using these results, a response is generated. - -After generation, a set of LLMs calls evaluate the response for hallucinations and accuracy. If the generation is both faithful to the retrieved context and answers the user's query in a satisfactory manner, the response is forwarded to the user and displayed. Otherwise, the agent will either regenerate the response, or redirect the query to another web search. - -
-
- -
- -Expand this section for a full guide of the user-configurable project settings - - - - -When the user lands on the Chat UI application in the browser, they will see several components. On the left hand side is a standard chatbot user interface with an input box for queries (submittable with ``[ENTER]``) and a clear history button. Above this chatbot is a diagram of the agentic RAG pipeline which doubles as a progress bar indicator for any nontrivial user actions a user might take, like uploading a document. - -On the right hand side, users will see a collapsable settings panel with several tabs they may choose to navigate to and configure. - -
-
- -Expand for Model Settings. - - - - -This tab holds every user-configurable setting for each of the LLM components of the agentic RAG pipeline: - -* Router -* Retrieval Grader -* Generator -* Hallucination Grader -* Answer Grader - -Expanding any such entry will yield a panel where users can specify the model they would like to use for that particular component from a dropdown (using NVIDIA API Catalog endpoints), or they can specify their own remotely running self-hosted NVIDIA NIM custom endpoint. - -Below this field is an expandable accordion where users can adjust the default prompts for that particular component's task. For example, under the Router component, users can re-write and customize their prompt to focus on only routing queries relating to LLMs and agents to the RAG pipeline and directing all other queries to the Websearch pipeline. - -
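The Model Settings described above let each component point at a remotely running self-hosted NIM instead of an API Catalog endpoint. Connecting to one from LangChain typically looks like the sketch below; the host, port, and model id are placeholders for whatever your own endpoint serves:

```python
# Sketch of pointing a component at a self-hosted, OpenAI-compatible NIM endpoint.
# Replace host, port, and model id with your own deployment's values.
from langchain_nvidia_ai_endpoints import ChatNVIDIA

llm = ChatNVIDIA(
    base_url="http://your-gpu-host:8000/v1",  # wherever the NIM container is serving
    model="meta/llama3-8b-instruct",          # must match the model the NIM hosts
    temperature=0,
)
print(llm.invoke("Hello from a self-hosted endpoint").content)
```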
- -
- -Expand for Document Settings. - - - - -This tab holds every user-configurable setting for the vector database and document ingestion aspects of this agentic RAG pipeline. Users can upload their own webpages to the vector database by entering a newline-seperated list of URLs in the textbox and clicking Upload, or they can upload their own PDF files from their local machine to be stored in the vector datastore. - -
- - -
- -Expand for Monitoring Settings. - - - - -This tab holds the agentic RAG monitoring tools built into this application. +## Deep Dive +#### Using a Self-Hosted Endpoint -* The first tool is a console that logs all the actions the agent has decided to take when processing the user query and provides a general overview into the agent's decision making. -* The second tool is an in-depth trace of the agent's actions for the last submitted query, which gives more detail into the context retrieved, websearch documents found, LLM pipeline components used, etc. when generating out the most recent response. -
-
-
## Self-Hosted Sizing Guide From d460a70e66e34d807f9c3a4733b137d1c406e07a Mon Sep 17 00:00:00 2001 From: Tyler Whitehouse Date: Sat, 8 Mar 2025 12:38:32 -0500 Subject: [PATCH 22/41] minor revisions to read me. - Modified files: README.md --- README.md | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 28d7cc9..48887a4 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ # Overview: An Easy Button for Agentic RAG -This [Retrieval Augmented Generation](https://blogs.nvidia.com/blog/what-is-retrieval-augmented-generation/) application uses an agentic approach to combine web search, hallucination controls and accuracy checks with RAG. Built with a Gradio front end, you can run it with AI Workbench without any complex setup. +This Retrieval Augmented Generation (RAG) application uses an agentic approach to combine web search, hallucination controls and accuracy checks with RAG. It has a friendly Gradio front end, you can run it with AI Workbench without any complex setup. > **Note** ->This app runs in [NVIDIA AI Workbench](https://docs.nvidia.com/ai-workbench/user-guide/latest/overview/introduction.html). Workbench is a lightweight developer platform that you can run on your own systems for free. It's an easy way to get up and running with complex AI applications and workloads in a short amount of time. +>This app runs in [NVIDIA AI Workbench](https://docs.nvidia.com/ai-workbench/user-guide/latest/overview/introduction.html). It's a free, lightweight developer platform that you can run on your own systems. It's an easy way to get up and running with complex AI applications and workloads in a short amount of time. > The only requirement is curiosity. You don't need to be a developer or an expert. @@ -35,13 +35,16 @@ The diagram **below** shows this agentic flow. #### Modifying the Application -* You can change the prompts for the different components, e.g. the hallucination grader, directly within the front end. -* You can change the webpages and pdfs you want to use for the context in the RAG. -* You can select different endpoints for the inference components. - * Cloud endpoints using endpoints from [build.nvidia.com](https://build.nvidia.com/explore/discover) +* Directly within the app you can: + * Change the prompts for the different components, e.g. the hallucination grader, directly within the front end. + * Change the webpages and pdfs you want to use for the context in the RAG. + * Select different endpoints rom [build.nvidia.com](https://build.nvidia.com/explore/discover) for the inference components. +* You can also use self-hosted endpoints that you setup yourself. * *Advanced: Self-hosted endpoints using [NVIDIA Inference Microservices (NIMs)](https://catalog.ngc.nvidia.com/orgs/nim/teams/meta/containers/llama3-8b-instruct/tags)* * *Advanced: Self-hosted endpoints with Ollama.* -* You can modify the application code to add new models, add your own endpoints, change the Gradio interface, or even change the logic. +* You can also modify the application code to: + * Add new endpoints and endpoint providers + * Change the Gradio interface or the application. 
## Get Started From 33cf715f8fe6ad672a0449152371227ea06449ce Mon Sep 17 00:00:00 2001 From: Tyler Whitehouse Date: Sat, 8 Mar 2025 12:47:30 -0500 Subject: [PATCH 23/41] Modified README.md - Modified files: README.md --- README.md | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 48887a4..cf05eb5 100644 --- a/README.md +++ b/README.md @@ -36,12 +36,12 @@ The diagram **below** shows this agentic flow. #### Modifying the Application * Directly within the app you can: - * Change the prompts for the different components, e.g. the hallucination grader, directly within the front end. + * Change the prompts for the different components, e.g. the hallucination grader. * Change the webpages and pdfs you want to use for the context in the RAG. - * Select different endpoints rom [build.nvidia.com](https://build.nvidia.com/explore/discover) for the inference components. + * Select different endpoints from [build.nvidia.com](https://build.nvidia.com/explore/discover) for the inference components. * You can also use self-hosted endpoints that you setup yourself. - * *Advanced: Self-hosted endpoints using [NVIDIA Inference Microservices (NIMs)](https://catalog.ngc.nvidia.com/orgs/nim/teams/meta/containers/llama3-8b-instruct/tags)* - * *Advanced: Self-hosted endpoints with Ollama.* + * *Advanced: Self-hosted endpoint with [NVIDIA Inference Microservices (NIMs)](https://catalog.ngc.nvidia.com/orgs/nim/teams/meta/containers/llama3-8b-instruct/tags)* + * *Advanced: Self-hosted endpoint with [Ollama](https://hub.docker.com/r/ollama/ollama).* * You can also modify the application code to: * Add new endpoints and endpoint providers * Change the Gradio interface or the application. @@ -82,6 +82,18 @@ The diagram **below** shows this agentic flow. ## Deep Dive #### Using a Self-Hosted Endpoint +If you want to run your own models on your own GPUs, you will need to setup containerized endpoints and then connect them to the RAG application. +This requires some manual steps, but if you are relatively familiar with containers and NVIDIA software, it shouldn't be too bad. + +#### Prerequisites for Using a Self-Hosted Endpoint + +1. You need a remote system with a sufficient GPU that you have SSH access to. + +2. The remote should have the following installed: + * Ubuntu 22.04 or higher + * The latest NVIDIA drivers + * The latest version of Docker + * The latest version of the NVIDIA Container Toolkit @@ -102,7 +114,6 @@ The diagram **below** shows this agentic flow. - # License This NVIDIA AI Workbench example project is under the [Apache 2.0 License](https://github.com/NVIDIA/workbench-example-agentic-rag/blob/main/LICENSE.txt) From d012ace535e68385a3594dbc4c30bcb5ce336663 Mon Sep 17 00:00:00 2001 From: JT Casablanca Date: Sat, 8 Mar 2025 19:04:05 -0500 Subject: [PATCH 24/41] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index cf05eb5..bf20df5 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ # Overview: An Easy Button for Agentic RAG -This Retrieval Augmented Generation (RAG) application uses an agentic approach to combine web search, hallucination controls and accuracy checks with RAG. It has a friendly Gradio front end, you can run it with AI Workbench without any complex setup. +This Retrieval Augmented Generation (RAG) application uses an agentic approach to combine web search, hallucination controls and accuracy checks with RAG. 
It's easy to modify because its a simple Gradio app. > **Note** >This app runs in [NVIDIA AI Workbench](https://docs.nvidia.com/ai-workbench/user-guide/latest/overview/introduction.html). It's a free, lightweight developer platform that you can run on your own systems. It's an easy way to get up and running with complex AI applications and workloads in a short amount of time. From fe4748ba5d73b0536014582aeed4de7d0bde1269 Mon Sep 17 00:00:00 2001 From: JT Casablanca Date: Sat, 8 Mar 2025 19:04:46 -0500 Subject: [PATCH 25/41] Update README.md --- README.md | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index bf20df5..8974a44 100644 --- a/README.md +++ b/README.md @@ -4,8 +4,10 @@ This Retrieval Augmented Generation (RAG) application uses an agentic approach t > **Note** >This app runs in [NVIDIA AI Workbench](https://docs.nvidia.com/ai-workbench/user-guide/latest/overview/introduction.html). It's a free, lightweight developer platform that you can run on your own systems. It's an easy way to get up and running with complex AI applications and workloads in a short amount of time. -> The only requirement is curiosity. You don't need to be a developer or an expert. +> **Note:** +> You may want to [**fork**](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/fork-a-repo#forking-a-repository) this repository into your own account before proceeding. Otherwise you won't be able to fully retain any changes you make because this NVIDIA owned repository is **read-only**. +
*Navigating the README:* [Application Overview](#the-agentic-rag-application) | [Get Started](#get-started) | [Deep Dive](#deep-dive) | [Self-Hosted Sizing Guide](#self-hosted-sizing-guide) | [License](#license) @@ -14,12 +16,6 @@ This Retrieval Augmented Generation (RAG) application uses an agentic approach t
- -> **Note:** -> You may want to [**fork**](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/fork-a-repo#forking-a-repository) this repository into your own account before proceeding. Otherwise you won't be able to fully retain any changes you make because this NVIDIA owned repository is **read-only**. - -
- ## The Agentic RAG Application #### Using the Application 1. You embed your documents (pdfs or webpages) to the vector database. From 8894b5ec313c1ca5c61892ded9aeefa5ef9140bc Mon Sep 17 00:00:00 2001 From: JT Casablanca Date: Sat, 8 Mar 2025 19:05:10 -0500 Subject: [PATCH 26/41] Update README.md --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 8974a44..f8356a4 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,6 @@ This Retrieval Augmented Generation (RAG) application uses an agentic approach t > **Note** >This app runs in [NVIDIA AI Workbench](https://docs.nvidia.com/ai-workbench/user-guide/latest/overview/introduction.html). It's a free, lightweight developer platform that you can run on your own systems. It's an easy way to get up and running with complex AI applications and workloads in a short amount of time. -> **Note:** > You may want to [**fork**](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/fork-a-repo#forking-a-repository) this repository into your own account before proceeding. Otherwise you won't be able to fully retain any changes you make because this NVIDIA owned repository is **read-only**.
From 115d5e956edc40dff09e2661d6930de8c5ec9593 Mon Sep 17 00:00:00 2001 From: JT Casablanca Date: Sat, 8 Mar 2025 19:05:30 -0500 Subject: [PATCH 27/41] Update README.md --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index f8356a4..0417352 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,6 @@ This Retrieval Augmented Generation (RAG) application uses an agentic approach t *Other Resources:* [:arrow_down: Download AI Workbench](https://www.nvidia.com/en-us/deep-learning-ai/solutions/data-science/workbench/) | [:book: User Guide](https://docs.nvidia.com/ai-workbench/) |[:open_file_folder: Other Projects](https://docs.nvidia.com/ai-workbench/user-guide/latest/quickstart/example-projects.html) | [:rotating_light: User Forum](https://forums.developer.nvidia.com/t/support-workbench-example-project-agentic-rag/303414) -
## The Agentic RAG Application #### Using the Application From 4d35ca92f923d6e8303201add460201ecdab7dd3 Mon Sep 17 00:00:00 2001 From: JT Casablanca Date: Sat, 8 Mar 2025 19:07:48 -0500 Subject: [PATCH 28/41] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 0417352..45c7959 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ This Retrieval Augmented Generation (RAG) application uses an agentic approach to combine web search, hallucination controls and accuracy checks with RAG. It's easy to modify because its a simple Gradio app. > **Note** ->This app runs in [NVIDIA AI Workbench](https://docs.nvidia.com/ai-workbench/user-guide/latest/overview/introduction.html). It's a free, lightweight developer platform that you can run on your own systems. It's an easy way to get up and running with complex AI applications and workloads in a short amount of time. +>This app runs in [NVIDIA AI Workbench](https://docs.nvidia.com/ai-workbench/user-guide/latest/overview/introduction.html). It's a free, lightweight developer platform that you can run on your own systems to get up and running with complex AI applications and workloads in a short amount of time. > You may want to [**fork**](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/fork-a-repo#forking-a-repository) this repository into your own account before proceeding. Otherwise you won't be able to fully retain any changes you make because this NVIDIA owned repository is **read-only**. From 82e3a6299a0fbf8e9ba1979beb2e0db8264132e4 Mon Sep 17 00:00:00 2001 From: Tyler Whitehouse Date: Sat, 8 Mar 2025 20:38:33 -0500 Subject: [PATCH 29/41] Minor changes to readme - Modified files: README.md, code/output.log --- README.md | 23 +++++++++++++++-------- code/output.log | 14 -------------- 2 files changed, 15 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index cf05eb5..289b10b 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,9 @@ This Retrieval Augmented Generation (RAG) application uses an agentic approach t ## The Agentic RAG Application #### Using the Application 1. You embed your documents (pdfs or webpages) to the vector database. -2. You configure the prompts for the different components, e.g. the router or retrieval grader. +2. You configure each of the separate components for the pipeline. For each component you can: + * Select from a drop down of endpoints or use a self-hosted endpoint. + * Modify the prompt. 3. You submit your query. 4. An LLM evaluates your query for relevance to the index and then routes it to the DB or to search by [Tavily](https://tavily.com/). 5. Answers are checked for hallucination and relevance. "Failing"" answers are run through the process again. @@ -39,16 +41,18 @@ The diagram **below** shows this agentic flow. * Change the prompts for the different components, e.g. the hallucination grader. * Change the webpages and pdfs you want to use for the context in the RAG. * Select different endpoints from [build.nvidia.com](https://build.nvidia.com/explore/discover) for the inference components. -* You can also use self-hosted endpoints that you setup yourself. 
- * *Advanced: Self-hosted endpoint with [NVIDIA Inference Microservices (NIMs)](https://catalog.ngc.nvidia.com/orgs/nim/teams/meta/containers/llama3-8b-instruct/tags)* - * *Advanced: Self-hosted endpoint with [Ollama](https://hub.docker.com/r/ollama/ollama).* + * Configure it to use self-hosted endpoints with [NVIDIA Inference Microservices (NIMs)](https://catalog.ngc.nvidia.com/orgs/nim/teams/meta/containers/llama3-8b-instruct/tags) or [Ollama](https://hub.docker.com/r/ollama/ollama). * You can also modify the application code to: * Add new endpoints and endpoint providers * Change the Gradio interface or the application. +> **Note** * Setting up self-hosted endpoints is relatively advanced because you will need to do it manually. + ## Get Started -#### Prerequisites for Using build.nvidia.com Endpoints +The quickest path is with the pre-configured build.nvidia.com endpoints. + +#### Prerequisites for Using Pre-configured Endpoints 1. Install [AI Workbench](https://docs.nvidia.com/ai-workbench/user-guide/latest/installation/overview.html). @@ -81,8 +85,13 @@ The diagram **below** shows this agentic flow. 6. Start chatting. ## Deep Dive + +> **Note** This assumes you've done the **Get Started** steps. + #### Using a Self-Hosted Endpoint -If you want to run your own models on your own GPUs, you will need to setup containerized endpoints and then connect them to the RAG application. + +Each component in the pipeline can be configured to use a self-hosted endpoint. This lets you use your own models on your own GPUs, but requires you to setup endpoints and then connect them to the RAG application. + This requires some manual steps, but if you are relatively familiar with containers and NVIDIA software, it shouldn't be too bad. #### Prerequisites for Using a Self-Hosted Endpoint @@ -96,8 +105,6 @@ This requires some manual steps, but if you are relatively familiar with contain * The latest version of the NVIDIA Container Toolkit - - ## Self-Hosted Sizing Guide | GPU VRAM | Example Hardware | Compatible? | diff --git a/code/output.log b/code/output.log index 5812432..3af4e0c 100644 --- a/code/output.log +++ b/code/output.log @@ -4,17 +4,3 @@ Running on local URL: http://0.0.0.0:8080 To create a public link, set `share=True` in `launch()`. IMPORTANT: You are using gradio version 4.15.0, however version 4.44.1 is available, please upgrade. -------- ----ROUTE QUESTION--- -tell me about agents. -{'datasource': 'vectorstore'} ----ROUTE QUESTION TO RAG--- ----RETRIEVE--- ----CHECK DOCUMENT RELEVANCE TO QUESTION--- ----ASSESS GRADED DOCUMENTS--- ----DECISION: ALL DOCUMENTS ARE NOT RELEVANT TO QUESTION, INCLUDE WEB SEARCH--- ----WEB SEARCH--- ----GENERATE--- ----CHECK HALLUCINATIONS--- ----DECISION: GENERATION IS GROUNDED IN DOCUMENTS--- ----GRADE GENERATION vs QUESTION--- ----DECISION: GENERATION ADDRESSES QUESTION--- From 9227b3f790ff959b08790b87d9a8a70a3dba0550 Mon Sep 17 00:00:00 2001 From: Tyler Whitehouse Date: Sun, 9 Mar 2025 12:26:30 -0400 Subject: [PATCH 30/41] Minor changes. - Modified files: README.md --- README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 4130267..5315635 100644 --- a/README.md +++ b/README.md @@ -38,9 +38,9 @@ The diagram **below** shows this agentic flow. * Configure it to use self-hosted endpoints with [NVIDIA Inference Microservices (NIMs)](https://catalog.ngc.nvidia.com/orgs/nim/teams/meta/containers/llama3-8b-instruct/tags) or [Ollama](https://hub.docker.com/r/ollama/ollama). 
* You can also modify the application code to: * Add new endpoints and endpoint providers - * Change the Gradio interface or the application. + * Change the Gradio interface or the application structure and logic. -> **Note** * Setting up self-hosted endpoints is relatively advanced because you will need to do it manually. +> **Note** Setting up self-hosted endpoints is relatively advanced because you will need to do it manually. ## Get Started @@ -84,9 +84,7 @@ The quickest path is with the pre-configured build.nvidia.com endpoints. #### Using a Self-Hosted Endpoint -Each component in the pipeline can be configured to use a self-hosted endpoint. This lets you use your own models on your own GPUs, but requires you to setup endpoints and then connect them to the RAG application. - -This requires some manual steps, but if you are relatively familiar with containers and NVIDIA software, it shouldn't be too bad. +Each pipeline component can be configured to use a self-hosted endpoint. You can run your own models on your own GPUs, but you'll need to set up the endpoints first. This works best if you know how to use containers and install NVIDIA software on Ubuntu. #### Prerequisites for Using a Self-Hosted Endpoint @@ -98,6 +96,8 @@ This requires some manual steps, but if you are relatively familiar with contain * The latest version of Docker * The latest version of the NVIDIA Container Toolkit +3. + ## Self-Hosted Sizing Guide From 9bb32fd2f38796a2c7393cf70fd4cad374a3dade Mon Sep 17 00:00:00 2001 From: Tyler Whitehouse Date: Sun, 9 Mar 2025 12:30:59 -0400 Subject: [PATCH 31/41] Added nim support and disk size files to drive gpu selection UI - Added files: code/nim_disk_size.json, code/nim_gpu_support_matrix.json --- code/nim_disk_size.json | 288 +++++++++++++ code/nim_gpu_support_matrix.json | 675 +++++++++++++++++++++++++++++++ 2 files changed, 963 insertions(+) create mode 100644 code/nim_disk_size.json create mode 100644 code/nim_gpu_support_matrix.json diff --git a/code/nim_disk_size.json b/code/nim_disk_size.json new file mode 100644 index 0000000..8de0ae0 --- /dev/null +++ b/code/nim_disk_size.json @@ -0,0 +1,288 @@ +{ + "codellama-13b-instruct": { + "disk_space": { + "H100": { + "fp16": { + "throughput": 24.63, + "latency": 25.32 + } + }, + "A100 80GB": { + "fp16": { + "throughput": 24.63, + "latency": 25.31 + } + }, + "A100 40GB": { + "fp16": { + "throughput": 24.63, + "latency": 25.31 + } + }, + "L40S": { + "fp16": { + "throughput": 25.32, + "latency": 24.63 + } + }, + "A10G": { + "fp16": { + "throughput": 25.32, + "latency": 26.69 + } + } + } + }, + "codellama-34b-instruct": { + "disk_space": { + "H100": { + "fp8": { + "throughput": 32.17, + "latency": 32.42 + }, + "fp16": { + "throughput": 63.48, + "latency": 64.59 + } + }, + "A100 80GB": { + "fp16": { + "throughput": 63.48, + "latency": 64.59 + } + }, + "A100 40GB": { + "fp16": { + "throughput": 63.48, + "latency": 64.59 + } + }, + "L40S": { + "fp8": { + "throughput": 32.42 + }, + "fp16": { + "throughput": 64.58 + } + }, + "A10G": { + "fp16": { + "throughput": 64.58, + "latency": 66.8 + } + } + } + }, + "codellama-70b": { + "disk_space": { + "H100": { + "fp8": { + "throughput": 65.47, + "latency": 66.37 + }, + "fp16": { + "throughput": 130.35, + "latency": 66.37 + } + }, + "A100 80GB": { + "fp16": { + "throughput": 130.35, + "latency": 132.71 + } + }, + "A100 40GB": { + "fp16": { + "throughput": 130.35, + "latency": 132.71 + } + }, + "A10G": { + "fp16": { + "throughput": 132.69 + } + } + } + }, + "deepseek-r1-distill-llama-8b-rtx": 
{ + "disk_space": { + "RTX 6000 Ada": { + "int4_awq": { + "throughput": 5.42 + } + }, + "RTX 5090": { + "int4_awq": { + "throughput": 5.42 + } + }, + "RTX 5080": { + "int4_awq": { + "throughput": 5.42 + } + }, + "RTX 4090": { + "int4_awq": { + "throughput": 5.42 + } + }, + "RTX 4080": { + "int4_awq": { + "throughput": 5.42 + } + } + } + }, + "phind-codellama-34b-v2-instruct": { + "disk_space": { + "H100": { + "fp8": { + "throughput": 32.17, + "latency": 32.41 + }, + "fp16": { + "throughput": 63.48, + "latency": 64.59 + } + }, + "A100 80GB": { + "fp16": { + "throughput": 63.48, + "latency": 64.59 + } + }, + "A100 40GB": { + "fp16": { + "throughput": 63.48, + "latency": 64.59 + } + }, + "L40S": { + "fp8": { + "throughput": 32.43 + }, + "fp16": { + "throughput": 64.58 + } + }, + "A10G": { + "fp16": { + "latency": 66.8 + } + } + } + }, + "mixtral-8x7b-instruct-v0.1": { + "disk_space": { + "H100": { + "fp8": { + "throughput": 65.47, + "latency": 66.37 + }, + "fp16": { + "throughput": 130.35, + "latency": 132.71 + } + }, + "A100 80GB": { + "fp16": { + "throughput": 130.35, + "latency": 132.71 + } + }, + "A100 40GB": { + "fp16": { + "throughput": 130.35, + "latency": 132.71 + } + } + } + }, + "mixtral-8x22b-instruct-v0.1": { + "disk_space": { + "H100": { + "fp8": { + "throughput": 130.94, + "latency": 132.74 + }, + "fp16": { + "throughput": 260.7, + "latency": 265.42 + } + }, + "A100 80GB": { + "fp16": { + "throughput": 260.7, + "latency": 265.42 + } + }, + "A100 40GB": { + "fp16": { + "throughput": 260.7, + "latency": 265.42 + } + } + } + }, + "llama-3.1-8b-instruct": { + "disk_space": { + "H100": { + "fp8": { + "throughput": 3.8 + }, + "fp16": { + "throughput": 7.14 + } + }, + "A100 80GB": { + "fp16": { + "throughput": 7.14 + } + }, + "A100 40GB": { + "fp16": { + "throughput": 7.14 + } + }, + "L40S": { + "fp8": { + "throughput": 3.8 + }, + "fp16": { + "throughput": 7.14 + } + }, + "A10G": { + "fp16": { + "throughput": 7.14 + } + } + } + }, + "llama-3.1-70b-instruct": { + "disk_space": { + "H100": { + "fp8": { + "throughput": 65.47, + "latency": 66.37 + }, + "fp16": { + "throughput": 130.35, + "latency": 132.71 + } + }, + "A100 80GB": { + "fp16": { + "throughput": 130.35, + "latency": 132.71 + } + }, + "A100 40GB": { + "fp16": { + "throughput": 130.35, + "latency": 132.71 + } + } + } + } +} diff --git a/code/nim_gpu_support_matrix.json b/code/nim_gpu_support_matrix.json new file mode 100644 index 0000000..730119d --- /dev/null +++ b/code/nim_gpu_support_matrix.json @@ -0,0 +1,675 @@ +{ + "H100": { + "1": { + "models": [ + "llama-3.1-8b-instruct", + "llama3-8b-instruct", + "mistral-7b-instruct-v0.3", + "mistral-nemo-12b-instruct", + "gemma-2-9b-it", + "phi-3-mini-4k-instruct", + "llama-3.1-swallow-8b-instruct-v0.1", + "qwen2.5-7b-instruct" + ] + }, + "2": { + "models": [ + "llama-3.3-70b-instruct", + "llama-3.1-70b-instruct", + "llama3-70b-instruct", + "codellama-70b", + "mixtral-8x7b-instruct-v0.1", + "llama-3.1-nemotron-70b-instruct", + "llama-3.1-swallow-70b-instruct-v0.1", + "llama-3-swallow-70b-instruct-v0.1", + "llama-3-taiwan-70b-instruct", + "llama-3.1-8b-instruct", + "llama3-8b-instruct", + "mistral-7b-instruct-v0.3", + "mistral-nemo-12b-instruct", + "gemma-2-9b-it", + "phi-3-mini-4k-instruct", + "llama-3.1-swallow-8b-instruct-v0.1", + "qwen2.5-7b-instruct" + ] + }, + "4": { + "models": [ + "mixtral-8x22b-instruct-v0.1", + "llama-3.3-70b-instruct", + "llama-3.1-70b-instruct", + "llama3-70b-instruct", + "codellama-70b", + "mixtral-8x7b-instruct-v0.1", + "llama-3.1-nemotron-70b-instruct", 
+ "llama-3.1-swallow-70b-instruct-v0.1", + "llama-3-swallow-70b-instruct-v0.1", + "llama-3-taiwan-70b-instruct", + "llama-3.1-8b-instruct", + "llama3-8b-instruct", + "mistral-7b-instruct-v0.3", + "mistral-nemo-12b-instruct", + "gemma-2-9b-it", + "phi-3-mini-4k-instruct", + "llama-3.1-swallow-8b-instruct-v0.1", + "qwen2.5-7b-instruct" + ] + }, + "8": { + "models": [ + "llama-3.1-405b-instruct", + "mixtral-8x22b-instruct-v0.1", + "llama-3.3-70b-instruct", + "llama-3.1-70b-instruct", + "llama3-70b-instruct", + "codellama-70b", + "mixtral-8x7b-instruct-v0.1", + "llama-3.1-nemotron-70b-instruct", + "llama-3.1-swallow-70b-instruct-v0.1", + "llama-3-swallow-70b-instruct-v0.1", + "llama-3-taiwan-70b-instruct", + "llama-3.1-8b-instruct", + "llama3-8b-instruct", + "mistral-7b-instruct-v0.3", + "mistral-nemo-12b-instruct", + "gemma-2-9b-it", + "phi-3-mini-4k-instruct", + "llama-3.1-swallow-8b-instruct-v0.1", + "qwen2.5-7b-instruct" + ] + }, + "16": { + "models": [ + "deepseek-r1", + "llama-3.1-405b-instruct", + "mixtral-8x22b-instruct-v0.1", + "llama-3.3-70b-instruct", + "llama-3.1-70b-instruct", + "llama3-70b-instruct", + "codellama-70b", + "mixtral-8x7b-instruct-v0.1", + "llama-3.1-nemotron-70b-instruct", + "llama-3.1-swallow-70b-instruct-v0.1", + "llama-3-swallow-70b-instruct-v0.1", + "llama-3-taiwan-70b-instruct", + "llama-3.1-8b-instruct", + "llama3-8b-instruct", + "mistral-7b-instruct-v0.3", + "mistral-nemo-12b-instruct", + "gemma-2-9b-it", + "phi-3-mini-4k-instruct", + "llama-3.1-swallow-8b-instruct-v0.1", + "qwen2.5-7b-instruct" + ] + } + }, + "A100 80GB": { + "1": { + "models": [ + "llama-3.1-8b-instruct", + "llama3-8b-instruct", + "mistral-7b-instruct-v0.3", + "mistral-nemo-12b-instruct", + "gemma-2-9b-it", + "phi-3-mini-4k-instruct", + "llama-3.1-swallow-8b-instruct-v0.1", + "qwen2.5-7b-instruct" + ] + }, + "2": { + "models": [ + "llama-3.3-70b-instruct", + "llama-3.1-70b-instruct", + "llama3-70b-instruct", + "codellama-70b", + "mixtral-8x7b-instruct-v0.1", + "llama-3.1-nemotron-70b-instruct", + "llama-3.1-swallow-70b-instruct-v0.1", + "llama-3-swallow-70b-instruct-v0.1", + "llama-3-taiwan-70b-instruct", + "llama-3.1-8b-instruct", + "llama3-8b-instruct", + "mistral-7b-instruct-v0.3", + "mistral-nemo-12b-instruct", + "gemma-2-9b-it", + "phi-3-mini-4k-instruct", + "llama-3.1-swallow-8b-instruct-v0.1", + "qwen2.5-7b-instruct" + ] + }, + "4": { + "models": [ + "mixtral-8x22b-instruct-v0.1", + "llama-3.3-70b-instruct", + "llama-3.1-70b-instruct", + "llama3-70b-instruct", + "codellama-70b", + "mixtral-8x7b-instruct-v0.1", + "llama-3.1-nemotron-70b-instruct", + "llama-3.1-swallow-70b-instruct-v0.1", + "llama-3-swallow-70b-instruct-v0.1", + "llama-3-taiwan-70b-instruct", + "llama-3.1-8b-instruct", + "llama3-8b-instruct", + "mistral-7b-instruct-v0.3", + "mistral-nemo-12b-instruct", + "gemma-2-9b-it", + "phi-3-mini-4k-instruct", + "llama-3.1-swallow-8b-instruct-v0.1", + "qwen2.5-7b-instruct" + ] + }, + "8": { + "models": [ + "llama-3.1-405b-instruct", + "mixtral-8x22b-instruct-v0.1", + "llama-3.3-70b-instruct", + "llama-3.1-70b-instruct", + "llama3-70b-instruct", + "codellama-70b", + "mixtral-8x7b-instruct-v0.1", + "llama-3.1-nemotron-70b-instruct", + "llama-3.1-swallow-70b-instruct-v0.1", + "llama-3-swallow-70b-instruct-v0.1", + "llama-3-taiwan-70b-instruct", + "llama-3.1-8b-instruct", + "llama3-8b-instruct", + "mistral-7b-instruct-v0.3", + "mistral-nemo-12b-instruct", + "gemma-2-9b-it", + "phi-3-mini-4k-instruct", + "llama-3.1-swallow-8b-instruct-v0.1", + "qwen2.5-7b-instruct" + ] + }, + "16": { + 
"models": [ + "deepseek-r1", + "llama-3.1-405b-instruct", + "mixtral-8x22b-instruct-v0.1", + "llama-3.3-70b-instruct", + "llama-3.1-70b-instruct", + "llama3-70b-instruct", + "codellama-70b", + "mixtral-8x7b-instruct-v0.1", + "llama-3.1-nemotron-70b-instruct", + "llama-3.1-swallow-70b-instruct-v0.1", + "llama-3-swallow-70b-instruct-v0.1", + "llama-3-taiwan-70b-instruct", + "llama-3.1-8b-instruct", + "llama3-8b-instruct", + "mistral-7b-instruct-v0.3", + "mistral-nemo-12b-instruct", + "gemma-2-9b-it", + "phi-3-mini-4k-instruct", + "llama-3.1-swallow-8b-instruct-v0.1", + "qwen2.5-7b-instruct" + ] + } + }, + "A100 40GB": { + "1": { + "models": [ + "llama-3.1-8b-instruct", + "llama3-8b-instruct", + "mistral-7b-instruct-v0.3", + "mistral-nemo-12b-instruct", + "gemma-2-9b-it", + "phi-3-mini-4k-instruct", + "llama-3.1-swallow-8b-instruct-v0.1", + "qwen2.5-7b-instruct" + ] + }, + "2": { + "models": [ + "llama-3.1-8b-instruct", + "llama3-8b-instruct", + "mistral-7b-instruct-v0.3", + "mistral-nemo-12b-instruct", + "gemma-2-9b-it", + "phi-3-mini-4k-instruct", + "llama-3.1-swallow-8b-instruct-v0.1", + "qwen2.5-7b-instruct" + ] + }, + "4": { + "models": [ + "mixtral-8x7b-instruct-v0.1", + "llama-3.3-70b-instruct", + "llama-3.1-70b-instruct", + "llama3-70b-instruct", + "codellama-70b", + "mixtral-8x7b-instruct-v0.1", + "llama-3.1-nemotron-70b-instruct", + "llama-3.1-swallow-70b-instruct-v0.1", + "llama-3-swallow-70b-instruct-v0.1", + "llama-3-taiwan-70b-instruct", + "llama-3.1-8b-instruct", + "llama3-8b-instruct", + "mistral-7b-instruct-v0.3", + "mistral-nemo-12b-instruct", + "gemma-2-9b-it", + "phi-3-mini-4k-instruct", + "llama-3.1-swallow-8b-instruct-v0.1", + "qwen2.5-7b-instruct" + ] + }, + "8": { + "models": [ + "mixtral-8x22b-instruct-v0.1", + "llama-3.3-70b-instruct", + "llama-3.1-70b-instruct", + "llama3-70b-instruct", + "codellama-70b", + "mixtral-8x7b-instruct-v0.1", + "llama-3.1-nemotron-70b-instruct", + "llama-3.1-swallow-70b-instruct-v0.1", + "llama-3-swallow-70b-instruct-v0.1", + "llama-3-taiwan-70b-instruct", + "llama-3.1-8b-instruct", + "llama3-8b-instruct", + "mistral-7b-instruct-v0.3", + "mistral-nemo-12b-instruct", + "gemma-2-9b-it", + "phi-3-mini-4k-instruct", + "llama-3.1-swallow-8b-instruct-v0.1", + "qwen2.5-7b-instruct" + ] + }, + "16": { + "models": [ + "mixtral-8x22b-instruct-v0.1", + "llama-3.3-70b-instruct", + "llama-3.1-70b-instruct", + "llama3-70b-instruct", + "codellama-70b", + "mixtral-8x7b-instruct-v0.1", + "llama-3.1-nemotron-70b-instruct", + "llama-3.1-swallow-70b-instruct-v0.1", + "llama-3-swallow-70b-instruct-v0.1", + "llama-3-taiwan-70b-instruct", + "llama-3.1-8b-instruct", + "llama3-8b-instruct", + "mistral-7b-instruct-v0.3", + "mistral-nemo-12b-instruct", + "gemma-2-9b-it", + "phi-3-mini-4k-instruct", + "llama-3.1-swallow-8b-instruct-v0.1", + "qwen2.5-7b-instruct" + ] + } + }, + "L40S": { + "1": { + "models": [ + "llama-3.1-8b-instruct", + "llama3-8b-instruct", + "mistral-7b-instruct-v0.3", + "mistral-nemo-12b-instruct", + "gemma-2-9b-it", + "phi-3-mini-4k-instruct", + "llama-3.1-swallow-8b-instruct-v0.1", + "qwen2.5-7b-instruct" + ] + }, + "2": { + "models": [ + "mixtral-8x7b-instruct-v0.1", + "llama-3.1-8b-instruct", + "llama3-8b-instruct", + "mistral-7b-instruct-v0.3", + "mistral-nemo-12b-instruct", + "gemma-2-9b-it", + "phi-3-mini-4k-instruct", + "llama-3.1-swallow-8b-instruct-v0.1", + "qwen2.5-7b-instruct" + ] + }, + "4": { + "models": [ + "llama-3.3-70b-instruct", + "llama-3.1-70b-instruct", + "llama3-70b-instruct", + "codellama-70b", + 
"mixtral-8x7b-instruct-v0.1", + "llama-3.1-nemotron-70b-instruct", + "llama-3.1-swallow-70b-instruct-v0.1", + "llama-3-swallow-70b-instruct-v0.1", + "llama-3-taiwan-70b-instruct", + "llama-3.1-8b-instruct", + "llama3-8b-instruct", + "mistral-7b-instruct-v0.3", + "mistral-nemo-12b-instruct", + "gemma-2-9b-it", + "phi-3-mini-4k-instruct", + "llama-3.1-swallow-8b-instruct-v0.1", + "qwen2.5-7b-instruct" + ] + }, + "8": { + "models": [ + "mixtral-8x22b-instruct-v0.1", + "llama-3.3-70b-instruct", + "llama-3.1-70b-instruct", + "llama3-70b-instruct", + "codellama-70b", + "mixtral-8x7b-instruct-v0.1", + "llama-3.1-nemotron-70b-instruct", + "llama-3.1-swallow-70b-instruct-v0.1", + "llama-3-swallow-70b-instruct-v0.1", + "llama-3-taiwan-70b-instruct", + "llama-3.1-8b-instruct", + "llama3-8b-instruct", + "mistral-7b-instruct-v0.3", + "mistral-nemo-12b-instruct", + "gemma-2-9b-it", + "phi-3-mini-4k-instruct", + "llama-3.1-swallow-8b-instruct-v0.1", + "qwen2.5-7b-instruct" + ] + }, + "16": { + "models": [ + "mixtral-8x22b-instruct-v0.1", + "llama-3.3-70b-instruct", + "llama-3.1-70b-instruct", + "llama3-70b-instruct", + "codellama-70b", + "mixtral-8x7b-instruct-v0.1", + "llama-3.1-nemotron-70b-instruct", + "llama-3.1-swallow-70b-instruct-v0.1", + "llama-3-swallow-70b-instruct-v0.1", + "llama-3-taiwan-70b-instruct", + "llama-3.1-8b-instruct", + "llama3-8b-instruct", + "mistral-7b-instruct-v0.3", + "mistral-nemo-12b-instruct", + "gemma-2-9b-it", + "phi-3-mini-4k-instruct", + "llama-3.1-swallow-8b-instruct-v0.1", + "qwen2.5-7b-instruct" + ] + } + }, + "A10G": { + "1": { + "models": [ + "llama-3.1-8b-instruct", + "llama3-8b-instruct", + "mistral-7b-instruct-v0.3", + "mistral-nemo-12b-instruct", + "gemma-2-9b-it", + "phi-3-mini-4k-instruct", + "llama-3.1-swallow-8b-instruct-v0.1", + "qwen2.5-7b-instruct" + ] + }, + "2": { + "models": [ + "llama-3.1-8b-instruct", + "llama3-8b-instruct", + "mistral-7b-instruct-v0.3", + "mistral-nemo-12b-instruct", + "gemma-2-9b-it", + "phi-3-mini-4k-instruct", + "llama-3.1-swallow-8b-instruct-v0.1", + "qwen2.5-7b-instruct" + ] + }, + "4": { + "models": [ + "llama-3.1-8b-instruct", + "llama3-8b-instruct", + "mistral-7b-instruct-v0.3", + "mistral-nemo-12b-instruct", + "gemma-2-9b-it", + "phi-3-mini-4k-instruct", + "llama-3.1-swallow-8b-instruct-v0.1", + "qwen2.5-7b-instruct" + ] + }, + "8": { + "models": [ + "llama-3.3-70b-instruct", + "llama-3.1-70b-instruct", + "llama3-70b-instruct", + "codellama-70b", + "mixtral-8x7b-instruct-v0.1", + "llama-3.1-nemotron-70b-instruct", + "llama-3.1-swallow-70b-instruct-v0.1", + "llama-3-swallow-70b-instruct-v0.1", + "llama-3-taiwan-70b-instruct", + "llama-3.1-8b-instruct", + "llama3-8b-instruct", + "mistral-7b-instruct-v0.3", + "mistral-nemo-12b-instruct", + "gemma-2-9b-it", + "phi-3-mini-4k-instruct", + "llama-3.1-swallow-8b-instruct-v0.1", + "qwen2.5-7b-instruct" + ] + }, + "16": { + "models": [ + "mixtral-8x22b-instruct-v0.1", + "llama-3.3-70b-instruct", + "llama-3.1-70b-instruct", + "llama3-70b-instruct", + "codellama-70b", + "mixtral-8x7b-instruct-v0.1", + "llama-3.1-nemotron-70b-instruct", + "llama-3.1-swallow-70b-instruct-v0.1", + "llama-3-swallow-70b-instruct-v0.1", + "llama-3-taiwan-70b-instruct", + "llama-3.1-8b-instruct", + "llama3-8b-instruct", + "mistral-7b-instruct-v0.3", + "mistral-nemo-12b-instruct", + "gemma-2-9b-it", + "phi-3-mini-4k-instruct", + "llama-3.1-swallow-8b-instruct-v0.1", + "qwen2.5-7b-instruct" + ] + } + }, + "RTX 6000 Ada": { + "1": { + "models": [ + "llama-3.1-8b-instruct", + "llama3-8b-instruct", + 
"mistral-7b-instruct-v0.3", + "mistral-nemo-12b-instruct", + "gemma-2-9b-it", + "phi-3-mini-4k-instruct", + "llama-3.1-swallow-8b-instruct-v0.1", + "qwen2.5-7b-instruct" + ] + }, + "2": { + "models": [ + "mixtral-8x7b-instruct-v0.1", + "llama-3.1-8b-instruct", + "llama3-8b-instruct", + "mistral-7b-instruct-v0.3", + "mistral-nemo-12b-instruct", + "gemma-2-9b-it", + "phi-3-mini-4k-instruct", + "llama-3.1-swallow-8b-instruct-v0.1", + "qwen2.5-7b-instruct" + ] + }, + "4": { + "models": [ + "llama-3.3-70b-instruct", + "llama-3.1-70b-instruct", + "llama3-70b-instruct", + "codellama-70b", + "mixtral-8x7b-instruct-v0.1", + "llama-3.1-nemotron-70b-instruct", + "llama-3.1-swallow-70b-instruct-v0.1", + "llama-3-swallow-70b-instruct-v0.1", + "llama-3-taiwan-70b-instruct", + "llama-3.1-8b-instruct", + "llama3-8b-instruct", + "mistral-7b-instruct-v0.3", + "mistral-nemo-12b-instruct", + "gemma-2-9b-it", + "phi-3-mini-4k-instruct", + "llama-3.1-swallow-8b-instruct-v0.1", + "qwen2.5-7b-instruct" + ] + }, + "8": { + "models": [ + "mixtral-8x22b-instruct-v0.1", + "llama-3.3-70b-instruct", + "llama-3.1-70b-instruct", + "llama3-70b-instruct", + "codellama-70b", + "mixtral-8x7b-instruct-v0.1", + "llama-3.1-nemotron-70b-instruct", + "llama-3.1-swallow-70b-instruct-v0.1", + "llama-3-swallow-70b-instruct-v0.1", + "llama-3-taiwan-70b-instruct", + "llama-3.1-8b-instruct", + "llama3-8b-instruct", + "mistral-7b-instruct-v0.3", + "mistral-nemo-12b-instruct", + "gemma-2-9b-it", + "phi-3-mini-4k-instruct", + "llama-3.1-swallow-8b-instruct-v0.1", + "qwen2.5-7b-instruct" + ] + }, + "16": { + "models": [ + "mixtral-8x22b-instruct-v0.1", + "llama-3.3-70b-instruct", + "llama-3.1-70b-instruct", + "llama3-70b-instruct", + "codellama-70b", + "mixtral-8x7b-instruct-v0.1", + "llama-3.1-nemotron-70b-instruct", + "llama-3.1-swallow-70b-instruct-v0.1", + "llama-3-swallow-70b-instruct-v0.1", + "llama-3-taiwan-70b-instruct", + "llama-3.1-8b-instruct", + "llama3-8b-instruct", + "mistral-7b-instruct-v0.3", + "mistral-nemo-12b-instruct", + "gemma-2-9b-it", + "phi-3-mini-4k-instruct", + "llama-3.1-swallow-8b-instruct-v0.1", + "qwen2.5-7b-instruct" + ] + } + }, + "RTX 5090": { + "1": { + "models": [ + "llama-3.1-8b-instruct" + ] + }, + "2": { + "models": [ + "llama-3.1-8b-instruct" + ] + }, + "4": { + "models": [ + "llama-3.1-8b-instruct" + ] + }, + "8": { + "models": [ + "llama-3.1-8b-instruct" + ] + }, + "16": { + "models": [ + "llama-3.1-8b-instruct" + ] + } + }, + "RTX 5080": { + "1": { + "models": [ + "llama-3.1-8b-instruct" + ] + }, + "2": { + "models": [ + "llama-3.1-8b-instruct" + ] + }, + "4": { + "models": [ + "llama-3.1-8b-instruct" + ] + }, + "8": { + "models": [ + "llama-3.1-8b-instruct" + ] + }, + "16": { + "models": [ + "llama-3.1-8b-instruct" + ] + } + }, + "RTX 4090": { + "1": { + "models": [ + "llama-3.1-8b-instruct" + ] + }, + "2": { + "models": [ + "llama-3.1-8b-instruct" + ] + }, + "4": { + "models": [ + "llama-3.1-8b-instruct" + ] + }, + "8": { + "models": [ + "llama-3.1-8b-instruct" + ] + }, + "16": { + "models": [ + "llama-3.1-8b-instruct" + ] + } + }, + "RTX 4080": { + "1": { + "models": [ + "llama-3.1-8b-instruct" + ] + }, + "2": { + "models": [ + "llama-3.1-8b-instruct" + ] + }, + "4": { + "models": [ + "llama-3.1-8b-instruct" + ] + }, + "8": { + "models": [ + "llama-3.1-8b-instruct" + ] + }, + "16": { + "models": [ + "llama-3.1-8b-instruct" + ] + } + } +} \ No newline at end of file From 7d55975dd289f123fb11ff69a76ffa9e1ce6749b Mon Sep 17 00:00:00 2001 From: Tyler Whitehouse Date: Sun, 9 Mar 2025 13:00:49 -0400 
Subject: [PATCH 32/41] feat: Add GPU configuration support for NIM endpoints - Added files: code/chatui/utils/gpu_compatibility.py - Modified files: .gitignore, code/chatui/pages/converse.py, code/chatui/utils/graph.py, code/chatui/utils/nim.py - Add GPU type and count selection UI for all NIM components - Implement GPU compatibility checking and validation - Add dynamic model selection based on GPU configuration - Update state management to include GPU settings - Add error handling for GPU configuration issues - Improve API key handling in NIM client The changes allow users to: - Select GPU type and count for each NIM endpoint - See compatible models based on GPU configuration - Get warnings for incompatible configurations - Validate GPU settings before making API calls Technical changes: - Add gpu_compatibility module for GPU support matrix - Update CustomChatOpenAI to handle GPU configuration - Add GPU fields to GraphState - Add GPU validation in NIM client - Implement dynamic UI updates based on GPU selection --- .gitignore | 5 +- code/chatui/pages/converse.py | 649 ++++++++++++++++--------- code/chatui/utils/gpu_compatibility.py | 66 +++ code/chatui/utils/graph.py | 22 +- code/chatui/utils/nim.py | 48 +- 5 files changed, 535 insertions(+), 255 deletions(-) create mode 100644 code/chatui/utils/gpu_compatibility.py diff --git a/.gitignore b/.gitignore index 0f5e848..7c5de6b 100644 --- a/.gitignore +++ b/.gitignore @@ -55,4 +55,7 @@ cover/ # Workbench Project Layout data/scratch/* !data/scratch/.gitkeep -data/chroma.sqlite3 \ No newline at end of file +data/chroma.sqlite3 + +data/* +!data/.gitkeep \ No newline at end of file diff --git a/code/chatui/pages/converse.py b/code/chatui/pages/converse.py index 68dcbe5..caa2769 100644 --- a/code/chatui/pages/converse.py +++ b/code/chatui/pages/converse.py @@ -24,6 +24,7 @@ import subprocess import time import sys +import json INTERNAL_API = os.getenv('INTERNAL_API', '') @@ -41,7 +42,7 @@ from chatui import assets, chat_client from chatui.prompts import prompts_llama3, prompts_mistral -from chatui.utils import compile, database, logger +from chatui.utils import compile, database, logger, gpu_compatibility from langgraph.graph import END, StateGraph @@ -178,19 +179,48 @@ def build_page(client: chat_client.ChatClient) -> gr.Blocks: with gr.TabItem("NIM Endpoints", id=1) as router_nim: with gr.Row(): - nim_router_ip = gr.Textbox(placeholder = "10.123.45.678", - label = "Microservice Host", - info = "IP Address running the microservice", - elem_id="rag-inputs", scale=2) - nim_router_port = gr.Textbox(placeholder = "8000", - label = "Port", - info = "Optional, (default: 8000)", - elem_id="rag-inputs", scale=1) + nim_router_gpu_type = gr.Dropdown( + choices=gpu_compatibility.get_gpu_types(), + label="GPU Type", + info="Select your GPU type", + elem_id="rag-inputs", + scale=2 + ) + nim_router_gpu_count = gr.Dropdown( + choices=[], + label="Number of GPUs", + info="Select number of GPUs", + elem_id="rag-inputs", + scale=1, + interactive=False + ) - nim_router_id = gr.Textbox(placeholder = "meta/llama3-8b-instruct", - label = "Model running in microservice.", - info = "If none specified, defaults to: meta/llama3-8b-instruct", - elem_id="rag-inputs") + with gr.Row(): + nim_router_ip = gr.Textbox( + placeholder="10.123.45.678", + label="Microservice Host", + info="IP Address running the microservice", + elem_id="rag-inputs", + scale=2 + ) + nim_router_port = gr.Textbox( + placeholder="8000", + label="Port", + info="Optional, (default: 8000)", + 
elem_id="rag-inputs", + scale=1 + ) + + nim_router_id = gr.Dropdown( + choices=[], + label="Model running in microservice", + info="Select a compatible model for your GPU configuration", + elem_id="rag-inputs", + interactive=False + ) + + # Add warning box for compatibility issues + nim_router_warning = gr.Markdown(visible=False, value="") with gr.TabItem("Hide", id=2) as router_hide: gr.Markdown("") @@ -216,19 +246,48 @@ def build_page(client: chat_client.ChatClient) -> gr.Blocks: interactive=True) with gr.TabItem("NIM Endpoints", id=1) as retrieval_nim: with gr.Row(): - nim_retrieval_ip = gr.Textbox(placeholder = "10.123.45.678", - label = "Microservice Host", - info = "IP Address running the microservice", - elem_id="rag-inputs", scale=2) - nim_retrieval_port = gr.Textbox(placeholder = "8000", - label = "Port", - info = "Optional, (default: 8000)", - elem_id="rag-inputs", scale=1) + nim_retrieval_gpu_type = gr.Dropdown( + choices=gpu_compatibility.get_gpu_types(), + label="GPU Type", + info="Select your GPU type", + elem_id="rag-inputs", + scale=2 + ) + nim_retrieval_gpu_count = gr.Dropdown( + choices=[], + label="Number of GPUs", + info="Select number of GPUs", + elem_id="rag-inputs", + scale=1, + interactive=False + ) + + with gr.Row(): + nim_retrieval_ip = gr.Textbox( + placeholder="10.123.45.678", + label="Microservice Host", + info="IP Address running the microservice", + elem_id="rag-inputs", + scale=2 + ) + nim_retrieval_port = gr.Textbox( + placeholder="8000", + label="Port", + info="Optional, (default: 8000)", + elem_id="rag-inputs", + scale=1 + ) - nim_retrieval_id = gr.Textbox(placeholder = "meta/llama3-8b-instruct", - label = "Model running in microservice.", - info = "If none specified, defaults to: meta/llama3-8b-instruct", - elem_id="rag-inputs") + nim_retrieval_id = gr.Dropdown( + choices=[], + label="Model running in microservice", + info="Select a compatible model for your GPU configuration", + elem_id="rag-inputs", + interactive=False + ) + + # Add warning box for compatibility issues + nim_retrieval_warning = gr.Markdown(visible=False, value="") with gr.TabItem("Hide", id=2) as retrieval_hide: gr.Markdown("") @@ -254,19 +313,48 @@ def build_page(client: chat_client.ChatClient) -> gr.Blocks: interactive=True) with gr.TabItem("NIM Endpoints", id=1) as generator_nim: with gr.Row(): - nim_generator_ip = gr.Textbox(placeholder = "10.123.45.678", - label = "Microservice Host", - info = "IP Address running the microservice", - elem_id="rag-inputs", scale=2) - nim_generator_port = gr.Textbox(placeholder = "8000", - label = "Port", - info = "Optional, (default: 8000)", - elem_id="rag-inputs", scale=1) + nim_generator_gpu_type = gr.Dropdown( + choices=gpu_compatibility.get_gpu_types(), + label="GPU Type", + info="Select your GPU type", + elem_id="rag-inputs", + scale=2 + ) + nim_generator_gpu_count = gr.Dropdown( + choices=[], + label="Number of GPUs", + info="Select number of GPUs", + elem_id="rag-inputs", + scale=1, + interactive=False + ) + + with gr.Row(): + nim_generator_ip = gr.Textbox( + placeholder="10.123.45.678", + label="Microservice Host", + info="IP Address running the microservice", + elem_id="rag-inputs", + scale=2 + ) + nim_generator_port = gr.Textbox( + placeholder="8000", + label="Port", + info="Optional, (default: 8000)", + elem_id="rag-inputs", + scale=1 + ) - nim_generator_id = gr.Textbox(placeholder = "meta/llama3-8b-instruct", - label = "Model running in microservice.", - info = "If none specified, defaults to: meta/llama3-8b-instruct", - 
elem_id="rag-inputs") + nim_generator_id = gr.Dropdown( + choices=[], + label="Model running in microservice", + info="Select a compatible model for your GPU configuration", + elem_id="rag-inputs", + interactive=False + ) + + # Add warning box for compatibility issues + nim_generator_warning = gr.Markdown(visible=False, value="") with gr.TabItem("Hide", id=2) as generator_hide: gr.Markdown("") @@ -292,19 +380,48 @@ def build_page(client: chat_client.ChatClient) -> gr.Blocks: interactive=True) with gr.TabItem("NIM Endpoints", id=1) as hallucination_nim: with gr.Row(): - nim_hallucination_ip = gr.Textbox(placeholder = "10.123.45.678", - label = "Microservice Host", - info = "IP Address running the microservice", - elem_id="rag-inputs", scale=2) - nim_hallucination_port = gr.Textbox(placeholder = "8000", - label = "Port", - info = "Optional, (default: 8000)", - elem_id="rag-inputs", scale=1) + nim_hallucination_gpu_type = gr.Dropdown( + choices=gpu_compatibility.get_gpu_types(), + label="GPU Type", + info="Select your GPU type", + elem_id="rag-inputs", + scale=2 + ) + nim_hallucination_gpu_count = gr.Dropdown( + choices=[], + label="Number of GPUs", + info="Select number of GPUs", + elem_id="rag-inputs", + scale=1, + interactive=False + ) + + with gr.Row(): + nim_hallucination_ip = gr.Textbox( + placeholder="10.123.45.678", + label="Microservice Host", + info="IP Address running the microservice", + elem_id="rag-inputs", + scale=2 + ) + nim_hallucination_port = gr.Textbox( + placeholder="8000", + label="Port", + info="Optional, (default: 8000)", + elem_id="rag-inputs", + scale=1 + ) - nim_hallucination_id = gr.Textbox(placeholder = "meta/llama3-8b-instruct", - label = "Model running in microservice.", - info = "If none specified, defaults to: meta/llama3-8b-instruct", - elem_id="rag-inputs") + nim_hallucination_id = gr.Dropdown( + choices=[], + label="Model running in microservice", + info="Select a compatible model for your GPU configuration", + elem_id="rag-inputs", + interactive=False + ) + + # Add warning box for compatibility issues + nim_hallucination_warning = gr.Markdown(visible=False, value="") with gr.TabItem("Hide", id=2) as hallucination_hide: gr.Markdown("") @@ -330,19 +447,48 @@ def build_page(client: chat_client.ChatClient) -> gr.Blocks: interactive=True) with gr.TabItem("NIM Endpoints", id=1) as answer_nim: with gr.Row(): - nim_answer_ip = gr.Textbox(placeholder = "10.123.45.678", - label = "Microservice Host", - info = "IP Address running the microservice", - elem_id="rag-inputs", scale=2) - nim_answer_port = gr.Textbox(placeholder = "8000", - label = "Port", - info = "Optional, (default: 8000)", - elem_id="rag-inputs", scale=1) + nim_answer_gpu_type = gr.Dropdown( + choices=gpu_compatibility.get_gpu_types(), + label="GPU Type", + info="Select your GPU type", + elem_id="rag-inputs", + scale=2 + ) + nim_answer_gpu_count = gr.Dropdown( + choices=[], + label="Number of GPUs", + info="Select number of GPUs", + elem_id="rag-inputs", + scale=1, + interactive=False + ) - nim_answer_id = gr.Textbox(placeholder = "meta/llama3-8b-instruct", - label = "Model running in microservice.", - info = "If none specified, defaults to: meta/llama3-8b-instruct", - elem_id="rag-inputs") + with gr.Row(): + nim_answer_ip = gr.Textbox( + placeholder="10.123.45.678", + label="Microservice Host", + info="IP Address running the microservice", + elem_id="rag-inputs", + scale=2 + ) + nim_answer_port = gr.Textbox( + placeholder="8000", + label="Port", + info="Optional, (default: 8000)", + 
elem_id="rag-inputs", + scale=1 + ) + + nim_answer_id = gr.Dropdown( + choices=[], + label="Model running in microservice", + info="Select a compatible model for your GPU configuration", + elem_id="rag-inputs", + interactive=False + ) + + # Add warning box for compatibility issues + nim_answer_warning = gr.Markdown(visible=False, value="") with gr.TabItem("Hide", id=2) as answer_hide: gr.Markdown("") @@ -443,91 +589,204 @@ def _toggle_hide_answer(): """ These helper functions set state and prompts when either the NIM or API Endpoint tabs are selected. """ - def _toggle_router_endpoints(api_model: str, nim_model: str, evt: gr.EventData): - if (evt._data['value'] == "NIM Endpoints") and ("llama3" in nim_model or len(nim_model) == 0): - value = prompts_llama3.router_prompt - elif (evt._data['value'] == "NIM Endpoints") and ("mistral" in nim_model or "mixtral" in nim_model): - value = prompts_mistral.router_prompt - elif (evt._data['value'] == "API Endpoints") and ("llama3" in api_model): - value = prompts_llama3.router_prompt - elif (evt._data['value'] == "API Endpoints") and ("mistral" in api_model or "mixtral" in api_model): - value = prompts_mistral.router_prompt - return True if evt._data['value'] == "NIM Endpoints" else False, gr.update(value=value) if value is not None else gr.update(visible=True) - - def _toggle_retrieval_endpoints(api_model: str, nim_model: str, evt: gr.EventData): - if (evt._data['value'] == "NIM Endpoints") and ("llama3" in nim_model or len(nim_model) == 0): - value = prompts_llama3.retrieval_prompt - elif (evt._data['value'] == "NIM Endpoints") and ("mistral" in nim_model or "mixtral" in nim_model): - value = prompts_mistral.retrieval_prompt - elif (evt._data['value'] == "API Endpoints") and ("llama3" in api_model): - value = prompts_llama3.retrieval_prompt - elif (evt._data['value'] == "API Endpoints") and ("mistral" in api_model or "mixtral" in api_model): - value = prompts_mistral.retrieval_prompt - return True if evt._data['value'] == "NIM Endpoints" else False, gr.update(value=value) if value is not None else gr.update(visible=True) - - def _toggle_generator_endpoints(api_model: str, nim_model: str, evt: gr.EventData): - if (evt._data['value'] == "NIM Endpoints") and ("llama3" in nim_model or len(nim_model) == 0): - value = prompts_llama3.generator_prompt - elif (evt._data['value'] == "NIM Endpoints") and ("mistral" in nim_model or "mixtral" in nim_model): - value = prompts_mistral.generator_prompt - elif (evt._data['value'] == "API Endpoints") and ("llama3" in api_model): - value = prompts_llama3.generator_prompt - elif (evt._data['value'] == "API Endpoints") and ("mistral" in api_model or "mixtral" in api_model): - value = prompts_mistral.generator_prompt - return True if evt._data['value'] == "NIM Endpoints" else False, gr.update(value=value) if value is not None else gr.update(visible=True) - - def _toggle_hallucination_endpoints(api_model: str, nim_model: str, evt: gr.EventData): - if (evt._data['value'] == "NIM Endpoints") and ("llama3" in nim_model or len(nim_model) == 0): - value = prompts_llama3.hallucination_prompt - elif (evt._data['value'] == "NIM Endpoints") and ("mistral" in nim_model or "mixtral" in nim_model): - value = prompts_mistral.hallucination_prompt - elif (evt._data['value'] == "API Endpoints") and ("llama3" in api_model): - value = prompts_llama3.hallucination_prompt - elif (evt._data['value'] == "API Endpoints") and ("mistral" in api_model or "mixtral" in api_model): - value = prompts_mistral.hallucination_prompt - return True if 
evt._data['value'] == "NIM Endpoints" else False, gr.update(value=value) if value is not None else gr.update(visible=True) - - def _toggle_answer_endpoints(api_model: str, nim_model: str, evt: gr.EventData): - if (evt._data['value'] == "NIM Endpoints") and ("llama3" in nim_model or len(nim_model) == 0): - value = prompts_llama3.answer_prompt - elif (evt._data['value'] == "NIM Endpoints") and ("mistral" in nim_model or "mixtral" in nim_model): - value = prompts_mistral.answer_prompt - elif (evt._data['value'] == "API Endpoints") and ("llama3" in api_model): - value = prompts_llama3.answer_prompt - elif (evt._data['value'] == "API Endpoints") and ("mistral" in api_model or "mixtral" in api_model): - value = prompts_mistral.answer_prompt - return True if evt._data['value'] == "NIM Endpoints" else False, gr.update(value=value) if value is not None else gr.update(visible=True) - - router_api.select(_toggle_router_endpoints, [model_router, nim_router_id], [router_use_nim, prompt_router]) - router_nim.select(_toggle_router_endpoints, [model_router, nim_router_id], [router_use_nim, prompt_router]) - retrieval_api.select(_toggle_retrieval_endpoints, [model_retrieval, nim_retrieval_id], [retrieval_use_nim, prompt_retrieval]) - retrieval_nim.select(_toggle_retrieval_endpoints, [model_retrieval, nim_retrieval_id], [retrieval_use_nim, prompt_retrieval]) - generator_api.select(_toggle_generator_endpoints, [model_generator, nim_generator_id], [generator_use_nim, prompt_generator]) - generator_nim.select(_toggle_generator_endpoints, [model_generator, nim_generator_id], [generator_use_nim, prompt_generator]) - hallucination_api.select(_toggle_hallucination_endpoints, [model_hallucination, nim_hallucination_id], [hallucination_use_nim, prompt_hallucination]) - hallucination_nim.select(_toggle_hallucination_endpoints, [model_hallucination, nim_hallucination_id], [hallucination_use_nim, prompt_hallucination]) - answer_api.select(_toggle_answer_endpoints, [model_answer, nim_answer_id], [answer_use_nim, prompt_answer]) - answer_nim.select(_toggle_answer_endpoints, [model_answer, nim_answer_id], [answer_use_nim, prompt_answer]) + def _update_gpu_counts(component: str, gpu_type: str): + """Update the available GPU counts for selected GPU type.""" + counts = gpu_compatibility.get_supported_gpu_counts(gpu_type) + components = { + "router": [nim_router_gpu_count, nim_router_id, nim_router_warning], + "retrieval": [nim_retrieval_gpu_count, nim_retrieval_id, nim_retrieval_warning], + "generator": [nim_generator_gpu_count, nim_generator_id, nim_generator_warning], + "hallucination": [nim_hallucination_gpu_count, nim_hallucination_id, nim_hallucination_warning], + "answer": [nim_answer_gpu_count, nim_answer_id, nim_answer_warning] + } + return { + components[component][0]: gr.update(choices=counts, value=None, interactive=True), + components[component][1]: gr.update(choices=[], value=None, interactive=False), + components[component][2]: gr.update(visible=False, value="") + } + + def _update_compatible_models(component: str, gpu_type: str, num_gpus: str): + """Update the compatible models list based on GPU configuration.""" + if not gpu_type or not num_gpus: + components = { + "router": [nim_router_id, nim_router_warning], + "retrieval": [nim_retrieval_id, nim_retrieval_warning], + "generator": [nim_generator_id, nim_generator_warning], + "hallucination": [nim_hallucination_id, nim_hallucination_warning], + "answer": [nim_answer_id, nim_answer_warning] + } + return { + components[component][0]: gr.update(choices=[], 
value=None, interactive=False), + components[component][1]: gr.update(visible=False, value="") + } + + compatibility = gpu_compatibility.get_compatible_models(gpu_type, num_gpus) + + if compatibility["warning_message"]: + components = { + "router": [nim_router_id, nim_router_warning], + "retrieval": [nim_retrieval_id, nim_retrieval_warning], + "generator": [nim_generator_id, nim_generator_warning], + "hallucination": [nim_hallucination_id, nim_hallucination_warning], + "answer": [nim_answer_id, nim_answer_warning] + } + return { + components[component][0]: gr.update(choices=[], value=None, interactive=False), + components[component][1]: gr.update(visible=True, value=f"⚠️ {compatibility['warning_message']}") + } + + components = { + "router": [nim_router_id, nim_router_warning], + "retrieval": [nim_retrieval_id, nim_retrieval_warning], + "generator": [nim_generator_id, nim_generator_warning], + "hallucination": [nim_hallucination_id, nim_hallucination_warning], + "answer": [nim_answer_id, nim_answer_warning] + } + return { + components[component][0]: gr.update( + choices=compatibility["compatible_models"], + value=compatibility["compatible_models"][0] if compatibility["compatible_models"] else None, + interactive=True + ), + components[component][1]: gr.update(visible=False, value="") + } + + # Add the event handlers for all components + nim_router_gpu_type.change(lambda x: _update_gpu_counts("router", x), nim_router_gpu_type, + [nim_router_gpu_count, nim_router_id, nim_router_warning]) + nim_router_gpu_count.change(lambda x, y: _update_compatible_models("router", x, y), + [nim_router_gpu_type, nim_router_gpu_count], + [nim_router_id, nim_router_warning]) + + nim_retrieval_gpu_type.change(lambda x: _update_gpu_counts("retrieval", x), nim_retrieval_gpu_type, + [nim_retrieval_gpu_count, nim_retrieval_id, nim_retrieval_warning]) + nim_retrieval_gpu_count.change(lambda x, y: _update_compatible_models("retrieval", x, y), + [nim_retrieval_gpu_type, nim_retrieval_gpu_count], + [nim_retrieval_id, nim_retrieval_warning]) + + nim_generator_gpu_type.change(lambda x: _update_gpu_counts("generator", x), nim_generator_gpu_type, + [nim_generator_gpu_count, nim_generator_id, nim_generator_warning]) + nim_generator_gpu_count.change(lambda x, y: _update_compatible_models("generator", x, y), + [nim_generator_gpu_type, nim_generator_gpu_count], + [nim_generator_id, nim_generator_warning]) + + nim_hallucination_gpu_type.change(lambda x: _update_gpu_counts("hallucination", x), nim_hallucination_gpu_type, + [nim_hallucination_gpu_count, nim_hallucination_id, nim_hallucination_warning]) + nim_hallucination_gpu_count.change(lambda x, y: _update_compatible_models("hallucination", x, y), + [nim_hallucination_gpu_type, nim_hallucination_gpu_count], + [nim_hallucination_id, nim_hallucination_warning]) + + nim_answer_gpu_type.change(lambda x: _update_gpu_counts("answer", x), nim_answer_gpu_type, + [nim_answer_gpu_count, nim_answer_id, nim_answer_warning]) + nim_answer_gpu_count.change(lambda x, y: _update_compatible_models("answer", x, y), + [nim_answer_gpu_type, nim_answer_gpu_count], + [nim_answer_id, nim_answer_warning]) + + """ These helper functions track the API Endpoint selected and regenerates the prompt accordingly. 
""" + + def _toggle_model_router(selected_model: str): + match selected_model: + case str() if selected_model == LLAMA: + return gr.update(value=prompts_llama3.router_prompt) + case str() if selected_model == MISTRAL: + return gr.update(value=prompts_mistral.router_prompt) + case _: + return gr.update(value=prompts_llama3.router_prompt) - """ These helper functions hide and show the right-hand settings panel when toggled. """ + def _toggle_model_retrieval(selected_model: str): + match selected_model: + case str() if selected_model == LLAMA: + return gr.update(value=prompts_llama3.retrieval_prompt) + case str() if selected_model == MISTRAL: + return gr.update(value=prompts_mistral.retrieval_prompt) + case _: + return gr.update(value=prompts_llama3.retrieval_prompt) + + def _toggle_model_generator(selected_model: str): + match selected_model: + case str() if selected_model == LLAMA: + return gr.update(value=prompts_llama3.generator_prompt) + case str() if selected_model == MISTRAL: + return gr.update(value=prompts_mistral.generator_prompt) + case _: + return gr.update(value=prompts_llama3.generator_prompt) + + def _toggle_model_hallucination(selected_model: str): + match selected_model: + case str() if selected_model == LLAMA: + return gr.update(value=prompts_llama3.hallucination_prompt) + case str() if selected_model == MISTRAL: + return gr.update(value=prompts_mistral.hallucination_prompt) + case _: + return gr.update(value=prompts_llama3.hallucination_prompt) + + def _toggle_model_answer(selected_model: str): + match selected_model: + case str() if selected_model == LLAMA: + return gr.update(value=prompts_llama3.answer_prompt) + case str() if selected_model == MISTRAL: + return gr.update(value=prompts_mistral.answer_prompt) + case _: + return gr.update(value=prompts_llama3.answer_prompt) + + model_router.change(_toggle_model_router, [model_router], [prompt_router]) + model_retrieval.change(_toggle_model_retrieval, [model_retrieval], [prompt_retrieval]) + model_generator.change(_toggle_model_generator, [model_generator], [prompt_generator]) + model_hallucination.change(_toggle_model_hallucination, [model_hallucination], [prompt_hallucination]) + model_answer.change(_toggle_model_answer, [model_answer], [prompt_answer]) - def _toggle_hide_all_settings(): + """ These helper functions upload and clear the documents and webpages to/from the ChromaDB. 
""" + + def _upload_documents_pdf(files, progress=gr.Progress()): + progress(0.25, desc="Initializing Task") + time.sleep(0.75) + progress(0.5, desc="Uploading Docs") + database.upload_pdf(files) + progress(0.75, desc="Cleaning Up") + time.sleep(0.75) return { - settings_column: gr.update(visible=False), - hidden_settings_column: gr.update(visible=True), + url_docs_clear: gr.update(value="Clear Docs", variant="secondary", interactive=True), + pdf_docs_clear: gr.update(value="Clear Docs", variant="secondary", interactive=True), + agentic_flow: gr.update(visible=True), } - def _toggle_show_all_settings(): + def _upload_documents(docs: str, progress=gr.Progress()): + progress(0.2, desc="Initializing Task") + time.sleep(0.75) + progress(0.4, desc="Processing URL List") + docs_list = docs.splitlines() + progress(0.6, desc="Uploading Docs") + database.upload(docs_list) + progress(0.8, desc="Cleaning Up") + time.sleep(0.75) return { - settings_column: gr.update(visible=True), - settings_tabs: gr.update(selected=0), - hidden_settings_column: gr.update(visible=False), + url_docs_upload: gr.update(value="Docs Uploaded", variant="primary", interactive=False), + url_docs_clear: gr.update(value="Clear Docs", variant="secondary", interactive=True), + pdf_docs_clear: gr.update(value="Clear Docs", variant="secondary", interactive=True), + agentic_flow: gr.update(visible=True), } - hide_all_settings.select(_toggle_hide_all_settings, None, [settings_column, hidden_settings_column]) - show_settings.click(_toggle_show_all_settings, None, [settings_column, settings_tabs, hidden_settings_column]) - - """ This helper function ensures the model settings are reset when a user re-navigates to the tab. """ + def _clear_documents(progress=gr.Progress()): + progress(0.25, desc="Initializing Task") + time.sleep(0.75) + progress(0.5, desc="Clearing Database") + database.clear() + progress(0.75, desc="Cleaning Up") + time.sleep(0.75) + return { + url_docs_upload: gr.update(value="Upload Docs", variant="secondary", interactive=True), + url_docs_clear: gr.update(value="Docs Cleared", variant="primary", interactive=False), + pdf_docs_upload: gr.update(value=None), + pdf_docs_clear: gr.update(value="Docs Cleared", variant="primary", interactive=False), + agentic_flow: gr.update(visible=True), + } + + url_docs_upload.click(_upload_documents, [url_docs], [url_docs_upload, url_docs_clear, pdf_docs_clear, agentic_flow]) + url_docs_clear.click(_clear_documents, [], [url_docs_upload, url_docs_clear, pdf_docs_upload, pdf_docs_clear, agentic_flow]) + pdf_docs_upload.upload(_upload_documents_pdf, [pdf_docs_upload], [url_docs_clear, pdf_docs_clear, agentic_flow]) + pdf_docs_clear.click(_clear_documents, [], [url_docs_upload, url_docs_clear, pdf_docs_upload, pdf_docs_clear, agentic_flow]) + + """ These helper functions set state and prompts when either the NIM or API Endpoint tabs are selected. """ def _toggle_model_tab(): return { @@ -640,110 +899,6 @@ def _toggle_model(btn: str): hallucination_btn, answer_btn]) - """ These helper functions track the API Endpoint selected and regenerates the prompt accordingly. 
""" - - def _toggle_model_router(selected_model: str): - match selected_model: - case str() if selected_model == LLAMA: - return gr.update(value=prompts_llama3.router_prompt) - case str() if selected_model == MISTRAL: - return gr.update(value=prompts_mistral.router_prompt) - case _: - return gr.update(value=prompts_llama3.router_prompt) - - def _toggle_model_retrieval(selected_model: str): - match selected_model: - case str() if selected_model == LLAMA: - return gr.update(value=prompts_llama3.retrieval_prompt) - case str() if selected_model == MISTRAL: - return gr.update(value=prompts_mistral.retrieval_prompt) - case _: - return gr.update(value=prompts_llama3.retrieval_prompt) - - def _toggle_model_generator(selected_model: str): - match selected_model: - case str() if selected_model == LLAMA: - return gr.update(value=prompts_llama3.generator_prompt) - case str() if selected_model == MISTRAL: - return gr.update(value=prompts_mistral.generator_prompt) - case _: - return gr.update(value=prompts_llama3.generator_prompt) - - def _toggle_model_hallucination(selected_model: str): - match selected_model: - case str() if selected_model == LLAMA: - return gr.update(value=prompts_llama3.hallucination_prompt) - case str() if selected_model == MISTRAL: - return gr.update(value=prompts_mistral.hallucination_prompt) - case _: - return gr.update(value=prompts_llama3.hallucination_prompt) - - def _toggle_model_answer(selected_model: str): - match selected_model: - case str() if selected_model == LLAMA: - return gr.update(value=prompts_llama3.answer_prompt) - case str() if selected_model == MISTRAL: - return gr.update(value=prompts_mistral.answer_prompt) - case _: - return gr.update(value=prompts_llama3.answer_prompt) - - model_router.change(_toggle_model_router, [model_router], [prompt_router]) - model_retrieval.change(_toggle_model_retrieval, [model_retrieval], [prompt_retrieval]) - model_generator.change(_toggle_model_generator, [model_generator], [prompt_generator]) - model_hallucination.change(_toggle_model_hallucination, [model_hallucination], [prompt_hallucination]) - model_answer.change(_toggle_model_answer, [model_answer], [prompt_answer]) - - """ These helper functions upload and clear the documents and webpages to/from the ChromaDB. 
""" - - def _upload_documents_pdf(files, progress=gr.Progress()): - progress(0.25, desc="Initializing Task") - time.sleep(0.75) - progress(0.5, desc="Uploading Docs") - database.upload_pdf(files) - progress(0.75, desc="Cleaning Up") - time.sleep(0.75) - return { - url_docs_clear: gr.update(value="Clear Docs", variant="secondary", interactive=True), - pdf_docs_clear: gr.update(value="Clear Docs", variant="secondary", interactive=True), - agentic_flow: gr.update(visible=True), - } - - def _upload_documents(docs: str, progress=gr.Progress()): - progress(0.2, desc="Initializing Task") - time.sleep(0.75) - progress(0.4, desc="Processing URL List") - docs_list = docs.splitlines() - progress(0.6, desc="Uploading Docs") - database.upload(docs_list) - progress(0.8, desc="Cleaning Up") - time.sleep(0.75) - return { - url_docs_upload: gr.update(value="Docs Uploaded", variant="primary", interactive=False), - url_docs_clear: gr.update(value="Clear Docs", variant="secondary", interactive=True), - pdf_docs_clear: gr.update(value="Clear Docs", variant="secondary", interactive=True), - agentic_flow: gr.update(visible=True), - } - - def _clear_documents(progress=gr.Progress()): - progress(0.25, desc="Initializing Task") - time.sleep(0.75) - progress(0.5, desc="Clearing Database") - database.clear() - progress(0.75, desc="Cleaning Up") - time.sleep(0.75) - return { - url_docs_upload: gr.update(value="Upload Docs", variant="secondary", interactive=True), - url_docs_clear: gr.update(value="Docs Cleared", variant="primary", interactive=False), - pdf_docs_upload: gr.update(value=None), - pdf_docs_clear: gr.update(value="Docs Cleared", variant="primary", interactive=False), - agentic_flow: gr.update(visible=True), - } - - url_docs_upload.click(_upload_documents, [url_docs], [url_docs_upload, url_docs_clear, pdf_docs_clear, agentic_flow]) - url_docs_clear.click(_clear_documents, [], [url_docs_upload, url_docs_clear, pdf_docs_upload, pdf_docs_clear, agentic_flow]) - pdf_docs_upload.upload(_upload_documents_pdf, [pdf_docs_upload], [url_docs_clear, pdf_docs_clear, agentic_flow]) - pdf_docs_clear.click(_clear_documents, [], [url_docs_upload, url_docs_clear, pdf_docs_upload, pdf_docs_clear, agentic_flow]) - """ This helper function builds out the submission function call when a user submits a query. """ _my_build_stream = functools.partial(_stream_predict, client, app) @@ -874,3 +1029,13 @@ def _stream_predict( yield "", chat_history + [[question, final_value["generation"]]], gr.update(show_label=False) except Exception as e: yield "", chat_history + [[question, "*** ERR: Unable to process query. Check the Monitor tab for details. 
***\n\nException: " + str(e)]], gr.update(show_label=False) + +_support_matrix_cache = None + +def load_gpu_support_matrix() -> Dict: + global _support_matrix_cache + if _support_matrix_cache is None: + matrix_path = os.path.join(os.path.dirname(__file__), '..', '..', 'nim_gpu_support_matrix.json') + with open(matrix_path, 'r') as f: + _support_matrix_cache = json.load(f) + return _support_matrix_cache diff --git a/code/chatui/utils/gpu_compatibility.py b/code/chatui/utils/gpu_compatibility.py new file mode 100644 index 0000000..d03a0e9 --- /dev/null +++ b/code/chatui/utils/gpu_compatibility.py @@ -0,0 +1,66 @@ +"""Utility module for GPU compatibility checking.""" + +import json +import os +from typing import Dict, List, Optional, TypedDict + +class GPUConfig(TypedDict): + """Type definition for GPU configuration.""" + gpu_type: str + num_gpus: int + +class ModelCompatibility(TypedDict): + """Type definition for model compatibility results.""" + compatible_models: List[str] + warning_message: Optional[str] + +def load_gpu_support_matrix() -> Dict: + """Load the GPU support matrix from JSON file.""" + matrix_path = os.path.join(os.path.dirname(__file__), '..', '..', 'nim_gpu_support_matrix.json') + with open(matrix_path, 'r') as f: + return json.load(f) + +def get_compatible_models(gpu_type: str, num_gpus: str) -> ModelCompatibility: + """ + Get list of compatible models for given GPU configuration. + + Args: + gpu_type: Type of GPU (e.g. "H100", "A100 80GB") + num_gpus: Number of GPUs as string (e.g. "1", "2", "4", "8", "16") + + Returns: + ModelCompatibility with list of compatible models and optional warning + """ + support_matrix = load_gpu_support_matrix() + + # Validate inputs + if gpu_type not in support_matrix: + return ModelCompatibility( + compatible_models=[], + warning_message=f"GPU type {gpu_type} not found in support matrix" + ) + + if num_gpus not in support_matrix[gpu_type]: + return ModelCompatibility( + compatible_models=[], + warning_message=f"Configuration with {num_gpus} GPUs not supported for {gpu_type}" + ) + + # Get compatible models + models = support_matrix[gpu_type][num_gpus]["models"] + + return ModelCompatibility( + compatible_models=models, + warning_message=None if models else f"No compatible models found for {gpu_type} with {num_gpus} GPUs" + ) + +def get_gpu_types() -> List[str]: + """Get list of supported GPU types.""" + return list(load_gpu_support_matrix().keys()) + +def get_supported_gpu_counts(gpu_type: str) -> List[str]: + """Get list of supported GPU counts for a given GPU type.""" + support_matrix = load_gpu_support_matrix() + if gpu_type not in support_matrix: + return [] + return list(support_matrix[gpu_type].keys()) \ No newline at end of file diff --git a/code/chatui/utils/graph.py b/code/chatui/utils/graph.py index 36cb3e7..2d90260 100644 --- a/code/chatui/utils/graph.py +++ b/code/chatui/utils/graph.py @@ -16,7 +16,7 @@ import os from typing_extensions import TypedDict -from typing import List +from typing import List, Optional from langchain.prompts import PromptTemplate from langchain_core.output_parsers import StrOutputParser, JsonOutputParser @@ -73,6 +73,16 @@ class GraphState(TypedDict): nim_retrieval_id: str nim_hallucination_id: str nim_answer_id: str + nim_generator_gpu_type: Optional[str] + nim_generator_gpu_count: Optional[str] + nim_router_gpu_type: Optional[str] + nim_router_gpu_count: Optional[str] + nim_retrieval_gpu_type: Optional[str] + nim_retrieval_gpu_count: Optional[str] + nim_hallucination_gpu_type: Optional[str] + 
nim_hallucination_gpu_count: Optional[str] + nim_answer_gpu_type: Optional[str] + nim_answer_gpu_count: Optional[str] from langchain.schema import Document @@ -121,6 +131,8 @@ def generate(state): llm = nim.CustomChatOpenAI(custom_endpoint=state["nim_generator_ip"], port=state["nim_generator_port"] if len(state["nim_generator_port"]) > 0 else "8000", model_name=state["nim_generator_id"] if len(state["nim_generator_id"]) > 0 else "meta/llama3-8b-instruct", + gpu_type=state["nim_generator_gpu_type"] if "nim_generator_gpu_type" in state else None, + gpu_count=state["nim_generator_gpu_count"] if "nim_generator_gpu_count" in state else None, temperature=0.7) if state["generator_use_nim"] else ChatNVIDIA(model=state["generator_model_id"], temperature=0.7) rag_chain = prompt | llm | StrOutputParser() generation = rag_chain.invoke({"context": documents, "question": question}) @@ -153,6 +165,8 @@ def grade_documents(state): llm = nim.CustomChatOpenAI(custom_endpoint=state["nim_retrieval_ip"], port=state["nim_retrieval_port"] if len(state["nim_retrieval_port"]) > 0 else "8000", model_name=state["nim_retrieval_id"] if len(state["nim_retrieval_id"]) > 0 else "meta/llama3-8b-instruct", + gpu_type=state["nim_retrieval_gpu_type"] if "nim_retrieval_gpu_type" in state else None, + gpu_count=state["nim_retrieval_gpu_count"] if "nim_retrieval_gpu_count" in state else None, temperature=0.7) if state["retrieval_use_nim"] else ChatNVIDIA(model=state["retrieval_model_id"], temperature=0) retrieval_grader = prompt | llm | JsonOutputParser() for d in documents: @@ -225,6 +239,8 @@ def route_question(state): llm = nim.CustomChatOpenAI(custom_endpoint=state["nim_router_ip"], port=state["nim_router_port"] if len(state["nim_router_port"]) > 0 else "8000", model_name=state["nim_router_id"] if len(state["nim_router_id"]) > 0 else "meta/llama3-8b-instruct", + gpu_type=state["nim_router_gpu_type"] if "nim_router_gpu_type" in state else None, + gpu_count=state["nim_router_gpu_count"] if "nim_router_gpu_count" in state else None, temperature=0.7) if state["router_use_nim"] else ChatNVIDIA(model=state["router_model_id"], temperature=0) question_router = prompt | llm | JsonOutputParser() source = question_router.invoke({"question": question}) @@ -292,6 +308,8 @@ def grade_generation_v_documents_and_question(state): llm = nim.CustomChatOpenAI(custom_endpoint=state["nim_hallucination_ip"], port=state["nim_hallucination_port"] if len(state["nim_hallucination_port"]) > 0 else "8000", model_name=state["nim_hallucination_id"] if len(state["nim_hallucination_id"]) > 0 else "meta/llama3-8b-instruct", + gpu_type=state["nim_hallucination_gpu_type"] if "nim_hallucination_gpu_type" in state else None, + gpu_count=state["nim_hallucination_gpu_count"] if "nim_hallucination_gpu_count" in state else None, temperature=0.7) if state["hallucination_use_nim"] else ChatNVIDIA(model=state["hallucination_model_id"], temperature=0) hallucination_grader = prompt | llm | JsonOutputParser() @@ -308,6 +326,8 @@ def grade_generation_v_documents_and_question(state): llm = nim.CustomChatOpenAI(custom_endpoint=state["nim_answer_ip"], port=state["nim_answer_port"] if len(state["nim_answer_port"]) > 0 else "8000", model_name=state["nim_answer_id"] if len(state["nim_answer_id"]) > 0 else "meta/llama3-8b-instruct", + gpu_type=state["nim_answer_gpu_type"] if "nim_answer_gpu_type" in state else None, + gpu_count=state["nim_answer_gpu_count"] if "nim_answer_gpu_count" in state else None, temperature=0.7) if state["answer_use_nim"] else 
ChatNVIDIA(model=state["answer_model_id"], temperature=0) answer_grader = prompt | llm | JsonOutputParser() diff --git a/code/chatui/utils/nim.py b/code/chatui/utils/nim.py index 69bf3d0..dcb2091 100644 --- a/code/chatui/utils/nim.py +++ b/code/chatui/utils/nim.py @@ -18,6 +18,8 @@ from langchain_core.load.dump import dumps from pydantic import Field from typing import List, Mapping, Optional, Any +from chatui.utils import gpu_compatibility +import os class CustomChatOpenAI(BaseChatModel): """ This is a custom built class for using LangChain to chat with custom OpenAI API-compatible endpoints, eg. NIMs. """ @@ -26,13 +28,24 @@ class CustomChatOpenAI(BaseChatModel): port: Optional[str] = "8000" model_name: Optional[str] = "meta/llama3-8b-instruct" temperature: Optional[float] = 0.0 + gpu_type: Optional[str] = None + gpu_count: Optional[str] = None - def __init__(self, custom_endpoint, port="8000", model_name="meta/llama3-8b-instruct", temperature=0.0, **kwargs): + def __init__(self, custom_endpoint, port="8000", model_name="meta/llama3-8b-instruct", + gpu_type=None, gpu_count=None, temperature=0.0, **kwargs): super().__init__(**kwargs) + if gpu_type and gpu_count: + compatibility = gpu_compatibility.get_compatible_models(gpu_type, gpu_count) + if compatibility["warning_message"]: + raise ValueError(compatibility["warning_message"]) + if model_name not in compatibility["compatible_models"]: + raise ValueError(f"Model {model_name} is not compatible with {gpu_type} ({gpu_count} GPUs)") self.custom_endpoint = custom_endpoint self.port = port self.model_name = model_name self.temperature = temperature + self.gpu_type = gpu_type + self.gpu_count = gpu_count @property def _llm_type(self) -> str: @@ -45,18 +58,31 @@ def _generate(self, messages, stop=None, run_manager=None, **kwargs): def _call_custom_endpoint(self, messages, **kwargs): import openai import json - - openai.api_key = "xyz" - openai.base_url = "http://" + self.custom_endpoint + ":" + self.port + "/v1/" - + + openai.api_key = os.getenv("OPENAI_API_KEY", "xyz") # Better API key handling + openai.base_url = f"http://{self.custom_endpoint}:{self.port}/v1/" + obj = json.loads(dumps(messages)) - response = openai.chat.completions.create( - model=self.model_name, - messages=[{"role": "user", "content": obj[0]["kwargs"]["content"]}], - temperature=self.temperature, - ) - return response + config = { + "model": self.model_name, + "messages": [{"role": "user", "content": obj[0]["kwargs"]["content"]}], + "temperature": self.temperature, + } + + if self.gpu_type and self.gpu_count: + config["gpu_config"] = { + "type": self.gpu_type, + "count": self.gpu_count + } + + try: + response = openai.chat.completions.create(**config) + return response + except Exception as e: + if self.gpu_type and self.gpu_count: + raise ValueError(f"Error with GPU configuration ({self.gpu_type}, {self.gpu_count} GPUs): {str(e)}") + raise e def _create_chat_result(self, response): from langchain_core.messages import ChatMessage From cc4a0fd9f6b8cbf7bbffa3b2a25ce8a85aa48c27 Mon Sep 17 00:00:00 2001 From: Tyler Whitehouse Date: Sun, 9 Mar 2025 13:33:07 -0400 Subject: [PATCH 33/41] Added info on endpoint prefix for nvidians. Note. feature not implemented yet. 
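The GPU configuration flow added above can be exercised directly from Python. The snippet below is a minimal sketch only: it assumes the project's `code/` directory is on `PYTHONPATH`, that `code/nim_gpu_support_matrix.json` is in place, and it uses `10.123.45.678` purely as a placeholder host.

```python
# Minimal sketch: assumes the repo's code/ directory is on PYTHONPATH and that
# code/nim_gpu_support_matrix.json exists; 10.123.45.678 is a placeholder host.
from chatui.utils import gpu_compatibility, nim

# The UI populates its dropdowns from the support matrix.
print(gpu_compatibility.get_gpu_types())                        # e.g. ["H100", "A100 80GB", ...]
print(gpu_compatibility.get_supported_gpu_counts("A100 80GB"))  # e.g. ["1", "2", "4", "8", "16"]

# Only models compatible with the selected configuration are offered.
result = gpu_compatibility.get_compatible_models("A100 80GB", "2")
if result["warning_message"]:
    print(f"Warning: {result['warning_message']}")
else:
    print(result["compatible_models"])  # includes e.g. "llama-3.1-70b-instruct"

# CustomChatOpenAI re-validates the same settings on construction and raises
# ValueError if the model is not compatible with the chosen GPU type/count.
llm = nim.CustomChatOpenAI(
    custom_endpoint="10.123.45.678",
    port="8000",
    model_name="llama-3.1-70b-instruct",
    gpu_type="A100 80GB",
    gpu_count="2",
)
```

Passing a `model_name` that is not in the compatible list for the selected GPU configuration raises a `ValueError` before any request is sent to the endpoint.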
- Modified files: README.md, code/output.log --- README.md | 7 +++++ code/output.log | 77 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 84 insertions(+) diff --git a/README.md b/README.md index 5315635..abf4867 100644 --- a/README.md +++ b/README.md @@ -17,6 +17,13 @@ This Retrieval Augmented Generation (RAG) application uses an agentic approach t ## The Agentic RAG Application #### Using the Application 1. You embed your documents (pdfs or webpages) to the vector database. + +
+📝 Note About Internal Endpoints + +Some users (e.g., NVIDIA employees) may need to configure the `INTERNAL_API` environment variable to access internal endpoints. See the [Environment Variables documentation](https://docs.nvidia.com/ai-workbench/user-guide/latest/environment/variables.html) for setup instructions. +
+ 2. You configure each of the separate components for the pipeline. For each component you can: * Select from a drop down of endpoints or use a self-hosted endpoint. * Modify the prompt. diff --git a/code/output.log b/code/output.log index 3af4e0c..29e2815 100644 --- a/code/output.log +++ b/code/output.log @@ -4,3 +4,80 @@ Running on local URL: http://0.0.0.0:8080 To create a public link, set `share=True` in `launch()`. IMPORTANT: You are using gradio version 4.15.0, however version 4.44.1 is available, please upgrade. -------- +---ROUTE QUESTION--- +hello +{'datasource': 'web_search'} +---ROUTE QUESTION TO WEB SEARCH--- +---WEB SEARCH--- +---GENERATE--- +---CHECK HALLUCINATIONS--- +---DECISION: GENERATION IS GROUNDED IN DOCUMENTS--- +---GRADE GENERATION vs QUESTION--- +---DECISION: GENERATION DOES NOT ADDRESS QUESTION--- +---WEB SEARCH--- +---GENERATE--- +---CHECK HALLUCINATIONS--- +---DECISION: GENERATION IS NOT GROUNDED IN DOCUMENTS, RE-TRY--- +---GENERATE--- +---CHECK HALLUCINATIONS--- +---DECISION: GENERATION IS NOT GROUNDED IN DOCUMENTS, RE-TRY--- +---GENERATE--- +---CHECK HALLUCINATIONS--- +---DECISION: GENERATION IS NOT GROUNDED IN DOCUMENTS, RE-TRY--- +---GENERATE--- +---CHECK HALLUCINATIONS--- +---DECISION: GENERATION IS GROUNDED IN DOCUMENTS--- +---GRADE GENERATION vs QUESTION--- +---DECISION: GENERATION DOES NOT ADDRESS QUESTION--- +---WEB SEARCH--- +---GENERATE--- +---CHECK HALLUCINATIONS--- +---DECISION: GENERATION IS GROUNDED IN DOCUMENTS--- +---GRADE GENERATION vs QUESTION--- +---DECISION: GENERATION DOES NOT ADDRESS QUESTION--- +---WEB SEARCH--- +---GENERATE--- +---CHECK HALLUCINATIONS--- +---DECISION: GENERATION IS NOT GROUNDED IN DOCUMENTS, RE-TRY--- +---GENERATE--- +---CHECK HALLUCINATIONS--- +---DECISION: GENERATION IS NOT GROUNDED IN DOCUMENTS, RE-TRY--- +---GENERATE--- +---CHECK HALLUCINATIONS--- +---DECISION: GENERATION IS GROUNDED IN DOCUMENTS--- +---GRADE GENERATION vs QUESTION--- +---DECISION: GENERATION DOES NOT ADDRESS QUESTION--- +---WEB SEARCH--- +---GENERATE--- +---CHECK HALLUCINATIONS--- +---DECISION: GENERATION IS GROUNDED IN DOCUMENTS--- +---GRADE GENERATION vs QUESTION--- +---DECISION: GENERATION DOES NOT ADDRESS QUESTION--- +---WEB SEARCH--- +---GENERATE--- +---CHECK HALLUCINATIONS--- +---DECISION: GENERATION IS NOT GROUNDED IN DOCUMENTS, RE-TRY--- +---GENERATE--- +---CHECK HALLUCINATIONS--- +---DECISION: GENERATION IS GROUNDED IN DOCUMENTS--- +---GRADE GENERATION vs QUESTION--- +---DECISION: GENERATION DOES NOT ADDRESS QUESTION--- +---WEB SEARCH--- +---GENERATE--- +---CHECK HALLUCINATIONS--- +---DECISION: GENERATION IS NOT GROUNDED IN DOCUMENTS, RE-TRY--- +---GENERATE--- +---CHECK HALLUCINATIONS--- +---DECISION: GENERATION IS NOT GROUNDED IN DOCUMENTS, RE-TRY--- +---GENERATE--- +---CHECK HALLUCINATIONS--- +---DECISION: GENERATION IS NOT GROUNDED IN DOCUMENTS, RE-TRY--- +---GENERATE--- +---CHECK HALLUCINATIONS--- +---DECISION: GENERATION IS NOT GROUNDED IN DOCUMENTS, RE-TRY--- +---GENERATE--- +---CHECK HALLUCINATIONS--- +---DECISION: GENERATION IS NOT GROUNDED IN DOCUMENTS, RE-TRY--- +---GENERATE--- +---CHECK HALLUCINATIONS--- +---DECISION: GENERATION IS NOT GROUNDED IN DOCUMENTS, RE-TRY--- From d39688a2a2dd5d27cb8b12feccff77531863124e Mon Sep 17 00:00:00 2001 From: Tyler Whitehouse Date: Sun, 9 Mar 2025 13:37:19 -0400 Subject: [PATCH 34/41] changed wording around internal endpoints and made specific to nvidians. 
- Modified files: README.md --- README.md | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index abf4867..3439e2f 100644 --- a/README.md +++ b/README.md @@ -18,12 +18,6 @@ This Retrieval Augmented Generation (RAG) application uses an agentic approach t #### Using the Application 1. You embed your documents (pdfs or webpages) to the vector database. -
-📝 Note About Internal Endpoints - -Some users (e.g., NVIDIA employees) may need to configure the `INTERNAL_API` environment variable to access internal endpoints. See the [Environment Variables documentation](https://docs.nvidia.com/ai-workbench/user-guide/latest/environment/variables.html) for setup instructions. -
- 2. You configure each of the separate components for the pipeline. For each component you can: * Select from a drop down of endpoints or use a self-hosted endpoint. * Modify the prompt. @@ -70,6 +64,8 @@ The quickest path is with the pre-configured build.nvidia.com endpoints. 4. Have some pdfs or web pages to put in the RAG. +5. NVIDIA Employees: Configure `INTERNAL_API` API key to use internal endpoints instead of public ones. + #### Opening the Chat From eee7b4a97cb807e2a7bce0d8a6d6a714a44f21df Mon Sep 17 00:00:00 2001 From: Tyler Whitehouse Date: Sun, 9 Mar 2025 14:35:21 -0400 Subject: [PATCH 35/41] docs: improve self-hosted endpoint documentation - Modified files: README.md - Clarify that each pipeline component can be independently configured for self-hosting - List all available components that support self-hosting (Router, Generator, Retrieval, etc.) - Emphasize ability to mix between hosted and self-hosted components - Restructure setup steps to better explain component-specific configuration - Remove outdated GPU sizing guide --- README.md | 40 ++++++++++++++-------------------------- 1 file changed, 14 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index 3439e2f..752e8f0 100644 --- a/README.md +++ b/README.md @@ -85,36 +85,24 @@ The quickest path is with the pre-configured build.nvidia.com endpoints. > **Note** This assumes you've done the **Get Started** steps. -#### Using a Self-Hosted Endpoint +#### Using Self-Hosted Endpoints -Each pipeline component can be configured to use a self-hosted endpoint. You can run your own models on your own GPUs, but you'll need to set up the endpoints first. This works best if you know how to use containers and install NVIDIA software on Ubuntu. +You can configure any or all pipeline components (Router, Generator, Retrieval, Hallucination Check, Answer Check) to use self-hosted endpoints independently. This means you can mix and match between hosted and self-hosted components based on your needs. The application includes built-in GPU compatibility checking to help you select appropriate models for your hardware configuration. -#### Prerequisites for Using a Self-Hosted Endpoint +Prerequisites: +* NVIDIA GPU(s) with appropriate VRAM +* Ubuntu 22.04 or later with latest NVIDIA drivers +* Docker and NVIDIA Container Toolkit -1. You need a remote system with a sufficient GPU that you have SSH access to. - -2. The remote should have the following installed: - * Ubuntu 22.04 or higher - * The latest NVIDIA drivers - * The latest version of Docker - * The latest version of the NVIDIA Container Toolkit - -3. - - -## Self-Hosted Sizing Guide - -| GPU VRAM | Example Hardware | Compatible? | -| -------- | ------- | ------- | -| <16 GB | RTX 3080, RTX 3500 Ada | Y | -| 16 GB | RTX 4080 16GB, RTX A4000 | Y | -| 24 GB | RTX 3090/4090, RTX A5000/5500, A10/30 | Y | -| 32 GB | RTX 5000 Ada | Y | -| 40 GB | A100-40GB | Y | -| 48 GB | RTX 6000 Ada, L40/L40S, A40 | Y | -| 80 GB | A100-80GB | Y | -| >80 GB | 8x A100-80GB | Y | +To set up NIM endpoints for your components: +1. Check the [NIM documentation](https://docs.nvidia.com/nim/large-language-models/latest/getting-started.html) for detailed setup instructions +2. For each component you want to self-host: + * Select "NIM Endpoints" in the component's configuration + * Choose your GPU type and count - the UI will automatically show only compatible models + * Enter your endpoint details (host, port) +3. 
Components not set to self-hosted will continue using their configured cloud endpoints +The application will validate your GPU configuration for each component and prevent incompatible model selections. You can use different GPU configurations for different components based on their computational needs. From 57015fd8de249f5200d1e751e4a6a89af14eba0c Mon Sep 17 00:00:00 2001 From: Tyler Whitehouse Date: Sun, 9 Mar 2025 14:39:21 -0400 Subject: [PATCH 36/41] Changed readme links - Modified files: README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 752e8f0..aa937d6 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ This Retrieval Augmented Generation (RAG) application uses an agentic approach t
-*Navigating the README:* [Application Overview](#the-agentic-rag-application) | [Get Started](#get-started) | [Deep Dive](#deep-dive) | [Self-Hosted Sizing Guide](#self-hosted-sizing-guide) | [License](#license) +*Navigating the README:* [Application Overview](#the-agentic-rag-application) | [Get Started](#get-started) | [Deep Dive](#deep-dive:-self-hosted-endpoints) | [License](#license) *Other Resources:* [:arrow_down: Download AI Workbench](https://www.nvidia.com/en-us/deep-learning-ai/solutions/data-science/workbench/) | [:book: User Guide](https://docs.nvidia.com/ai-workbench/) |[:open_file_folder: Other Projects](https://docs.nvidia.com/ai-workbench/user-guide/latest/quickstart/example-projects.html) | [:rotating_light: User Forum](https://forums.developer.nvidia.com/t/support-workbench-example-project-agentic-rag/303414) @@ -81,7 +81,7 @@ The quickest path is with the pre-configured build.nvidia.com endpoints. 6. Start chatting. -## Deep Dive +## Deep Dive: Self-Hosted Endpoints > **Note** This assumes you've done the **Get Started** steps. From 41d28cab9d7106b155cbff6ce7776ce16859e0d0 Mon Sep 17 00:00:00 2001 From: Tyler Whitehouse Date: Sun, 9 Mar 2025 14:40:34 -0400 Subject: [PATCH 37/41] Modified README.md - Modified files: README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index aa937d6..03df6a6 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ This Retrieval Augmented Generation (RAG) application uses an agentic approach t
-*Navigating the README:* [Application Overview](#the-agentic-rag-application) | [Get Started](#get-started) | [Deep Dive](#deep-dive:-self-hosted-endpoints) | [License](#license) +*Navigating the README:* [Application Overview](#the-agentic-rag-application) | [Get Started](#get-started) | [Deep Dive](#deep-dive-on-self-hosted-endpoints) | [License](#license) *Other Resources:* [:arrow_down: Download AI Workbench](https://www.nvidia.com/en-us/deep-learning-ai/solutions/data-science/workbench/) | [:book: User Guide](https://docs.nvidia.com/ai-workbench/) |[:open_file_folder: Other Projects](https://docs.nvidia.com/ai-workbench/user-guide/latest/quickstart/example-projects.html) | [:rotating_light: User Forum](https://forums.developer.nvidia.com/t/support-workbench-example-project-agentic-rag/303414) @@ -81,7 +81,7 @@ The quickest path is with the pre-configured build.nvidia.com endpoints. 6. Start chatting. -## Deep Dive: Self-Hosted Endpoints +## Deep Dive on Self-Hosted Endpoints > **Note** This assumes you've done the **Get Started** steps. From d14a9b9d94d6d9af511714b715e77773e17779ad Mon Sep 17 00:00:00 2001 From: Tyler Whitehouse Date: Sun, 9 Mar 2025 14:41:47 -0400 Subject: [PATCH 38/41] Modified README.md - Modified files: README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 03df6a6..7a42f06 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ This Retrieval Augmented Generation (RAG) application uses an agentic approach t > **Note** >This app runs in [NVIDIA AI Workbench](https://docs.nvidia.com/ai-workbench/user-guide/latest/overview/introduction.html). It's a free, lightweight developer platform that you can run on your own systems to get up and running with complex AI applications and workloads in a short amount of time. -> You may want to [**fork**](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/fork-a-repo#forking-a-repository) this repository into your own account before proceeding. Otherwise you won't be able to fully retain any changes you make because this NVIDIA owned repository is **read-only**. +> You may want to [**fork**](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/fork-a-repo#forking-a-repository) this repository into your own account before proceeding. Otherwise you won't be able to save your local changes to GitHub because this NVIDIA owned repository is **read-only**.
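Before pointing a pipeline component at a self-hosted NIM (the host and port you enter in the component configuration, per the Deep Dive section above), it can help to sanity-check the endpoint by hand. A minimal sketch, assuming the NIM exposes its usual OpenAI-compatible API; the host, port, and model name are placeholders, not values from this project:

```python
import requests

# Placeholders: replace host, port, and model with your own deployment details.
NIM_BASE = "http://your-remote-host:8000/v1"

payload = {
    "model": "meta/llama-3.1-8b-instruct",  # whichever model your NIM serves
    "messages": [{"role": "user", "content": "Reply with OK if you are reachable."}],
    "max_tokens": 16,
}

# NIM LLM microservices expose an OpenAI-compatible /chat/completions route.
resp = requests.post(f"{NIM_BASE}/chat/completions", json=payload, timeout=30)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
```

If this round-trip works from the machine running the chat app, the same host and port should work in the component's configuration.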
From 9f7ddd24642b248c4b3ecf6babb0b5c134bc7135 Mon Sep 17 00:00:00 2001
From: T-Dubb
Date: Tue, 15 Apr 2025 17:42:03 -0400
Subject: [PATCH 39/41] docs: added instructions tab to UI; modified documents tab (blurb and buttons)

- Modified files: README.md, code/chatui/pages/converse.py, code/output.log
---
 README.md | 9 +++-
 code/chatui/pages/converse.py | 45 +++++++++++++++-----
 code/output.log | 77 -----------------------------------
 3 files changed, 42 insertions(+), 89 deletions(-)

diff --git a/README.md b/README.md
index 7a42f06..006d9b0 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,5 @@
 # Overview: An Easy Button for Agentic RAG
-This Retrieval Augmented Generation (RAG) application uses an agentic approach to combine web search, hallucination controls and accuracy checks with RAG. It's easy to modify because its a simple Gradio app.
+This RAG application uses an agentic approach to combine web search, hallucination control and accuracy checks with RAG. It's easy to modify because it's a simple Gradio app.
 
 > **Note**
 >This app runs in [NVIDIA AI Workbench](https://docs.nvidia.com/ai-workbench/user-guide/latest/overview/introduction.html). It's a free, lightweight developer platform that you can run on your own systems to get up and running with complex AI applications and workloads in a short amount of time.
@@ -13,6 +13,11 @@ This Retrieval Augmented Generation (RAG) application uses an agentic approach t
 
 *Other Resources:* [:arrow_down: Download AI Workbench](https://www.nvidia.com/en-us/deep-learning-ai/solutions/data-science/workbench/) | [:book: User Guide](https://docs.nvidia.com/ai-workbench/) |[:open_file_folder: Other Projects](https://docs.nvidia.com/ai-workbench/user-guide/latest/quickstart/example-projects.html) | [:rotating_light: User Forum](https://forums.developer.nvidia.com/t/support-workbench-example-project-agentic-rag/303414)
 
+## Need, Don't Need and Nice to Have
+
+- Need: internet access because the chat app uses [Tavily](https://tavily.com/) for web searches, as well as endpoints on build.nvidia.com
+- Don't Need: Local GPU
+- Nice to Have: Remote GPU system where you self-host an endpoint
 
 ## The Agentic RAG Application
 
 #### Using the Application
@@ -22,7 +27,7 @@ This Retrieval Augmented Generation (RAG) application uses an agentic approach t
  * Select from a drop down of endpoints or use a self-hosted endpoint.
  * Modify the prompt.
 3. You submit your query.
-4. An LLM evaluates your query for relevance to the index and then routes it to the DB or to search by [Tavily](https://tavily.com/).
+4. An LLM evaluates the query's relevance to the index and then routes it to the DB or to search by [Tavily](https://tavily.com/).
 5. Answers are checked for hallucination and relevance. "Failing" answers are run through the process again.
 
 The diagram **below** shows this agentic flow.
diff --git a/code/chatui/pages/converse.py b/code/chatui/pages/converse.py
index caa2769..82db99c 100644
--- a/code/chatui/pages/converse.py
+++ b/code/chatui/pages/converse.py
@@ -161,8 +161,33 @@ def build_page(client: chat_client.ChatClient) -> gr.Blocks:
 
         with gr.Column(scale=10, min_width=350) as settings_column:
             with gr.Tabs(selected=0) as settings_tabs:
+                with gr.TabItem("Instructions", id=0) as instructions_tab:
+                    gr.Markdown(
+                    """
+                    
+                    ##### Use the Models tab to configure inference individual components
+                    - Click a component name (e.g. 
Router) to configure it + - Select an API endpoint or a self-hosted NIM (requires remote GPU) + - Customize component behavior by changing the prompts + + ##### Use the Documents tab to create a RAG context + - Webpages: Enter URLs of webpages for the context + - PDFs: Upload PDFs for the context + - Add to Context: Add documents to the context (can repeat) + - Clear Context: Resets the context to be empty + + ##### Use the Monitor tab to see the agent in action + - Actions Console: Conclusions and actions of the agent + - Response Trace: Full text of what's behind the response + + """ + ) + + + + # Settings for each component model of the agentic workflow - with gr.TabItem("Models", id=0) as agent_settings: + with gr.TabItem("Models", id=1) as agent_settings: ######################## ##### ROUTER MODEL ##### @@ -500,10 +525,10 @@ def build_page(client: chat_client.ChatClient) -> gr.Blocks: show_label=False, interactive=True) - # Second tab item is for uploading to and clearing the vector database - with gr.TabItem("Documents", id=1) as document_settings: + # Thirdtab item is for uploading to and clearing the vector database + with gr.TabItem("Documents", id=2) as document_settings: gr.Markdown("") - gr.Markdown("Upload webpages or PDF files to be stored persistently in the vector database.\n") + gr.Markdown("Embed websites and PDFs into a vector database to create a context. You can do this in multiple rounds. Context is stored until you clear it.\n") with gr.Tabs(selected=0) as document_tabs: with gr.TabItem("Webpages", id=0) as url_tab: url_docs = gr.Textbox(value="https://lilianweng.github.io/posts/2023-06-23-agent/\nhttps://lilianweng.github.io/posts/2023-03-15-prompt-engineering/\nhttps://lilianweng.github.io/posts/2023-10-25-adv-attack-llm/", @@ -513,18 +538,18 @@ def build_page(client: chat_client.ChatClient) -> gr.Blocks: interactive=True) with gr.Row(): - url_docs_upload = gr.Button(value="Upload Docs") - url_docs_clear = gr.Button(value="Clear Docs") + url_docs_upload = gr.Button(value="Add to Context") + url_docs_clear = gr.Button(value="Clear Context") with gr.TabItem("PDFs", id=1) as pdf_tab: pdf_docs_upload = gr.File(interactive=True, show_label=False, file_types=[".pdf"], file_count="multiple") - pdf_docs_clear = gr.Button(value="Clear Docs") + pdf_docs_clear = gr.Button(value="Clear Context") - # Third tab item is for the actions output console. - with gr.TabItem("Monitor", id=2) as console_settings: + # Fourth tab item is for the actions output console. + with gr.TabItem("Monitor", id=3) as console_settings: gr.Markdown("") gr.Markdown("Monitor agentic actions and view the pipeline trace of the latest response.\n") with gr.Tabs(selected=0) as console_tabs: @@ -539,7 +564,7 @@ def build_page(client: chat_client.ChatClient) -> gr.Blocks: ) # Third tab item is for collapsing the entire settings pane for readability. - with gr.TabItem("Hide All Settings", id=3) as hide_all_settings: + with gr.TabItem("Hide All Settings", id=4) as hide_all_settings: gr.Markdown("") page.load(logger.read_logs, None, logs, every=1) diff --git a/code/output.log b/code/output.log index 29e2815..3af4e0c 100644 --- a/code/output.log +++ b/code/output.log @@ -4,80 +4,3 @@ Running on local URL: http://0.0.0.0:8080 To create a public link, set `share=True` in `launch()`. IMPORTANT: You are using gradio version 4.15.0, however version 4.44.1 is available, please upgrade. 
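For orientation, the agent trace trimmed from output.log below cycles through routing, web search, generation, a hallucination check, and an answer-relevance grade, retrying when a check fails. A highly simplified, hypothetical sketch of that retry loop follows; every helper is a stand-in so the sketch runs, not the project's actual graph code.

```python
# Stand-in helpers: the real app drives each step with an LLM grader or a
# Tavily search rather than these stubs.
def web_search(q): return ["...search snippets..."]
def generate(q, docs): return "draft answer"
def grounded(ans, docs): return True      # hallucination check
def addresses(ans, q): return True        # answer-relevance grade

def answer(question: str, max_retries: int = 3) -> str:
    docs = web_search(question)           # or retrieval from the vector DB
    draft = ""
    for _ in range(max_retries):
        draft = generate(question, docs)              # ---GENERATE---
        if not grounded(draft, docs):                 # ---CHECK HALLUCINATIONS---
            continue                                  # regenerate from the same context
        if addresses(draft, question):                # ---GRADE GENERATION vs QUESTION---
            return draft
        docs = web_search(question)                   # ---WEB SEARCH--- then try again
    return draft

print(answer("hello"))
```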
-------- ----ROUTE QUESTION--- -hello -{'datasource': 'web_search'} ----ROUTE QUESTION TO WEB SEARCH--- ----WEB SEARCH--- ----GENERATE--- ----CHECK HALLUCINATIONS--- ----DECISION: GENERATION IS GROUNDED IN DOCUMENTS--- ----GRADE GENERATION vs QUESTION--- ----DECISION: GENERATION DOES NOT ADDRESS QUESTION--- ----WEB SEARCH--- ----GENERATE--- ----CHECK HALLUCINATIONS--- ----DECISION: GENERATION IS NOT GROUNDED IN DOCUMENTS, RE-TRY--- ----GENERATE--- ----CHECK HALLUCINATIONS--- ----DECISION: GENERATION IS NOT GROUNDED IN DOCUMENTS, RE-TRY--- ----GENERATE--- ----CHECK HALLUCINATIONS--- ----DECISION: GENERATION IS NOT GROUNDED IN DOCUMENTS, RE-TRY--- ----GENERATE--- ----CHECK HALLUCINATIONS--- ----DECISION: GENERATION IS GROUNDED IN DOCUMENTS--- ----GRADE GENERATION vs QUESTION--- ----DECISION: GENERATION DOES NOT ADDRESS QUESTION--- ----WEB SEARCH--- ----GENERATE--- ----CHECK HALLUCINATIONS--- ----DECISION: GENERATION IS GROUNDED IN DOCUMENTS--- ----GRADE GENERATION vs QUESTION--- ----DECISION: GENERATION DOES NOT ADDRESS QUESTION--- ----WEB SEARCH--- ----GENERATE--- ----CHECK HALLUCINATIONS--- ----DECISION: GENERATION IS NOT GROUNDED IN DOCUMENTS, RE-TRY--- ----GENERATE--- ----CHECK HALLUCINATIONS--- ----DECISION: GENERATION IS NOT GROUNDED IN DOCUMENTS, RE-TRY--- ----GENERATE--- ----CHECK HALLUCINATIONS--- ----DECISION: GENERATION IS GROUNDED IN DOCUMENTS--- ----GRADE GENERATION vs QUESTION--- ----DECISION: GENERATION DOES NOT ADDRESS QUESTION--- ----WEB SEARCH--- ----GENERATE--- ----CHECK HALLUCINATIONS--- ----DECISION: GENERATION IS GROUNDED IN DOCUMENTS--- ----GRADE GENERATION vs QUESTION--- ----DECISION: GENERATION DOES NOT ADDRESS QUESTION--- ----WEB SEARCH--- ----GENERATE--- ----CHECK HALLUCINATIONS--- ----DECISION: GENERATION IS NOT GROUNDED IN DOCUMENTS, RE-TRY--- ----GENERATE--- ----CHECK HALLUCINATIONS--- ----DECISION: GENERATION IS GROUNDED IN DOCUMENTS--- ----GRADE GENERATION vs QUESTION--- ----DECISION: GENERATION DOES NOT ADDRESS QUESTION--- ----WEB SEARCH--- ----GENERATE--- ----CHECK HALLUCINATIONS--- ----DECISION: GENERATION IS NOT GROUNDED IN DOCUMENTS, RE-TRY--- ----GENERATE--- ----CHECK HALLUCINATIONS--- ----DECISION: GENERATION IS NOT GROUNDED IN DOCUMENTS, RE-TRY--- ----GENERATE--- ----CHECK HALLUCINATIONS--- ----DECISION: GENERATION IS NOT GROUNDED IN DOCUMENTS, RE-TRY--- ----GENERATE--- ----CHECK HALLUCINATIONS--- ----DECISION: GENERATION IS NOT GROUNDED IN DOCUMENTS, RE-TRY--- ----GENERATE--- ----CHECK HALLUCINATIONS--- ----DECISION: GENERATION IS NOT GROUNDED IN DOCUMENTS, RE-TRY--- ----GENERATE--- ----CHECK HALLUCINATIONS--- ----DECISION: GENERATION IS NOT GROUNDED IN DOCUMENTS, RE-TRY--- From 65e6ba8a6bcc47bbb6e1497e7616276e1ca97d71 Mon Sep 17 00:00:00 2001 From: T-Dubb Date: Tue, 15 Apr 2025 17:42:37 -0400 Subject: [PATCH 40/41] docs: changed clear history button - Modified files: code/chatui/pages/converse.py --- code/chatui/pages/converse.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/code/chatui/pages/converse.py b/code/chatui/pages/converse.py index 82db99c..4f94f80 100644 --- a/code/chatui/pages/converse.py +++ b/code/chatui/pages/converse.py @@ -151,7 +151,7 @@ def build_page(client: chat_client.ChatClient) -> gr.Blocks: ) with gr.Column(scale=1, min_width=150): - _ = gr.ClearButton([msg, chatbot], value="Clear history") + _ = gr.ClearButton([msg, chatbot], value="Clear chat history") # Hidden column to be rendered when the user collapses all settings. 
with gr.Column(scale=1, min_width=100, visible=False) as hidden_settings_column: From f8ca62c18b9f7dc39b9d82f5ba8283ddcb236e43 Mon Sep 17 00:00:00 2001 From: T-Dubb Date: Wed, 16 Apr 2025 08:59:20 -0400 Subject: [PATCH 41/41] docs: modified model tab - Modified files: code/chatui/pages/converse.py --- code/chatui/pages/converse.py | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/code/chatui/pages/converse.py b/code/chatui/pages/converse.py index 4f94f80..9322dd5 100644 --- a/code/chatui/pages/converse.py +++ b/code/chatui/pages/converse.py @@ -165,7 +165,7 @@ def build_page(client: chat_client.ChatClient) -> gr.Blocks: gr.Markdown( """ - ##### Use the Models tab to configure inference individual components + ##### Use the Models tab to configure individual components - Click a component name (e.g. Router) to configure it - Select an API endpoint or a self-hosted NIM (requires remote GPU) - Customize component behavior by changing the prompts @@ -174,11 +174,11 @@ def build_page(client: chat_client.ChatClient) -> gr.Blocks: - Webpages: Enter URLs of webpages for the context - PDFs: Upload PDFs for the context - Add to Context: Add documents to the context (can repeat) - - Clear Context: Resets the context to be empty + - Clear Context: Resets the context to empty ##### Use the Monitor tab to see the agent in action - Actions Console: Conclusions and actions of the agent - - Response Trace: Full text of what's behind the response + - Response Trace: Full text behind the response """ ) @@ -188,7 +188,14 @@ def build_page(client: chat_client.ChatClient) -> gr.Blocks: # Settings for each component model of the agentic workflow with gr.TabItem("Models", id=1) as agent_settings: - + gr.Markdown( + """ + ##### Model Configuration + Select and configure the models for each stage of the Agentic RAG pipeline. + You can use either API-hosted models or NIM microservices. + """ + ) + ######################## ##### ROUTER MODEL ##### ######################## @@ -528,7 +535,17 @@ def build_page(client: chat_client.ChatClient) -> gr.Blocks: # Thirdtab item is for uploading to and clearing the vector database with gr.TabItem("Documents", id=2) as document_settings: gr.Markdown("") - gr.Markdown("Embed websites and PDFs into a vector database to create a context. You can do this in multiple rounds. Context is stored until you clear it.\n") + gr.Markdown( + """ + ##### Embed websites and PDFs into a vector database to create a context. + - You can do this in multiple rounds. + - Context is stored until you clear it. + + ##### URLs in Webpages are examples related to prompt engineering. + - They are **not** yet in the context + - You can replace them with your own URLs. \n + """ + ) with gr.Tabs(selected=0) as document_tabs: with gr.TabItem("Webpages", id=0) as url_tab: url_docs = gr.Textbox(value="https://lilianweng.github.io/posts/2023-06-23-agent/\nhttps://lilianweng.github.io/posts/2023-03-15-prompt-engineering/\nhttps://lilianweng.github.io/posts/2023-10-25-adv-attack-llm/",