diff --git a/.gitignore b/.gitignore index 437cb38..7c5de6b 100644 --- a/.gitignore +++ b/.gitignore @@ -54,4 +54,8 @@ cover/ # Workbench Project Layout data/scratch/* -!data/scratch/.gitkeep \ No newline at end of file +!data/scratch/.gitkeep +data/chroma.sqlite3 + +data/* +!data/.gitkeep \ No newline at end of file diff --git a/.project/configpacks b/.project/configpacks index 0b6b387..994edce 100644 --- a/.project/configpacks +++ b/.project/configpacks @@ -3,10 +3,12 @@ *cuda.CUDA *defaults.EnvVars *defaults.Readme +*defaults.CA *defaults.Entrypoint *apt.PackageManager *bash.PreLanguage *python.PipPackageManager *bash.PostBuild *jupyterlab.JupyterLab +*vs_code.VSCode *tensorboard.Tensorboard \ No newline at end of file diff --git a/.project/spec.yaml b/.project/spec.yaml index 59b0916..df8c308 100644 --- a/.project/spec.yaml +++ b/.project/spec.yaml @@ -1,137 +1,139 @@ specVersion: v2 specMinorVersion: 2 meta: - name: agentic-rag - image: project-agentic-rag - description: An example project for advanced RAG using agents - labels: [] - createdOn: "2024-07-15T21:09:46Z" - defaultBranch: main + name: agentic-rag + image: project-agentic-rag + description: An example project for advanced RAG using agents + labels: [] + createdOn: "2024-07-15T21:09:46Z" + defaultBranch: main layout: -- path: code/ - type: code - storage: git -- path: models/ - type: models - storage: gitlfs -- path: data/ - type: data - storage: gitlfs -- path: data/scratch/ - type: data - storage: gitignore + - path: code/ + type: code + storage: git + - path: models/ + type: models + storage: gitlfs + - path: data/ + type: data + storage: gitignore environment: - base: - registry: nvcr.io - image: nvidia/ai-workbench/pytorch:1.0.2 - build_timestamp: "20231102150513" - name: PyTorch - supported_architectures: [] - cuda_version: "12.2" - description: A Pytorch 2.1 Base with CUDA 12.2 - entrypoint_script: "" - labels: - - cuda12.2 - - pytorch2.1 - apps: - - name: chat - type: custom - class: webapp - start_command: cd /project/code/ && PROXY_PREFIX=$PROXY_PREFIX python3 -m chatui - health_check_command: curl -f "http://localhost:8080/" - stop_command: pkill -f "^python3 -m chatui" - user_msg: "" - logfile_path: "" - timeout_seconds: 60 - icon_url: "" - webapp_options: - autolaunch: true - port: "8080" - proxy: - trim_prefix: true - url: http://localhost:8080/ - - name: jupyterlab - type: jupyterlab - class: webapp - start_command: jupyter lab --allow-root --port 8888 --ip 0.0.0.0 --no-browser - --NotebookApp.base_url=\$PROXY_PREFIX --NotebookApp.default_url=/lab --NotebookApp.allow_origin='*' - health_check_command: '[ \$(echo url=\$(jupyter lab list | head -n 2 | tail - -n 1 | cut -f1 -d'' '' | grep -v ''Currently'' | sed "s@/?@/lab?@g") | curl - -o /dev/null -s -w ''%{http_code}'' --config -) == ''200'' ]' - stop_command: jupyter lab stop 8888 - user_msg: "" - logfile_path: "" - timeout_seconds: 60 - icon_url: "" - webapp_options: - autolaunch: true - port: "8888" - proxy: - trim_prefix: false - url_command: jupyter lab list | head -n 2 | tail -n 1 | cut -f1 -d' ' | grep - -v 'Currently' - - name: tensorboard - type: tensorboard - class: webapp - start_command: tensorboard --logdir \$TENSORBOARD_LOGS_DIRECTORY --path_prefix=\$PROXY_PREFIX - --bind_all - health_check_command: '[ \$(curl -o /dev/null -s -w ''%{http_code}'' http://localhost:\$TENSORBOARD_PORT\$PROXY_PREFIX/) - == ''200'' ]' - stop_command: pkill tensorboard - user_msg: "" - logfile_path: "" - timeout_seconds: 60 - icon_url: "" - webapp_options: - autolaunch: 
true - port: "6006" - proxy: - trim_prefix: false - url: http://localhost:6006 - programming_languages: - - python3 - icon_url: "" - image_version: 1.0.2 - os: linux - os_distro: ubuntu - os_distro_release: "22.04" - schema_version: v2 - user_info: - uid: "" - gid: "" - username: "" - package_managers: - - name: apt - binary_path: /usr/bin/apt - installed_packages: - - curl - - git - - git-lfs - - vim - - name: pip - binary_path: /usr/local/bin/pip - installed_packages: - - jupyterlab==4.0.7 - package_manager_environment: - name: "" - target: "" + base: + registry: nvcr.io + image: nvidia/ai-workbench/pytorch:1.0.2 + build_timestamp: "20231102150513" + name: PyTorch + supported_architectures: [] + cuda_version: "12.2" + description: A Pytorch 2.1 Base with CUDA 12.2 + entrypoint_script: "" + labels: + - cuda12.2 + - pytorch2.1 + apps: + - name: chat + type: custom + class: webapp + start_command: cd /project/code/ && PROXY_PREFIX=$PROXY_PREFIX python3 -m chatui + health_check_command: curl -f "http://localhost:8080/" + stop_command: pkill -f "^python3 -m chatui" + user_msg: "" + logfile_path: "" + timeout_seconds: 60 + icon_url: "" + webapp_options: + autolaunch: true + port: "8080" + proxy: + trim_prefix: true + url: http://localhost:8080/ + - name: jupyterlab + type: jupyterlab + class: webapp + start_command: jupyter lab --allow-root --port 8888 --ip 0.0.0.0 --no-browser --NotebookApp.base_url=\$PROXY_PREFIX --NotebookApp.default_url=/lab --NotebookApp.allow_origin='*' + health_check_command: '[ \$(echo url=\$(jupyter lab list | head -n 2 | tail -n 1 | cut -f1 -d'' '' | grep -v ''Currently'' | sed "s@/?@/lab?@g") | curl -o /dev/null -s -w ''%{http_code}'' --config -) == ''200'' ]' + stop_command: jupyter lab stop 8888 + user_msg: "" + logfile_path: "" + timeout_seconds: 60 + icon_url: "" + webapp_options: + autolaunch: true + port: "8888" + proxy: + trim_prefix: false + url_command: jupyter lab list | head -n 2 | tail -n 1 | cut -f1 -d' ' | grep -v 'Currently' + - name: tensorboard + type: tensorboard + class: webapp + start_command: tensorboard --logdir \$TENSORBOARD_LOGS_DIRECTORY --path_prefix=\$PROXY_PREFIX --bind_all + health_check_command: '[ \$(curl -o /dev/null -s -w ''%{http_code}'' http://localhost:\$TENSORBOARD_PORT\$PROXY_PREFIX/) == ''200'' ]' + stop_command: pkill tensorboard + user_msg: "" + logfile_path: "" + timeout_seconds: 60 + icon_url: "" + webapp_options: + autolaunch: true + port: "6006" + proxy: + trim_prefix: false + url: http://localhost:6006 + programming_languages: + - python3 + icon_url: "" + image_version: 1.0.2 + os: linux + os_distro: ubuntu + os_distro_release: "22.04" + schema_version: v2 + user_info: + uid: "" + gid: "" + username: "" + package_managers: + - name: apt + binary_path: /usr/bin/apt + installed_packages: + - curl + - git + - git-lfs + - vim + - name: pip + binary_path: /usr/local/bin/pip + installed_packages: + - jupyterlab==4.0.7 + package_manager_environment: + name: "" + target: "" + compose_file_path: "" execution: - apps: [] - resources: - gpu: - requested: 1 - sharedMemoryMB: 1024 - secrets: - - variable: NVIDIA_API_KEY - description: NVIDIA API Key for accessing the API catalog - - variable: TAVILY_API_KEY - description: Tavily Search API Key - mounts: - - type: project - target: /project/ - description: Project directory - options: rw - - type: volume - target: /data/tensorboard/logs/ - description: Tensorboard Log Files - options: volumeName=tensorboard-logs-volume + apps: + - name: Visual Studio Code + type: vs-code + class: 
native + start_command: "" + health_check_command: '[ \$(ps aux | grep ".vscode-server" | grep -v grep | wc -l ) -gt 4 ] && [ \$(ps aux | grep "/.vscode-server/bin/.*/node .* net.createConnection" | grep -v grep | wc -l) -gt 0 ]' + stop_command: "" + user_msg: "" + logfile_path: "" + timeout_seconds: 120 + icon_url: "" + resources: + gpu: + requested: 0 + sharedMemoryMB: 1024 + secrets: + - variable: NVIDIA_API_KEY + description: NVIDIA API Key for accessing the API catalog + - variable: TAVILY_API_KEY + description: Tavily Search API Key + mounts: + - type: project + target: /project/ + description: Project directory + options: rw + - type: volume + target: /data/tensorboard/logs/ + description: Tensorboard Log Files + options: volumeName=tensorboard-logs-volume diff --git a/README.md b/README.md index 415b29e..006d9b0 100644 --- a/README.md +++ b/README.md @@ -1,223 +1,115 @@ -# Table of Contents -* [Introduction](#nvidia-ai-workbench-introduction) - * [Project Description](#project-description) - * [Project Deep Dive](#project-deep-dive) - * [Sizing Guide](#sizing-guide) -* [Quickstart](#quickstart) - * [Prerequisites](#prerequisites) - * [Tutorial (Desktop App)](#tutorial-desktop-app) - * [Tutorial (CLI-Only)](#tutorial-cli-only) -* [License](#license) - -# NVIDIA AI Workbench: Introduction [![Open In AI Workbench](https://img.shields.io/badge/Open_In-AI_Workbench-76B900)](https://ngc.nvidia.com/open-ai-workbench/aHR0cHM6Ly9naXRodWIuY29tL05WSURJQS93b3JrYmVuY2gtZXhhbXBsZS1hZ2VudGljLXJhZw==) - - - +# Overview: An Easy Button for Agentic RAG +This RAG application uses an agentic approach to combine web search, hallucination control and accuracy checks with RAG. It's easy to modify because its a simple Gradio app. - -

- :arrow_down: Download AI Workbench • - :book: Read the Docs • - :open_file_folder: Explore Example Projects • - :rotating_light: Facing Issues? Let Us Know! -

- -## Project Description -This is an [NVIDIA AI Workbench](https://www.nvidia.com/en-us/deep-learning-ai/solutions/data-science/workbench/) project for developing a websearch-based [Retrieval Augmented Generation](https://blogs.nvidia.com/blog/what-is-retrieval-augmented-generation/) application with a customizable Gradio Chat app. It lets you: -* Embed your documents in the form of webpages or PDFs into a locally running Chroma vector database. -* Run inference using remotely running endpoints and microservices. - * Cloud endpoints using the [NVIDIA API Catalog](https://build.nvidia.com/explore/discover) - * Self-hosted endpoints using [NVIDIA Inference Microservices (NIMs)](https://catalog.ngc.nvidia.com/orgs/nim/teams/meta/containers/llama3-8b-instruct/tags) - * Third party self-hosted microservices like Ollama. - - - -This project uses an agentic workflow depicted in the above diagram to improve response quality in RAG. Using LangGraph, user queries will first be sorted under a RAG or Websearch pipeline depending on an LLM evaluation of the query topic. Depending on its user-configurable prompt, the router LLM can narrow its focus on turning a specific subject or topic routable to the RAG Pipeline. - -
-
- -Expand this section for a description of RAG Pipeline. - - -Under the retrieval pipeline, the user query is first compared to documents in the vector database and the most relevant documents are retrieved. - -Another LLM call evaluates the quality of the documents. If satisfactory, it proceeds to the generation phase to produce an response augmented by this relevant context. If the agent decides the best documents are irrelevant to the query, it redirects the user query to the websearch pipeline for a better quality response (see below section). - -After generation, another set of LLMs calls evaluate the response for hallucinations and accuracy. If the generation is both faithful to the retrieved context and answers the user's query in a satisfactory manner, the response is forwarded to the user and displayed. Otherwise, the agent will either regenerate the response, or redirect the query to a web search. - -
- -
- -Expand this section for a description of Websearch Pipeline. - - -Under the web search pipeline, the user query is inputted onto the web and the search results are retrieved. Using these results, a response is generated. - -After generation, a set of LLMs calls evaluate the response for hallucinations and accuracy. If the generation is both faithful to the retrieved context and answers the user's query in a satisfactory manner, the response is forwarded to the user and displayed. Otherwise, the agent will either regenerate the response, or redirect the query to another web search. - -
-
- -| :memo: Remember | -| :---------------------------| -| This project is meant as an example workflow and a starting point; you are free to add new models, rearrange the interface, or edit the source code as you see fit for your particular use case! | - -### Project Deep Dive - -
- -Expand this section for a full guide of the user-configurable project settings - - - +> **Note** +>This app runs in [NVIDIA AI Workbench](https://docs.nvidia.com/ai-workbench/user-guide/latest/overview/introduction.html). It's a free, lightweight developer platform that you can run on your own systems to get up and running with complex AI applications and workloads in a short amount of time. -When the user lands on the Chat UI application in the browser, they will see several components. On the left hand side is a standard chatbot user interface with a user input for queries (submittable with ``[ENTER]``) and a clear history button. Above this chatbot is a diagram of the agentic RAG pipeline which doubles as a progress bar indicator for any nontrivial user actions a user might take, like uploading a document. +> You may want to [**fork**](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/fork-a-repo#forking-a-repository) this repository into your own account before proceeding. Otherwise you won't be able to save your local changes to GitHub because this NVIDIA owned repository is **read-only**. -On the right hand side, users will see a collapsable settings panel with several tabs they may choose to navigate to and configure. +
-
-
- -Expand for Model Settings. - +*Navigating the README:* [Application Overview](#the-agentic-rag-application) | [Get Started](#get-started) | [Deep Dive](#deep-dive-on-self-hosted-endpoints) | [License](#license) - - -This tab holds every user-configurable setting for each of the LLM components of the agentic RAG pipeline: - -* Router -* Retrieval Grader -* Generator -* Hallucination Grader -* Answer Grader - -Expanding any such entry will yield a panel where users can specify the model they would like to use for that particular component from a dropdown (using NVIDIA API Catalog endpoints), or they can specify their own remotely running self-hosted NVIDIA NIM custom endpoint. - -Below this field is an expandable accordion where users can adjust the default prompts for that particular component's task. For example, under the Router component, users can re-write and customize their prompt to focus on only routing queries relating to LLMs and agents to the RAG pipeline and directing all other queries to the Websearch pipeline. - -
- -
- -Expand for Document Settings. - + +*Other Resources:* [:arrow_down: Download AI Workbench](https://www.nvidia.com/en-us/deep-learning-ai/solutions/data-science/workbench/) | [:book: User Guide](https://docs.nvidia.com/ai-workbench/) |[:open_file_folder: Other Projects](https://docs.nvidia.com/ai-workbench/user-guide/latest/quickstart/example-projects.html) | [:rotating_light: User Forum](https://forums.developer.nvidia.com/t/support-workbench-example-project-agentic-rag/303414) - +## Need, Don't Need and Nice to Have -This tab holds every user-configurable setting for the vector database and document ingestion aspects of this agentic RAG pipeline. Users can upload their own webpages to the vector database by entering a newline-seperated list of URLs in the textbox and clicking Upload, or they can upload their own PDF files from their local machine to be stored in the vector datastore. +- Need: internet access because the chat app uses [Tavily](https://tavily.com/) for web-searches, as well as endpoints on build.nvidia.com +- Don't Need: Local GPU +- Nice to Have: Remote GPU system where you self-host an endpoint -
+## The Agentic RAG Application
+#### Using the Application
+1. You embed your documents (PDFs or webpages) into the vector database.
+2. You configure each of the separate components of the pipeline. For each component you can:
+    * Select from a drop-down of endpoints or use a self-hosted endpoint.
+    * Modify the prompt.
+3. You submit your query.
+4. An LLM evaluates the query's relevance to the index and then routes it either to the vector DB or to a web search by [Tavily](https://tavily.com/).
+5. Answers are checked for hallucination and relevance; "failing" answers are run through the process again (see the sketch below).
-
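+The routing-and-checking loop in steps 4 and 5 is built with LangGraph, which this change already imports (`from langgraph.graph import END, StateGraph` in `code/chatui/pages/converse.py`). The sketch below is a minimal illustration of that shape only; the node and grader functions are stubbed placeholders, and the document-grading step is omitted for brevity (the real app calls Chroma, Tavily, and the LLM graders).
+
+```python
+from typing import List
+from typing_extensions import TypedDict
+from langgraph.graph import END, StateGraph
+
+class State(TypedDict):
+    question: str
+    documents: List[str]
+    generation: str
+
+# Stub nodes -- the real app queries the vector DB, Tavily, and LLM graders here.
+def retrieve(state):   return {"documents": ["<chunks from the Chroma index>"]}
+def websearch(state):  return {"documents": ["<results from Tavily>"]}
+def generate(state):   return {"generation": "<LLM answer grounded in documents>"}
+
+def route_question(state):
+    # Router LLM: send on-topic questions to RAG, everything else to web search.
+    return "vectorstore" if "agent" in state["question"].lower() else "websearch"
+
+def grade_generation(state):
+    # Hallucination + answer graders; "not supported" / "not useful" would loop back.
+    return "useful"
+
+workflow = StateGraph(State)
+workflow.add_node("retrieve", retrieve)
+workflow.add_node("websearch", websearch)
+workflow.add_node("generate", generate)
+workflow.set_conditional_entry_point(route_question,
+                                     {"vectorstore": "retrieve", "websearch": "websearch"})
+workflow.add_edge("retrieve", "generate")
+workflow.add_edge("websearch", "generate")
+workflow.add_conditional_edges("generate", grade_generation,
+                               {"useful": END, "not useful": "websearch", "not supported": "generate"})
+
+app = workflow.compile()
+print(app.invoke({"question": "What is an LLM agent?"}))
+```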
- -Expand for Monitoring Settings. - +The diagram **below** shows this agentic flow. + + - -This tab holds the agentic RAG monitoring tools built into this application. +#### Modifying the Application -* The first tool is a console that logs all the actions the agent has decided to take when processing the user query and provides a general overview into the agent's decision making. -* The second tool is an in-depth trace of the agent's actions for the last submitted query, which gives more detail into the context retrieved, websearch documents found, LLM pipeline components used, etc. when generating out the most recent response. +* Directly within the app you can: + * Change the prompts for the different components, e.g. the hallucination grader. + * Change the webpages and pdfs you want to use for the context in the RAG. + * Select different endpoints from [build.nvidia.com](https://build.nvidia.com/explore/discover) for the inference components. + * Configure it to use self-hosted endpoints with [NVIDIA Inference Microservices (NIMs)](https://catalog.ngc.nvidia.com/orgs/nim/teams/meta/containers/llama3-8b-instruct/tags) or [Ollama](https://hub.docker.com/r/ollama/ollama). +* You can also modify the application code to: + * Add new endpoints and endpoint providers + * Change the Gradio interface or the application structure and logic. -
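+For the "add new endpoints" bullet above, the sketch below shows the pattern `code/chatui/pages/converse.py` uses for API-catalog models: a model identifier goes into the dropdown list, and each component looks up a prompt for whichever model is selected. This is a simplified, hypothetical sketch; `NEW_MODEL` and the prompt strings are placeholders, not part of the project.
+
+```python
+# Hypothetical sketch; the identifiers mirror the pattern in converse.py, the values are placeholders.
+LLAMA = "meta/llama3-70b-instruct"
+MISTRAL = "mistralai/mixtral-8x22b-instruct-v0.1"
+NEW_MODEL = "provider/new-model-id"              # 1. add the API-catalog model id
+
+model_list = [LLAMA, MISTRAL, NEW_MODEL]         # 2. expose it in the component dropdowns
+
+ROUTER_PROMPTS = {                               # 3. map it to a prompt per component
+    LLAMA: "llama-style router prompt ...",
+    MISTRAL: "mixtral-style router prompt ...",
+    NEW_MODEL: "router prompt tuned for the new model ...",
+}
+
+def router_prompt_for(selected_model: str) -> str:
+    # converse.py does this per component with match/case (_toggle_model_router, etc.).
+    return ROUTER_PROMPTS.get(selected_model, ROUTER_PROMPTS[LLAMA])
+```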
-
+> **Note** Setting up self-hosted endpoints is relatively advanced because you will need to do it manually. -
+## Get Started -## Sizing Guide +The quickest path is with the pre-configured build.nvidia.com endpoints. -| GPU VRAM | Example Hardware | Compatible? | -| -------- | ------- | ------- | -| <16 GB | RTX 3080, RTX 3500 Ada | Y | -| 16 GB | RTX 4080 16GB, RTX A4000 | Y | -| 24 GB | RTX 3090/4090, RTX A5000/5500, A10/30 | Y | -| 32 GB | RTX 5000 Ada | Y | -| 40 GB | A100-40GB | Y | -| 48 GB | RTX 6000 Ada, L40/L40S, A40 | Y | -| 80 GB | A100-80GB | Y | -| >80 GB | 8x A100-80GB | Y | +#### Prerequisites for Using Pre-configured Endpoints -# Quickstart +1. Install [AI Workbench](https://docs.nvidia.com/ai-workbench/user-guide/latest/installation/overview.html). -## Prerequisites -AI Workbench will prompt you to provide a few pieces of information before running any apps in this project. Ensure you have this information ready. - - * An NVIDIA API Key. You can generate one under ``Get API Key`` on any API Catalog [model card](https://build.nvidia.com/mistralai/mistral-7b-instruct-v2) - * A Tavily Search API Key. You can generate one under a free account (1000 searches/month) [here](https://app.tavily.com/home). +2. Get an NVIDIA Developer Account and an API key. + * Go to [build.nvidia.com](https://build.nvidia.com/) and click `Login`. + * Create account, verify email. + * Make a Cloud Account. + * Click your initial > `API Keys`. + * Create and save your key. -## Tutorial (Desktop App) +3. Get a Tavily account and an API key. + * Go to [Tavily](https://tavily.com/) and create an account. + * Create an API key on the overview page. + +4. Have some pdfs or web pages to put in the RAG. -If you do not NVIDIA AI Workbench installed, first complete the installation for AI Workbench [here](https://www.nvidia.com/en-us/deep-learning-ai/solutions/data-science/workbench/). Then, +5. NVIDIA Employees: Configure `INTERNAL_API` API key to use internal endpoints instead of public ones. -1. Fork this Project to your own GitHub namespace and copy the link - ``` - https://github.com/[your_namespace]/ - ``` - -2. Open NVIDIA AI Workbench. Select a location to work in. +#### Opening the Chat -3. Clone this Project onto your desired machine by selecting **Clone Project** and providing the GitHub link. +1. Open NVIDIA AI Workbench. Select a [location to work in](https://docs.nvidia.com/ai-workbench/user-guide/latest/locations/locations.html). -4. Wait for the project to build. You can expand the bottom **Building** indicator to view real-time build logs. +2. Use the repository URL to clone this project with AI Workbench and wait for it to build. -5. When the build completes, set the following configurations. +3. Add your NVIDIA API key and the Tavily API key when prompted. - * `Environment` → `Secrets` → `Configure`. Specify the NVIDIA API Key and Tavily Search Key as project secrets. +4. Open the **Chat** from Workbench. It should automatically open in a new browser tab. -6. On the top right of the window, select **Chat**. A frontend user interface should automatically open in a new browser tab. Happy chatting! +5. Upload your documents and change the Router prompt to focus on your uploaded documents. -7. **Note:** When doing RAG, make sure you (1) upload the document AND (2) Change the Router prompt to focus on the topic of your uploaded documents. Both changes are required for successful RAG! +6. Start chatting. -## Tutorial (CLI-Only) -Some users may choose to use the **CLI tool only** instead of the Desktop App. 
If you do not NVIDIA AI Workbench installed, first complete the installation for AI Workbench [here](https://www.nvidia.com/en-us/deep-learning-ai/solutions/data-science/workbench/). Then, -1. Fork this Project to your own GitHub namespace and copying the link +## Deep Dive on Self-Hosted Endpoints - ``` - https://github.com/[your_namespace]/ - ``` - -2. Open a shell and activating the Context you want to clone into by +> **Note** This assumes you've done the **Get Started** steps. - ``` - $ nvwb list contexts - - $ nvwb activate - ``` +#### Using Self-Hosted Endpoints - | :bulb: Tip | - | :---------------------------| - | Use ```nvwb help``` to see a full list of AI Workbench commands. | - -3. Clone this Project onto your desired machine by running - - ``` - $ nvwb clone project - ``` - -4. Open the Project by - - ``` - $ nvwb list projects - - $ nvwb open - ``` +You can configure any or all pipeline components (Router, Generator, Retrieval, Hallucination Check, Answer Check) to use self-hosted endpoints independently. This means you can mix and match between hosted and self-hosted components based on your needs. The application includes built-in GPU compatibility checking to help you select appropriate models for your hardware configuration. -5. Start **Chat** by +Prerequisites: +* NVIDIA GPU(s) with appropriate VRAM +* Ubuntu 22.04 or later with latest NVIDIA drivers +* Docker and NVIDIA Container Toolkit - ``` - $ nvwb start chat - ``` +To set up NIM endpoints for your components: +1. Check the [NIM documentation](https://docs.nvidia.com/nim/large-language-models/latest/getting-started.html) for detailed setup instructions +2. For each component you want to self-host: + * Select "NIM Endpoints" in the component's configuration + * Choose your GPU type and count - the UI will automatically show only compatible models + * Enter your endpoint details (host, port) +3. Components not set to self-hosted will continue using their configured cloud endpoints - * Specify the NVIDIA API Key and Tavily Search Key as project secrets. +The application will validate your GPU configuration for each component and prevent incompatible model selections. You can use different GPU configurations for different components based on their computational needs. -6. A frontend user interface should automatically open in a new browser tab. Happy chatting! -7. **Note:** When doing RAG, make sure you (1) upload the document AND (2) Change the Router prompt to focus on the topic of your uploaded documents. Both changes are required for successful RAG! 
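+The GPU compatibility check described in the deep dive above is driven by `code/chatui/utils/gpu_compatibility.py`, which reads a support matrix from `nim_gpu_support_matrix.json` (resolved relative to the module, i.e. under `code/`). That JSON file is not part of this diff, so the sketch below only infers its shape from how the helper indexes it (`matrix[gpu_type][num_gpus]["models"]`); the GPU names and model lists are illustrative, not the shipped values.
+
+```python
+import json
+from chatui.utils import gpu_compatibility  # helper added in this change
+
+# Inferred shape of nim_gpu_support_matrix.json -- example values only.
+example_matrix = {
+    "H100": {
+        "1": {"models": ["meta/llama3-8b-instruct"]},
+        "2": {"models": ["meta/llama3-8b-instruct", "meta/llama3-70b-instruct"]},
+    },
+    "A100 80GB": {
+        "1": {"models": ["meta/llama3-8b-instruct"]},
+    },
+}
+print(json.dumps(example_matrix, indent=2))
+
+# How the UI consumes the matrix:
+print(gpu_compatibility.get_gpu_types())                    # GPU Type dropdown
+print(gpu_compatibility.get_supported_gpu_counts("H100"))   # GPU count dropdown
+print(gpu_compatibility.get_compatible_models("H100", "2")) # model list + optional warning
+```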
# License This NVIDIA AI Workbench example project is under the [Apache 2.0 License](https://github.com/NVIDIA/workbench-example-agentic-rag/blob/main/LICENSE.txt) diff --git a/code/chatui/pages/converse.py b/code/chatui/pages/converse.py index 0c4a3fa..9322dd5 100644 --- a/code/chatui/pages/converse.py +++ b/code/chatui/pages/converse.py @@ -24,10 +24,25 @@ import subprocess import time import sys +import json + +INTERNAL_API = os.getenv('INTERNAL_API', '') + +# Model identifiers with prefix +LLAMA = "meta/llama3-70b-instruct" +MISTRAL = "mistralai/mixtral-8x22b-instruct-v0.1" + + + +if INTERNAL_API != '': + LLAMA = f'{INTERNAL_API}/meta/llama-3.1-70b-instruct' + MISTRAL = f'{INTERNAL_API}/mistralai/mixtral-8x22b-instruct-v0.1' + + from chatui import assets, chat_client from chatui.prompts import prompts_llama3, prompts_mistral -from chatui.utils import compile, database, logger +from chatui.utils import compile, database, logger, gpu_compatibility from langgraph.graph import END, StateGraph @@ -93,8 +108,7 @@ def build_page(client: chat_client.ChatClient) -> gr.Blocks: """ List of currently supported models. """ - model_list = ["meta/llama3-70b-instruct", - "mistralai/mixtral-8x22b-instruct-v0.1"] + model_list = [LLAMA, MISTRAL] with gr.Blocks(title=TITLE, theme=kui_theme, css=kui_styles + _LOCAL_CSS) as page: gr.Markdown(f"# {TITLE}") @@ -137,7 +151,7 @@ def build_page(client: chat_client.ChatClient) -> gr.Blocks: ) with gr.Column(scale=1, min_width=150): - _ = gr.ClearButton([msg, chatbot], value="Clear history") + _ = gr.ClearButton([msg, chatbot], value="Clear chat history") # Hidden column to be rendered when the user collapses all settings. with gr.Column(scale=1, min_width=100, visible=False) as hidden_settings_column: @@ -147,9 +161,41 @@ def build_page(client: chat_client.ChatClient) -> gr.Blocks: with gr.Column(scale=10, min_width=350) as settings_column: with gr.Tabs(selected=0) as settings_tabs: + with gr.TabItem("Instructions", id=0) as instructions_tab: + gr.Markdown( + """ + + ##### Use the Models tab to configure individual components + - Click a component name (e.g. Router) to configure it + - Select an API endpoint or a self-hosted NIM (requires remote GPU) + - Customize component behavior by changing the prompts + + ##### Use the Documents tab to create a RAG context + - Webpages: Enter URLs of webpages for the context + - PDFs: Upload PDFs for the context + - Add to Context: Add documents to the context (can repeat) + - Clear Context: Resets the context to empty + + ##### Use the Monitor tab to see the agent in action + - Actions Console: Conclusions and actions of the agent + - Response Trace: Full text behind the response + + """ + ) + + + + # Settings for each component model of the agentic workflow - with gr.TabItem("Models", id=0) as agent_settings: - + with gr.TabItem("Models", id=1) as agent_settings: + gr.Markdown( + """ + ##### Model Configuration + Select and configure the models for each stage of the Agentic RAG pipeline. + You can use either API-hosted models or NIM microservices. 
+ """ + ) + ######################## ##### ROUTER MODEL ##### ######################## @@ -165,19 +211,48 @@ def build_page(client: chat_client.ChatClient) -> gr.Blocks: with gr.TabItem("NIM Endpoints", id=1) as router_nim: with gr.Row(): - nim_router_ip = gr.Textbox(placeholder = "10.123.45.678", - label = "Microservice Host", - info = "IP Address running the microservice", - elem_id="rag-inputs", scale=2) - nim_router_port = gr.Textbox(placeholder = "8000", - label = "Port", - info = "Optional, (default: 8000)", - elem_id="rag-inputs", scale=1) + nim_router_gpu_type = gr.Dropdown( + choices=gpu_compatibility.get_gpu_types(), + label="GPU Type", + info="Select your GPU type", + elem_id="rag-inputs", + scale=2 + ) + nim_router_gpu_count = gr.Dropdown( + choices=[], + label="Number of GPUs", + info="Select number of GPUs", + elem_id="rag-inputs", + scale=1, + interactive=False + ) + + with gr.Row(): + nim_router_ip = gr.Textbox( + placeholder="10.123.45.678", + label="Microservice Host", + info="IP Address running the microservice", + elem_id="rag-inputs", + scale=2 + ) + nim_router_port = gr.Textbox( + placeholder="8000", + label="Port", + info="Optional, (default: 8000)", + elem_id="rag-inputs", + scale=1 + ) - nim_router_id = gr.Textbox(placeholder = "meta/llama3-8b-instruct", - label = "Model running in microservice.", - info = "If none specified, defaults to: meta/llama3-8b-instruct", - elem_id="rag-inputs") + nim_router_id = gr.Dropdown( + choices=[], + label="Model running in microservice", + info="Select a compatible model for your GPU configuration", + elem_id="rag-inputs", + interactive=False + ) + + # Add warning box for compatibility issues + nim_router_warning = gr.Markdown(visible=False, value="") with gr.TabItem("Hide", id=2) as router_hide: gr.Markdown("") @@ -203,19 +278,48 @@ def build_page(client: chat_client.ChatClient) -> gr.Blocks: interactive=True) with gr.TabItem("NIM Endpoints", id=1) as retrieval_nim: with gr.Row(): - nim_retrieval_ip = gr.Textbox(placeholder = "10.123.45.678", - label = "Microservice Host", - info = "IP Address running the microservice", - elem_id="rag-inputs", scale=2) - nim_retrieval_port = gr.Textbox(placeholder = "8000", - label = "Port", - info = "Optional, (default: 8000)", - elem_id="rag-inputs", scale=1) + nim_retrieval_gpu_type = gr.Dropdown( + choices=gpu_compatibility.get_gpu_types(), + label="GPU Type", + info="Select your GPU type", + elem_id="rag-inputs", + scale=2 + ) + nim_retrieval_gpu_count = gr.Dropdown( + choices=[], + label="Number of GPUs", + info="Select number of GPUs", + elem_id="rag-inputs", + scale=1, + interactive=False + ) + + with gr.Row(): + nim_retrieval_ip = gr.Textbox( + placeholder="10.123.45.678", + label="Microservice Host", + info="IP Address running the microservice", + elem_id="rag-inputs", + scale=2 + ) + nim_retrieval_port = gr.Textbox( + placeholder="8000", + label="Port", + info="Optional, (default: 8000)", + elem_id="rag-inputs", + scale=1 + ) - nim_retrieval_id = gr.Textbox(placeholder = "meta/llama3-8b-instruct", - label = "Model running in microservice.", - info = "If none specified, defaults to: meta/llama3-8b-instruct", - elem_id="rag-inputs") + nim_retrieval_id = gr.Dropdown( + choices=[], + label="Model running in microservice", + info="Select a compatible model for your GPU configuration", + elem_id="rag-inputs", + interactive=False + ) + + # Add warning box for compatibility issues + nim_retrieval_warning = gr.Markdown(visible=False, value="") with gr.TabItem("Hide", id=2) as retrieval_hide: 
gr.Markdown("") @@ -241,19 +345,48 @@ def build_page(client: chat_client.ChatClient) -> gr.Blocks: interactive=True) with gr.TabItem("NIM Endpoints", id=1) as generator_nim: with gr.Row(): - nim_generator_ip = gr.Textbox(placeholder = "10.123.45.678", - label = "Microservice Host", - info = "IP Address running the microservice", - elem_id="rag-inputs", scale=2) - nim_generator_port = gr.Textbox(placeholder = "8000", - label = "Port", - info = "Optional, (default: 8000)", - elem_id="rag-inputs", scale=1) + nim_generator_gpu_type = gr.Dropdown( + choices=gpu_compatibility.get_gpu_types(), + label="GPU Type", + info="Select your GPU type", + elem_id="rag-inputs", + scale=2 + ) + nim_generator_gpu_count = gr.Dropdown( + choices=[], + label="Number of GPUs", + info="Select number of GPUs", + elem_id="rag-inputs", + scale=1, + interactive=False + ) + + with gr.Row(): + nim_generator_ip = gr.Textbox( + placeholder="10.123.45.678", + label="Microservice Host", + info="IP Address running the microservice", + elem_id="rag-inputs", + scale=2 + ) + nim_generator_port = gr.Textbox( + placeholder="8000", + label="Port", + info="Optional, (default: 8000)", + elem_id="rag-inputs", + scale=1 + ) - nim_generator_id = gr.Textbox(placeholder = "meta/llama3-8b-instruct", - label = "Model running in microservice.", - info = "If none specified, defaults to: meta/llama3-8b-instruct", - elem_id="rag-inputs") + nim_generator_id = gr.Dropdown( + choices=[], + label="Model running in microservice", + info="Select a compatible model for your GPU configuration", + elem_id="rag-inputs", + interactive=False + ) + + # Add warning box for compatibility issues + nim_generator_warning = gr.Markdown(visible=False, value="") with gr.TabItem("Hide", id=2) as generator_hide: gr.Markdown("") @@ -279,19 +412,48 @@ def build_page(client: chat_client.ChatClient) -> gr.Blocks: interactive=True) with gr.TabItem("NIM Endpoints", id=1) as hallucination_nim: with gr.Row(): - nim_hallucination_ip = gr.Textbox(placeholder = "10.123.45.678", - label = "Microservice Host", - info = "IP Address running the microservice", - elem_id="rag-inputs", scale=2) - nim_hallucination_port = gr.Textbox(placeholder = "8000", - label = "Port", - info = "Optional, (default: 8000)", - elem_id="rag-inputs", scale=1) + nim_hallucination_gpu_type = gr.Dropdown( + choices=gpu_compatibility.get_gpu_types(), + label="GPU Type", + info="Select your GPU type", + elem_id="rag-inputs", + scale=2 + ) + nim_hallucination_gpu_count = gr.Dropdown( + choices=[], + label="Number of GPUs", + info="Select number of GPUs", + elem_id="rag-inputs", + scale=1, + interactive=False + ) - nim_hallucination_id = gr.Textbox(placeholder = "meta/llama3-8b-instruct", - label = "Model running in microservice.", - info = "If none specified, defaults to: meta/llama3-8b-instruct", - elem_id="rag-inputs") + with gr.Row(): + nim_hallucination_ip = gr.Textbox( + placeholder="10.123.45.678", + label="Microservice Host", + info="IP Address running the microservice", + elem_id="rag-inputs", + scale=2 + ) + nim_hallucination_port = gr.Textbox( + placeholder="8000", + label="Port", + info="Optional, (default: 8000)", + elem_id="rag-inputs", + scale=1 + ) + + nim_hallucination_id = gr.Dropdown( + choices=[], + label="Model running in microservice", + info="Select a compatible model for your GPU configuration", + elem_id="rag-inputs", + interactive=False + ) + + # Add warning box for compatibility issues + nim_hallucination_warning = gr.Markdown(visible=False, value="") with gr.TabItem("Hide", id=2) 
as hallucination_hide: gr.Markdown("") @@ -317,19 +479,48 @@ def build_page(client: chat_client.ChatClient) -> gr.Blocks: interactive=True) with gr.TabItem("NIM Endpoints", id=1) as answer_nim: with gr.Row(): - nim_answer_ip = gr.Textbox(placeholder = "10.123.45.678", - label = "Microservice Host", - info = "IP Address running the microservice", - elem_id="rag-inputs", scale=2) - nim_answer_port = gr.Textbox(placeholder = "8000", - label = "Port", - info = "Optional, (default: 8000)", - elem_id="rag-inputs", scale=1) + nim_answer_gpu_type = gr.Dropdown( + choices=gpu_compatibility.get_gpu_types(), + label="GPU Type", + info="Select your GPU type", + elem_id="rag-inputs", + scale=2 + ) + nim_answer_gpu_count = gr.Dropdown( + choices=[], + label="Number of GPUs", + info="Select number of GPUs", + elem_id="rag-inputs", + scale=1, + interactive=False + ) - nim_answer_id = gr.Textbox(placeholder = "meta/llama3-8b-instruct", - label = "Model running in microservice.", - info = "If none specified, defaults to: meta/llama3-8b-instruct", - elem_id="rag-inputs") + with gr.Row(): + nim_answer_ip = gr.Textbox( + placeholder="10.123.45.678", + label="Microservice Host", + info="IP Address running the microservice", + elem_id="rag-inputs", + scale=2 + ) + nim_answer_port = gr.Textbox( + placeholder="8000", + label="Port", + info="Optional, (default: 8000)", + elem_id="rag-inputs", + scale=1 + ) + + nim_answer_id = gr.Dropdown( + choices=[], + label="Model running in microservice", + info="Select a compatible model for your GPU configuration", + elem_id="rag-inputs", + interactive=False + ) + + # Add warning box for compatibility issues + nim_answer_warning = gr.Markdown(visible=False, value="") with gr.TabItem("Hide", id=2) as answer_hide: gr.Markdown("") @@ -341,10 +532,20 @@ def build_page(client: chat_client.ChatClient) -> gr.Blocks: show_label=False, interactive=True) - # Second tab item is for uploading to and clearing the vector database - with gr.TabItem("Documents", id=1) as document_settings: + # Thirdtab item is for uploading to and clearing the vector database + with gr.TabItem("Documents", id=2) as document_settings: gr.Markdown("") - gr.Markdown("Upload webpages or PDF files to be stored persistently in the vector database.\n") + gr.Markdown( + """ + ##### Embed websites and PDFs into a vector database to create a context. + - You can do this in multiple rounds. + - Context is stored until you clear it. + + ##### URLs in Webpages are examples related to prompt engineering. + - They are **not** yet in the context + - You can replace them with your own URLs. \n + """ + ) with gr.Tabs(selected=0) as document_tabs: with gr.TabItem("Webpages", id=0) as url_tab: url_docs = gr.Textbox(value="https://lilianweng.github.io/posts/2023-06-23-agent/\nhttps://lilianweng.github.io/posts/2023-03-15-prompt-engineering/\nhttps://lilianweng.github.io/posts/2023-10-25-adv-attack-llm/", @@ -354,18 +555,18 @@ def build_page(client: chat_client.ChatClient) -> gr.Blocks: interactive=True) with gr.Row(): - url_docs_upload = gr.Button(value="Upload Docs") - url_docs_clear = gr.Button(value="Clear Docs") + url_docs_upload = gr.Button(value="Add to Context") + url_docs_clear = gr.Button(value="Clear Context") with gr.TabItem("PDFs", id=1) as pdf_tab: pdf_docs_upload = gr.File(interactive=True, show_label=False, file_types=[".pdf"], file_count="multiple") - pdf_docs_clear = gr.Button(value="Clear Docs") + pdf_docs_clear = gr.Button(value="Clear Context") - # Third tab item is for the actions output console. 
- with gr.TabItem("Monitor", id=2) as console_settings: + # Fourth tab item is for the actions output console. + with gr.TabItem("Monitor", id=3) as console_settings: gr.Markdown("") gr.Markdown("Monitor agentic actions and view the pipeline trace of the latest response.\n") with gr.Tabs(selected=0) as console_tabs: @@ -380,7 +581,7 @@ def build_page(client: chat_client.ChatClient) -> gr.Blocks: ) # Third tab item is for collapsing the entire settings pane for readability. - with gr.TabItem("Hide All Settings", id=3) as hide_all_settings: + with gr.TabItem("Hide All Settings", id=4) as hide_all_settings: gr.Markdown("") page.load(logger.read_logs, None, logs, every=1) @@ -430,91 +631,204 @@ def _toggle_hide_answer(): """ These helper functions set state and prompts when either the NIM or API Endpoint tabs are selected. """ - def _toggle_router_endpoints(api_model: str, nim_model: str, evt: gr.EventData): - if (evt._data['value'] == "NIM Endpoints") and ("llama3" in nim_model or len(nim_model) == 0): - value = prompts_llama3.router_prompt - elif (evt._data['value'] == "NIM Endpoints") and ("mistral" in nim_model or "mixtral" in nim_model): - value = prompts_mistral.router_prompt - elif (evt._data['value'] == "API Endpoints") and ("llama3" in api_model): - value = prompts_llama3.router_prompt - elif (evt._data['value'] == "API Endpoints") and ("mistral" in api_model or "mixtral" in api_model): - value = prompts_mistral.router_prompt - return True if evt._data['value'] == "NIM Endpoints" else False, gr.update(value=value) if value is not None else gr.update(visible=True) - - def _toggle_retrieval_endpoints(api_model: str, nim_model: str, evt: gr.EventData): - if (evt._data['value'] == "NIM Endpoints") and ("llama3" in nim_model or len(nim_model) == 0): - value = prompts_llama3.retrieval_prompt - elif (evt._data['value'] == "NIM Endpoints") and ("mistral" in nim_model or "mixtral" in nim_model): - value = prompts_mistral.retrieval_prompt - elif (evt._data['value'] == "API Endpoints") and ("llama3" in api_model): - value = prompts_llama3.retrieval_prompt - elif (evt._data['value'] == "API Endpoints") and ("mistral" in api_model or "mixtral" in api_model): - value = prompts_mistral.retrieval_prompt - return True if evt._data['value'] == "NIM Endpoints" else False, gr.update(value=value) if value is not None else gr.update(visible=True) - - def _toggle_generator_endpoints(api_model: str, nim_model: str, evt: gr.EventData): - if (evt._data['value'] == "NIM Endpoints") and ("llama3" in nim_model or len(nim_model) == 0): - value = prompts_llama3.generator_prompt - elif (evt._data['value'] == "NIM Endpoints") and ("mistral" in nim_model or "mixtral" in nim_model): - value = prompts_mistral.generator_prompt - elif (evt._data['value'] == "API Endpoints") and ("llama3" in api_model): - value = prompts_llama3.generator_prompt - elif (evt._data['value'] == "API Endpoints") and ("mistral" in api_model or "mixtral" in api_model): - value = prompts_mistral.generator_prompt - return True if evt._data['value'] == "NIM Endpoints" else False, gr.update(value=value) if value is not None else gr.update(visible=True) - - def _toggle_hallucination_endpoints(api_model: str, nim_model: str, evt: gr.EventData): - if (evt._data['value'] == "NIM Endpoints") and ("llama3" in nim_model or len(nim_model) == 0): - value = prompts_llama3.hallucination_prompt - elif (evt._data['value'] == "NIM Endpoints") and ("mistral" in nim_model or "mixtral" in nim_model): - value = prompts_mistral.hallucination_prompt - elif 
(evt._data['value'] == "API Endpoints") and ("llama3" in api_model): - value = prompts_llama3.hallucination_prompt - elif (evt._data['value'] == "API Endpoints") and ("mistral" in api_model or "mixtral" in api_model): - value = prompts_mistral.hallucination_prompt - return True if evt._data['value'] == "NIM Endpoints" else False, gr.update(value=value) if value is not None else gr.update(visible=True) - - def _toggle_answer_endpoints(api_model: str, nim_model: str, evt: gr.EventData): - if (evt._data['value'] == "NIM Endpoints") and ("llama3" in nim_model or len(nim_model) == 0): - value = prompts_llama3.answer_prompt - elif (evt._data['value'] == "NIM Endpoints") and ("mistral" in nim_model or "mixtral" in nim_model): - value = prompts_mistral.answer_prompt - elif (evt._data['value'] == "API Endpoints") and ("llama3" in api_model): - value = prompts_llama3.answer_prompt - elif (evt._data['value'] == "API Endpoints") and ("mistral" in api_model or "mixtral" in api_model): - value = prompts_mistral.answer_prompt - return True if evt._data['value'] == "NIM Endpoints" else False, gr.update(value=value) if value is not None else gr.update(visible=True) - - router_api.select(_toggle_router_endpoints, [model_router, nim_router_id], [router_use_nim, prompt_router]) - router_nim.select(_toggle_router_endpoints, [model_router, nim_router_id], [router_use_nim, prompt_router]) - retrieval_api.select(_toggle_retrieval_endpoints, [model_retrieval, nim_retrieval_id], [retrieval_use_nim, prompt_retrieval]) - retrieval_nim.select(_toggle_retrieval_endpoints, [model_retrieval, nim_retrieval_id], [retrieval_use_nim, prompt_retrieval]) - generator_api.select(_toggle_generator_endpoints, [model_generator, nim_generator_id], [generator_use_nim, prompt_generator]) - generator_nim.select(_toggle_generator_endpoints, [model_generator, nim_generator_id], [generator_use_nim, prompt_generator]) - hallucination_api.select(_toggle_hallucination_endpoints, [model_hallucination, nim_hallucination_id], [hallucination_use_nim, prompt_hallucination]) - hallucination_nim.select(_toggle_hallucination_endpoints, [model_hallucination, nim_hallucination_id], [hallucination_use_nim, prompt_hallucination]) - answer_api.select(_toggle_answer_endpoints, [model_answer, nim_answer_id], [answer_use_nim, prompt_answer]) - answer_nim.select(_toggle_answer_endpoints, [model_answer, nim_answer_id], [answer_use_nim, prompt_answer]) + def _update_gpu_counts(component: str, gpu_type: str): + """Update the available GPU counts for selected GPU type.""" + counts = gpu_compatibility.get_supported_gpu_counts(gpu_type) + components = { + "router": [nim_router_gpu_count, nim_router_id, nim_router_warning], + "retrieval": [nim_retrieval_gpu_count, nim_retrieval_id, nim_retrieval_warning], + "generator": [nim_generator_gpu_count, nim_generator_id, nim_generator_warning], + "hallucination": [nim_hallucination_gpu_count, nim_hallucination_id, nim_hallucination_warning], + "answer": [nim_answer_gpu_count, nim_answer_id, nim_answer_warning] + } + return { + components[component][0]: gr.update(choices=counts, value=None, interactive=True), + components[component][1]: gr.update(choices=[], value=None, interactive=False), + components[component][2]: gr.update(visible=False, value="") + } - """ These helper functions hide and show the right-hand settings panel when toggled. 
""" + def _update_compatible_models(component: str, gpu_type: str, num_gpus: str): + """Update the compatible models list based on GPU configuration.""" + if not gpu_type or not num_gpus: + components = { + "router": [nim_router_id, nim_router_warning], + "retrieval": [nim_retrieval_id, nim_retrieval_warning], + "generator": [nim_generator_id, nim_generator_warning], + "hallucination": [nim_hallucination_id, nim_hallucination_warning], + "answer": [nim_answer_id, nim_answer_warning] + } + return { + components[component][0]: gr.update(choices=[], value=None, interactive=False), + components[component][1]: gr.update(visible=False, value="") + } + + compatibility = gpu_compatibility.get_compatible_models(gpu_type, num_gpus) + + if compatibility["warning_message"]: + components = { + "router": [nim_router_id, nim_router_warning], + "retrieval": [nim_retrieval_id, nim_retrieval_warning], + "generator": [nim_generator_id, nim_generator_warning], + "hallucination": [nim_hallucination_id, nim_hallucination_warning], + "answer": [nim_answer_id, nim_answer_warning] + } + return { + components[component][0]: gr.update(choices=[], value=None, interactive=False), + components[component][1]: gr.update(visible=True, value=f"⚠️ {compatibility['warning_message']}") + } + + components = { + "router": [nim_router_id, nim_router_warning], + "retrieval": [nim_retrieval_id, nim_retrieval_warning], + "generator": [nim_generator_id, nim_generator_warning], + "hallucination": [nim_hallucination_id, nim_hallucination_warning], + "answer": [nim_answer_id, nim_answer_warning] + } + return { + components[component][0]: gr.update( + choices=compatibility["compatible_models"], + value=compatibility["compatible_models"][0] if compatibility["compatible_models"] else None, + interactive=True + ), + components[component][1]: gr.update(visible=False, value="") + } + + # Add the event handlers for all components + nim_router_gpu_type.change(lambda x: _update_gpu_counts("router", x), nim_router_gpu_type, + [nim_router_gpu_count, nim_router_id, nim_router_warning]) + nim_router_gpu_count.change(lambda x, y: _update_compatible_models("router", x, y), + [nim_router_gpu_type, nim_router_gpu_count], + [nim_router_id, nim_router_warning]) + + nim_retrieval_gpu_type.change(lambda x: _update_gpu_counts("retrieval", x), nim_retrieval_gpu_type, + [nim_retrieval_gpu_count, nim_retrieval_id, nim_retrieval_warning]) + nim_retrieval_gpu_count.change(lambda x, y: _update_compatible_models("retrieval", x, y), + [nim_retrieval_gpu_type, nim_retrieval_gpu_count], + [nim_retrieval_id, nim_retrieval_warning]) + + nim_generator_gpu_type.change(lambda x: _update_gpu_counts("generator", x), nim_generator_gpu_type, + [nim_generator_gpu_count, nim_generator_id, nim_generator_warning]) + nim_generator_gpu_count.change(lambda x, y: _update_compatible_models("generator", x, y), + [nim_generator_gpu_type, nim_generator_gpu_count], + [nim_generator_id, nim_generator_warning]) + + nim_hallucination_gpu_type.change(lambda x: _update_gpu_counts("hallucination", x), nim_hallucination_gpu_type, + [nim_hallucination_gpu_count, nim_hallucination_id, nim_hallucination_warning]) + nim_hallucination_gpu_count.change(lambda x, y: _update_compatible_models("hallucination", x, y), + [nim_hallucination_gpu_type, nim_hallucination_gpu_count], + [nim_hallucination_id, nim_hallucination_warning]) + + nim_answer_gpu_type.change(lambda x: _update_gpu_counts("answer", x), nim_answer_gpu_type, + [nim_answer_gpu_count, nim_answer_id, nim_answer_warning]) + 
nim_answer_gpu_count.change(lambda x, y: _update_compatible_models("answer", x, y), + [nim_answer_gpu_type, nim_answer_gpu_count], + [nim_answer_id, nim_answer_warning]) + + """ These helper functions track the API Endpoint selected and regenerates the prompt accordingly. """ + + def _toggle_model_router(selected_model: str): + match selected_model: + case str() if selected_model == LLAMA: + return gr.update(value=prompts_llama3.router_prompt) + case str() if selected_model == MISTRAL: + return gr.update(value=prompts_mistral.router_prompt) + case _: + return gr.update(value=prompts_llama3.router_prompt) - def _toggle_hide_all_settings(): + def _toggle_model_retrieval(selected_model: str): + match selected_model: + case str() if selected_model == LLAMA: + return gr.update(value=prompts_llama3.retrieval_prompt) + case str() if selected_model == MISTRAL: + return gr.update(value=prompts_mistral.retrieval_prompt) + case _: + return gr.update(value=prompts_llama3.retrieval_prompt) + + def _toggle_model_generator(selected_model: str): + match selected_model: + case str() if selected_model == LLAMA: + return gr.update(value=prompts_llama3.generator_prompt) + case str() if selected_model == MISTRAL: + return gr.update(value=prompts_mistral.generator_prompt) + case _: + return gr.update(value=prompts_llama3.generator_prompt) + + def _toggle_model_hallucination(selected_model: str): + match selected_model: + case str() if selected_model == LLAMA: + return gr.update(value=prompts_llama3.hallucination_prompt) + case str() if selected_model == MISTRAL: + return gr.update(value=prompts_mistral.hallucination_prompt) + case _: + return gr.update(value=prompts_llama3.hallucination_prompt) + + def _toggle_model_answer(selected_model: str): + match selected_model: + case str() if selected_model == LLAMA: + return gr.update(value=prompts_llama3.answer_prompt) + case str() if selected_model == MISTRAL: + return gr.update(value=prompts_mistral.answer_prompt) + case _: + return gr.update(value=prompts_llama3.answer_prompt) + + model_router.change(_toggle_model_router, [model_router], [prompt_router]) + model_retrieval.change(_toggle_model_retrieval, [model_retrieval], [prompt_retrieval]) + model_generator.change(_toggle_model_generator, [model_generator], [prompt_generator]) + model_hallucination.change(_toggle_model_hallucination, [model_hallucination], [prompt_hallucination]) + model_answer.change(_toggle_model_answer, [model_answer], [prompt_answer]) + + """ These helper functions upload and clear the documents and webpages to/from the ChromaDB. 
""" + + def _upload_documents_pdf(files, progress=gr.Progress()): + progress(0.25, desc="Initializing Task") + time.sleep(0.75) + progress(0.5, desc="Uploading Docs") + database.upload_pdf(files) + progress(0.75, desc="Cleaning Up") + time.sleep(0.75) return { - settings_column: gr.update(visible=False), - hidden_settings_column: gr.update(visible=True), + url_docs_clear: gr.update(value="Clear Docs", variant="secondary", interactive=True), + pdf_docs_clear: gr.update(value="Clear Docs", variant="secondary", interactive=True), + agentic_flow: gr.update(visible=True), } - def _toggle_show_all_settings(): + def _upload_documents(docs: str, progress=gr.Progress()): + progress(0.2, desc="Initializing Task") + time.sleep(0.75) + progress(0.4, desc="Processing URL List") + docs_list = docs.splitlines() + progress(0.6, desc="Uploading Docs") + database.upload(docs_list) + progress(0.8, desc="Cleaning Up") + time.sleep(0.75) return { - settings_column: gr.update(visible=True), - settings_tabs: gr.update(selected=0), - hidden_settings_column: gr.update(visible=False), + url_docs_upload: gr.update(value="Docs Uploaded", variant="primary", interactive=False), + url_docs_clear: gr.update(value="Clear Docs", variant="secondary", interactive=True), + pdf_docs_clear: gr.update(value="Clear Docs", variant="secondary", interactive=True), + agentic_flow: gr.update(visible=True), } - hide_all_settings.select(_toggle_hide_all_settings, None, [settings_column, hidden_settings_column]) - show_settings.click(_toggle_show_all_settings, None, [settings_column, settings_tabs, hidden_settings_column]) - - """ This helper function ensures the model settings are reset when a user re-navigates to the tab. """ + def _clear_documents(progress=gr.Progress()): + progress(0.25, desc="Initializing Task") + time.sleep(0.75) + progress(0.5, desc="Clearing Database") + database.clear() + progress(0.75, desc="Cleaning Up") + time.sleep(0.75) + return { + url_docs_upload: gr.update(value="Upload Docs", variant="secondary", interactive=True), + url_docs_clear: gr.update(value="Docs Cleared", variant="primary", interactive=False), + pdf_docs_upload: gr.update(value=None), + pdf_docs_clear: gr.update(value="Docs Cleared", variant="primary", interactive=False), + agentic_flow: gr.update(visible=True), + } + + url_docs_upload.click(_upload_documents, [url_docs], [url_docs_upload, url_docs_clear, pdf_docs_clear, agentic_flow]) + url_docs_clear.click(_clear_documents, [], [url_docs_upload, url_docs_clear, pdf_docs_upload, pdf_docs_clear, agentic_flow]) + pdf_docs_upload.upload(_upload_documents_pdf, [pdf_docs_upload], [url_docs_clear, pdf_docs_clear, agentic_flow]) + pdf_docs_clear.click(_clear_documents, [], [url_docs_upload, url_docs_clear, pdf_docs_upload, pdf_docs_clear, agentic_flow]) + + """ These helper functions set state and prompts when either the NIM or API Endpoint tabs are selected. """ def _toggle_model_tab(): return { @@ -627,110 +941,6 @@ def _toggle_model(btn: str): hallucination_btn, answer_btn]) - """ These helper functions track the API Endpoint selected and regenerates the prompt accordingly. 
""" - - def _toggle_model_router(selected_model: str): - match selected_model: - case "meta/llama3-70b-instruct": - return gr.update(value=prompts_llama3.router_prompt) - case "mistralai/mixtral-8x22b-instruct-v0.1": - return gr.update(value=prompts_mistral.router_prompt) - case _: - return gr.update(value=prompts_llama3.router_prompt) - - def _toggle_model_retrieval(selected_model: str): - match selected_model: - case "meta/llama3-70b-instruct": - return gr.update(value=prompts_llama3.retrieval_prompt) - case "mistralai/mixtral-8x22b-instruct-v0.1": - return gr.update(value=prompts_mistral.retrieval_prompt) - case _: - return gr.update(value=prompts_llama3.retrieval_prompt) - - def _toggle_model_generator(selected_model: str): - match selected_model: - case "meta/llama3-70b-instruct": - return gr.update(value=prompts_llama3.generator_prompt) - case "mistralai/mixtral-8x22b-instruct-v0.1": - return gr.update(value=prompts_mistral.generator_prompt) - case _: - return gr.update(value=prompts_llama3.generator_prompt) - - def _toggle_model_hallucination(selected_model: str): - match selected_model: - case "meta/llama3-70b-instruct": - return gr.update(value=prompts_llama3.hallucination_prompt) - case "mistralai/mixtral-8x22b-instruct-v0.1": - return gr.update(value=prompts_mistral.hallucination_prompt) - case _: - return gr.update(value=prompts_llama3.hallucination_prompt) - - def _toggle_model_answer(selected_model: str): - match selected_model: - case "meta/llama3-70b-instruct": - return gr.update(value=prompts_llama3.answer_prompt) - case "mistralai/mixtral-8x22b-instruct-v0.1": - return gr.update(value=prompts_mistral.answer_prompt) - case _: - return gr.update(value=prompts_llama3.answer_prompt) - - model_router.change(_toggle_model_router, [model_router], [prompt_router]) - model_retrieval.change(_toggle_model_retrieval, [model_retrieval], [prompt_retrieval]) - model_generator.change(_toggle_model_generator, [model_generator], [prompt_generator]) - model_hallucination.change(_toggle_model_hallucination, [model_hallucination], [prompt_hallucination]) - model_answer.change(_toggle_model_answer, [model_answer], [prompt_answer]) - - """ These helper functions upload and clear the documents and webpages to/from the ChromaDB. 
""" - - def _upload_documents_pdf(files, progress=gr.Progress()): - progress(0.25, desc="Initializing Task") - time.sleep(0.75) - progress(0.5, desc="Uploading Docs") - database.upload_pdf(files) - progress(0.75, desc="Cleaning Up") - time.sleep(0.75) - return { - url_docs_clear: gr.update(value="Clear Docs", variant="secondary", interactive=True), - pdf_docs_clear: gr.update(value="Clear Docs", variant="secondary", interactive=True), - agentic_flow: gr.update(visible=True), - } - - def _upload_documents(docs: str, progress=gr.Progress()): - progress(0.2, desc="Initializing Task") - time.sleep(0.75) - progress(0.4, desc="Processing URL List") - docs_list = docs.splitlines() - progress(0.6, desc="Uploading Docs") - database.upload(docs_list) - progress(0.8, desc="Cleaning Up") - time.sleep(0.75) - return { - url_docs_upload: gr.update(value="Docs Uploaded", variant="primary", interactive=False), - url_docs_clear: gr.update(value="Clear Docs", variant="secondary", interactive=True), - pdf_docs_clear: gr.update(value="Clear Docs", variant="secondary", interactive=True), - agentic_flow: gr.update(visible=True), - } - - def _clear_documents(progress=gr.Progress()): - progress(0.25, desc="Initializing Task") - time.sleep(0.75) - progress(0.5, desc="Clearing Database") - database.clear() - progress(0.75, desc="Cleaning Up") - time.sleep(0.75) - return { - url_docs_upload: gr.update(value="Upload Docs", variant="secondary", interactive=True), - url_docs_clear: gr.update(value="Docs Cleared", variant="primary", interactive=False), - pdf_docs_upload: gr.update(value=None), - pdf_docs_clear: gr.update(value="Docs Cleared", variant="primary", interactive=False), - agentic_flow: gr.update(visible=True), - } - - url_docs_upload.click(_upload_documents, [url_docs], [url_docs_upload, url_docs_clear, pdf_docs_clear, agentic_flow]) - url_docs_clear.click(_clear_documents, [], [url_docs_upload, url_docs_clear, pdf_docs_upload, pdf_docs_clear, agentic_flow]) - pdf_docs_upload.upload(_upload_documents_pdf, [pdf_docs_upload], [url_docs_clear, pdf_docs_clear, agentic_flow]) - pdf_docs_clear.click(_clear_documents, [], [url_docs_upload, url_docs_clear, pdf_docs_upload, pdf_docs_clear, agentic_flow]) - """ This helper function builds out the submission function call when a user submits a query. """ _my_build_stream = functools.partial(_stream_predict, client, app) @@ -861,3 +1071,13 @@ def _stream_predict( yield "", chat_history + [[question, final_value["generation"]]], gr.update(show_label=False) except Exception as e: yield "", chat_history + [[question, "*** ERR: Unable to process query. Check the Monitor tab for details. 
***\n\nException: " + str(e)]], gr.update(show_label=False) + +_support_matrix_cache = None + +def load_gpu_support_matrix() -> Dict: + global _support_matrix_cache + if _support_matrix_cache is None: + matrix_path = os.path.join(os.path.dirname(__file__), '..', '..', 'nim_gpu_support_matrix.json') + with open(matrix_path, 'r') as f: + _support_matrix_cache = json.load(f) + return _support_matrix_cache diff --git a/code/chatui/utils/database.py b/code/chatui/utils/database.py index c018276..7260db2 100644 --- a/code/chatui/utils/database.py +++ b/code/chatui/utils/database.py @@ -19,10 +19,22 @@ from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings from typing import Any, Dict, List, Tuple, Union +import os +INTERNAL_API = os.getenv('INTERNAL_API', '') + +# Set the embeddings model target +EMBEDDINGS_MODEL = 'NV-Embed-QA' + +# +if INTERNAL_API != '': + EMBEDDINGS_MODEL = 'nvdev/nvidia/nv-embedqa-e5-v5' + +# Download nltk data import nltk nltk.download("punkt") nltk.download("averaged_perceptron_tagger") + def upload(urls: List[str]): """ This is a helper function for parsing the user inputted URLs and uploading them into the vector store. """ docs = [WebBaseLoader(url).load() for url in urls] @@ -37,7 +49,7 @@ def upload(urls: List[str]): vectorstore = Chroma.from_documents( documents=doc_splits, collection_name="rag-chroma", - embedding=NVIDIAEmbeddings(model='NV-Embed-QA'), + embedding=NVIDIAEmbeddings(model=EMBEDDINGS_MODEL), persist_directory="/project/data", ) return vectorstore @@ -56,7 +68,7 @@ def upload_pdf(documents: List[str]): vectorstore = Chroma.from_documents( documents=doc_splits, collection_name="rag-chroma", - embedding=NVIDIAEmbeddings(model='NV-Embed-QA'), + embedding=NVIDIAEmbeddings(model=EMBEDDINGS_MODEL), persist_directory="/project/data", ) return vectorstore @@ -65,7 +77,7 @@ def clear(): """ This is a helper function for emptying the collection the vector store. """ vectorstore = Chroma( collection_name="rag-chroma", - embedding_function=NVIDIAEmbeddings(model='NV-Embed-QA'), + embedding_function=NVIDIAEmbeddings(model=EMBEDDINGS_MODEL), persist_directory="/project/data", ) @@ -76,7 +88,7 @@ def get_retriever(): """ This is a helper function for returning the retriever object of the vector store. """ vectorstore = Chroma( collection_name="rag-chroma", - embedding_function=NVIDIAEmbeddings(model='NV-Embed-QA'), + embedding_function=NVIDIAEmbeddings(model=EMBEDDINGS_MODEL), persist_directory="/project/data", ) retriever = vectorstore.as_retriever() diff --git a/code/chatui/utils/gpu_compatibility.py b/code/chatui/utils/gpu_compatibility.py new file mode 100644 index 0000000..d03a0e9 --- /dev/null +++ b/code/chatui/utils/gpu_compatibility.py @@ -0,0 +1,66 @@ +"""Utility module for GPU compatibility checking.""" + +import json +import os +from typing import Dict, List, Optional, TypedDict + +class GPUConfig(TypedDict): + """Type definition for GPU configuration.""" + gpu_type: str + num_gpus: int + +class ModelCompatibility(TypedDict): + """Type definition for model compatibility results.""" + compatible_models: List[str] + warning_message: Optional[str] + +def load_gpu_support_matrix() -> Dict: + """Load the GPU support matrix from JSON file.""" + matrix_path = os.path.join(os.path.dirname(__file__), '..', '..', 'nim_gpu_support_matrix.json') + with open(matrix_path, 'r') as f: + return json.load(f) + +def get_compatible_models(gpu_type: str, num_gpus: str) -> ModelCompatibility: + """ + Get list of compatible models for given GPU configuration. 
+ + Args: + gpu_type: Type of GPU (e.g. "H100", "A100 80GB") + num_gpus: Number of GPUs as string (e.g. "1", "2", "4", "8", "16") + + Returns: + ModelCompatibility with list of compatible models and optional warning + """ + support_matrix = load_gpu_support_matrix() + + # Validate inputs + if gpu_type not in support_matrix: + return ModelCompatibility( + compatible_models=[], + warning_message=f"GPU type {gpu_type} not found in support matrix" + ) + + if num_gpus not in support_matrix[gpu_type]: + return ModelCompatibility( + compatible_models=[], + warning_message=f"Configuration with {num_gpus} GPUs not supported for {gpu_type}" + ) + + # Get compatible models + models = support_matrix[gpu_type][num_gpus]["models"] + + return ModelCompatibility( + compatible_models=models, + warning_message=None if models else f"No compatible models found for {gpu_type} with {num_gpus} GPUs" + ) + +def get_gpu_types() -> List[str]: + """Get list of supported GPU types.""" + return list(load_gpu_support_matrix().keys()) + +def get_supported_gpu_counts(gpu_type: str) -> List[str]: + """Get list of supported GPU counts for a given GPU type.""" + support_matrix = load_gpu_support_matrix() + if gpu_type not in support_matrix: + return [] + return list(support_matrix[gpu_type].keys()) \ No newline at end of file diff --git a/code/chatui/utils/graph.py b/code/chatui/utils/graph.py index 36cb3e7..2d90260 100644 --- a/code/chatui/utils/graph.py +++ b/code/chatui/utils/graph.py @@ -16,7 +16,7 @@ import os from typing_extensions import TypedDict -from typing import List +from typing import List, Optional from langchain.prompts import PromptTemplate from langchain_core.output_parsers import StrOutputParser, JsonOutputParser @@ -73,6 +73,16 @@ class GraphState(TypedDict): nim_retrieval_id: str nim_hallucination_id: str nim_answer_id: str + nim_generator_gpu_type: Optional[str] + nim_generator_gpu_count: Optional[str] + nim_router_gpu_type: Optional[str] + nim_router_gpu_count: Optional[str] + nim_retrieval_gpu_type: Optional[str] + nim_retrieval_gpu_count: Optional[str] + nim_hallucination_gpu_type: Optional[str] + nim_hallucination_gpu_count: Optional[str] + nim_answer_gpu_type: Optional[str] + nim_answer_gpu_count: Optional[str] from langchain.schema import Document @@ -121,6 +131,8 @@ def generate(state): llm = nim.CustomChatOpenAI(custom_endpoint=state["nim_generator_ip"], port=state["nim_generator_port"] if len(state["nim_generator_port"]) > 0 else "8000", model_name=state["nim_generator_id"] if len(state["nim_generator_id"]) > 0 else "meta/llama3-8b-instruct", + gpu_type=state["nim_generator_gpu_type"] if "nim_generator_gpu_type" in state else None, + gpu_count=state["nim_generator_gpu_count"] if "nim_generator_gpu_count" in state else None, temperature=0.7) if state["generator_use_nim"] else ChatNVIDIA(model=state["generator_model_id"], temperature=0.7) rag_chain = prompt | llm | StrOutputParser() generation = rag_chain.invoke({"context": documents, "question": question}) @@ -153,6 +165,8 @@ def grade_documents(state): llm = nim.CustomChatOpenAI(custom_endpoint=state["nim_retrieval_ip"], port=state["nim_retrieval_port"] if len(state["nim_retrieval_port"]) > 0 else "8000", model_name=state["nim_retrieval_id"] if len(state["nim_retrieval_id"]) > 0 else "meta/llama3-8b-instruct", + gpu_type=state["nim_retrieval_gpu_type"] if "nim_retrieval_gpu_type" in state else None, + gpu_count=state["nim_retrieval_gpu_count"] if "nim_retrieval_gpu_count" in state else None, temperature=0.7) if state["retrieval_use_nim"] else 
ChatNVIDIA(model=state["retrieval_model_id"], temperature=0) retrieval_grader = prompt | llm | JsonOutputParser() for d in documents: @@ -225,6 +239,8 @@ def route_question(state): llm = nim.CustomChatOpenAI(custom_endpoint=state["nim_router_ip"], port=state["nim_router_port"] if len(state["nim_router_port"]) > 0 else "8000", model_name=state["nim_router_id"] if len(state["nim_router_id"]) > 0 else "meta/llama3-8b-instruct", + gpu_type=state["nim_router_gpu_type"] if "nim_router_gpu_type" in state else None, + gpu_count=state["nim_router_gpu_count"] if "nim_router_gpu_count" in state else None, temperature=0.7) if state["router_use_nim"] else ChatNVIDIA(model=state["router_model_id"], temperature=0) question_router = prompt | llm | JsonOutputParser() source = question_router.invoke({"question": question}) @@ -292,6 +308,8 @@ def grade_generation_v_documents_and_question(state): llm = nim.CustomChatOpenAI(custom_endpoint=state["nim_hallucination_ip"], port=state["nim_hallucination_port"] if len(state["nim_hallucination_port"]) > 0 else "8000", model_name=state["nim_hallucination_id"] if len(state["nim_hallucination_id"]) > 0 else "meta/llama3-8b-instruct", + gpu_type=state["nim_hallucination_gpu_type"] if "nim_hallucination_gpu_type" in state else None, + gpu_count=state["nim_hallucination_gpu_count"] if "nim_hallucination_gpu_count" in state else None, temperature=0.7) if state["hallucination_use_nim"] else ChatNVIDIA(model=state["hallucination_model_id"], temperature=0) hallucination_grader = prompt | llm | JsonOutputParser() @@ -308,6 +326,8 @@ def grade_generation_v_documents_and_question(state): llm = nim.CustomChatOpenAI(custom_endpoint=state["nim_answer_ip"], port=state["nim_answer_port"] if len(state["nim_answer_port"]) > 0 else "8000", model_name=state["nim_answer_id"] if len(state["nim_answer_id"]) > 0 else "meta/llama3-8b-instruct", + gpu_type=state["nim_answer_gpu_type"] if "nim_answer_gpu_type" in state else None, + gpu_count=state["nim_answer_gpu_count"] if "nim_answer_gpu_count" in state else None, temperature=0.7) if state["answer_use_nim"] else ChatNVIDIA(model=state["answer_model_id"], temperature=0) answer_grader = prompt | llm | JsonOutputParser() diff --git a/code/chatui/utils/nim.py b/code/chatui/utils/nim.py index 69bf3d0..dcb2091 100644 --- a/code/chatui/utils/nim.py +++ b/code/chatui/utils/nim.py @@ -18,6 +18,8 @@ from langchain_core.load.dump import dumps from pydantic import Field from typing import List, Mapping, Optional, Any +from chatui.utils import gpu_compatibility +import os class CustomChatOpenAI(BaseChatModel): """ This is a custom built class for using LangChain to chat with custom OpenAI API-compatible endpoints, eg. NIMs. 
""" @@ -26,13 +28,24 @@ class CustomChatOpenAI(BaseChatModel): port: Optional[str] = "8000" model_name: Optional[str] = "meta/llama3-8b-instruct" temperature: Optional[float] = 0.0 + gpu_type: Optional[str] = None + gpu_count: Optional[str] = None - def __init__(self, custom_endpoint, port="8000", model_name="meta/llama3-8b-instruct", temperature=0.0, **kwargs): + def __init__(self, custom_endpoint, port="8000", model_name="meta/llama3-8b-instruct", + gpu_type=None, gpu_count=None, temperature=0.0, **kwargs): super().__init__(**kwargs) + if gpu_type and gpu_count: + compatibility = gpu_compatibility.get_compatible_models(gpu_type, gpu_count) + if compatibility["warning_message"]: + raise ValueError(compatibility["warning_message"]) + if model_name not in compatibility["compatible_models"]: + raise ValueError(f"Model {model_name} is not compatible with {gpu_type} ({gpu_count} GPUs)") self.custom_endpoint = custom_endpoint self.port = port self.model_name = model_name self.temperature = temperature + self.gpu_type = gpu_type + self.gpu_count = gpu_count @property def _llm_type(self) -> str: @@ -45,18 +58,31 @@ def _generate(self, messages, stop=None, run_manager=None, **kwargs): def _call_custom_endpoint(self, messages, **kwargs): import openai import json - - openai.api_key = "xyz" - openai.base_url = "http://" + self.custom_endpoint + ":" + self.port + "/v1/" - + + openai.api_key = os.getenv("OPENAI_API_KEY", "xyz") # Better API key handling + openai.base_url = f"http://{self.custom_endpoint}:{self.port}/v1/" + obj = json.loads(dumps(messages)) - response = openai.chat.completions.create( - model=self.model_name, - messages=[{"role": "user", "content": obj[0]["kwargs"]["content"]}], - temperature=self.temperature, - ) - return response + config = { + "model": self.model_name, + "messages": [{"role": "user", "content": obj[0]["kwargs"]["content"]}], + "temperature": self.temperature, + } + + if self.gpu_type and self.gpu_count: + config["gpu_config"] = { + "type": self.gpu_type, + "count": self.gpu_count + } + + try: + response = openai.chat.completions.create(**config) + return response + except Exception as e: + if self.gpu_type and self.gpu_count: + raise ValueError(f"Error with GPU configuration ({self.gpu_type}, {self.gpu_count} GPUs): {str(e)}") + raise e def _create_chat_result(self, response): from langchain_core.messages import ChatMessage diff --git a/code/nim_disk_size.json b/code/nim_disk_size.json new file mode 100644 index 0000000..8de0ae0 --- /dev/null +++ b/code/nim_disk_size.json @@ -0,0 +1,288 @@ +{ + "codellama-13b-instruct": { + "disk_space": { + "H100": { + "fp16": { + "throughput": 24.63, + "latency": 25.32 + } + }, + "A100 80GB": { + "fp16": { + "throughput": 24.63, + "latency": 25.31 + } + }, + "A100 40GB": { + "fp16": { + "throughput": 24.63, + "latency": 25.31 + } + }, + "L40S": { + "fp16": { + "throughput": 25.32, + "latency": 24.63 + } + }, + "A10G": { + "fp16": { + "throughput": 25.32, + "latency": 26.69 + } + } + } + }, + "codellama-34b-instruct": { + "disk_space": { + "H100": { + "fp8": { + "throughput": 32.17, + "latency": 32.42 + }, + "fp16": { + "throughput": 63.48, + "latency": 64.59 + } + }, + "A100 80GB": { + "fp16": { + "throughput": 63.48, + "latency": 64.59 + } + }, + "A100 40GB": { + "fp16": { + "throughput": 63.48, + "latency": 64.59 + } + }, + "L40S": { + "fp8": { + "throughput": 32.42 + }, + "fp16": { + "throughput": 64.58 + } + }, + "A10G": { + "fp16": { + "throughput": 64.58, + "latency": 66.8 + } + } + } + }, + "codellama-70b": { + 
"disk_space": { + "H100": { + "fp8": { + "throughput": 65.47, + "latency": 66.37 + }, + "fp16": { + "throughput": 130.35, + "latency": 66.37 + } + }, + "A100 80GB": { + "fp16": { + "throughput": 130.35, + "latency": 132.71 + } + }, + "A100 40GB": { + "fp16": { + "throughput": 130.35, + "latency": 132.71 + } + }, + "A10G": { + "fp16": { + "throughput": 132.69 + } + } + } + }, + "deepseek-r1-distill-llama-8b-rtx": { + "disk_space": { + "RTX 6000 Ada": { + "int4_awq": { + "throughput": 5.42 + } + }, + "RTX 5090": { + "int4_awq": { + "throughput": 5.42 + } + }, + "RTX 5080": { + "int4_awq": { + "throughput": 5.42 + } + }, + "RTX 4090": { + "int4_awq": { + "throughput": 5.42 + } + }, + "RTX 4080": { + "int4_awq": { + "throughput": 5.42 + } + } + } + }, + "phind-codellama-34b-v2-instruct": { + "disk_space": { + "H100": { + "fp8": { + "throughput": 32.17, + "latency": 32.41 + }, + "fp16": { + "throughput": 63.48, + "latency": 64.59 + } + }, + "A100 80GB": { + "fp16": { + "throughput": 63.48, + "latency": 64.59 + } + }, + "A100 40GB": { + "fp16": { + "throughput": 63.48, + "latency": 64.59 + } + }, + "L40S": { + "fp8": { + "throughput": 32.43 + }, + "fp16": { + "throughput": 64.58 + } + }, + "A10G": { + "fp16": { + "latency": 66.8 + } + } + } + }, + "mixtral-8x7b-instruct-v0.1": { + "disk_space": { + "H100": { + "fp8": { + "throughput": 65.47, + "latency": 66.37 + }, + "fp16": { + "throughput": 130.35, + "latency": 132.71 + } + }, + "A100 80GB": { + "fp16": { + "throughput": 130.35, + "latency": 132.71 + } + }, + "A100 40GB": { + "fp16": { + "throughput": 130.35, + "latency": 132.71 + } + } + } + }, + "mixtral-8x22b-instruct-v0.1": { + "disk_space": { + "H100": { + "fp8": { + "throughput": 130.94, + "latency": 132.74 + }, + "fp16": { + "throughput": 260.7, + "latency": 265.42 + } + }, + "A100 80GB": { + "fp16": { + "throughput": 260.7, + "latency": 265.42 + } + }, + "A100 40GB": { + "fp16": { + "throughput": 260.7, + "latency": 265.42 + } + } + } + }, + "llama-3.1-8b-instruct": { + "disk_space": { + "H100": { + "fp8": { + "throughput": 3.8 + }, + "fp16": { + "throughput": 7.14 + } + }, + "A100 80GB": { + "fp16": { + "throughput": 7.14 + } + }, + "A100 40GB": { + "fp16": { + "throughput": 7.14 + } + }, + "L40S": { + "fp8": { + "throughput": 3.8 + }, + "fp16": { + "throughput": 7.14 + } + }, + "A10G": { + "fp16": { + "throughput": 7.14 + } + } + } + }, + "llama-3.1-70b-instruct": { + "disk_space": { + "H100": { + "fp8": { + "throughput": 65.47, + "latency": 66.37 + }, + "fp16": { + "throughput": 130.35, + "latency": 132.71 + } + }, + "A100 80GB": { + "fp16": { + "throughput": 130.35, + "latency": 132.71 + } + }, + "A100 40GB": { + "fp16": { + "throughput": 130.35, + "latency": 132.71 + } + } + } + } +} diff --git a/code/nim_gpu_support_matrix.json b/code/nim_gpu_support_matrix.json new file mode 100644 index 0000000..730119d --- /dev/null +++ b/code/nim_gpu_support_matrix.json @@ -0,0 +1,675 @@ +{ + "H100": { + "1": { + "models": [ + "llama-3.1-8b-instruct", + "llama3-8b-instruct", + "mistral-7b-instruct-v0.3", + "mistral-nemo-12b-instruct", + "gemma-2-9b-it", + "phi-3-mini-4k-instruct", + "llama-3.1-swallow-8b-instruct-v0.1", + "qwen2.5-7b-instruct" + ] + }, + "2": { + "models": [ + "llama-3.3-70b-instruct", + "llama-3.1-70b-instruct", + "llama3-70b-instruct", + "codellama-70b", + "mixtral-8x7b-instruct-v0.1", + "llama-3.1-nemotron-70b-instruct", + "llama-3.1-swallow-70b-instruct-v0.1", + "llama-3-swallow-70b-instruct-v0.1", + "llama-3-taiwan-70b-instruct", + "llama-3.1-8b-instruct", + 
"llama3-8b-instruct", + "mistral-7b-instruct-v0.3", + "mistral-nemo-12b-instruct", + "gemma-2-9b-it", + "phi-3-mini-4k-instruct", + "llama-3.1-swallow-8b-instruct-v0.1", + "qwen2.5-7b-instruct" + ] + }, + "4": { + "models": [ + "mixtral-8x22b-instruct-v0.1", + "llama-3.3-70b-instruct", + "llama-3.1-70b-instruct", + "llama3-70b-instruct", + "codellama-70b", + "mixtral-8x7b-instruct-v0.1", + "llama-3.1-nemotron-70b-instruct", + "llama-3.1-swallow-70b-instruct-v0.1", + "llama-3-swallow-70b-instruct-v0.1", + "llama-3-taiwan-70b-instruct", + "llama-3.1-8b-instruct", + "llama3-8b-instruct", + "mistral-7b-instruct-v0.3", + "mistral-nemo-12b-instruct", + "gemma-2-9b-it", + "phi-3-mini-4k-instruct", + "llama-3.1-swallow-8b-instruct-v0.1", + "qwen2.5-7b-instruct" + ] + }, + "8": { + "models": [ + "llama-3.1-405b-instruct", + "mixtral-8x22b-instruct-v0.1", + "llama-3.3-70b-instruct", + "llama-3.1-70b-instruct", + "llama3-70b-instruct", + "codellama-70b", + "mixtral-8x7b-instruct-v0.1", + "llama-3.1-nemotron-70b-instruct", + "llama-3.1-swallow-70b-instruct-v0.1", + "llama-3-swallow-70b-instruct-v0.1", + "llama-3-taiwan-70b-instruct", + "llama-3.1-8b-instruct", + "llama3-8b-instruct", + "mistral-7b-instruct-v0.3", + "mistral-nemo-12b-instruct", + "gemma-2-9b-it", + "phi-3-mini-4k-instruct", + "llama-3.1-swallow-8b-instruct-v0.1", + "qwen2.5-7b-instruct" + ] + }, + "16": { + "models": [ + "deepseek-r1", + "llama-3.1-405b-instruct", + "mixtral-8x22b-instruct-v0.1", + "llama-3.3-70b-instruct", + "llama-3.1-70b-instruct", + "llama3-70b-instruct", + "codellama-70b", + "mixtral-8x7b-instruct-v0.1", + "llama-3.1-nemotron-70b-instruct", + "llama-3.1-swallow-70b-instruct-v0.1", + "llama-3-swallow-70b-instruct-v0.1", + "llama-3-taiwan-70b-instruct", + "llama-3.1-8b-instruct", + "llama3-8b-instruct", + "mistral-7b-instruct-v0.3", + "mistral-nemo-12b-instruct", + "gemma-2-9b-it", + "phi-3-mini-4k-instruct", + "llama-3.1-swallow-8b-instruct-v0.1", + "qwen2.5-7b-instruct" + ] + } + }, + "A100 80GB": { + "1": { + "models": [ + "llama-3.1-8b-instruct", + "llama3-8b-instruct", + "mistral-7b-instruct-v0.3", + "mistral-nemo-12b-instruct", + "gemma-2-9b-it", + "phi-3-mini-4k-instruct", + "llama-3.1-swallow-8b-instruct-v0.1", + "qwen2.5-7b-instruct" + ] + }, + "2": { + "models": [ + "llama-3.3-70b-instruct", + "llama-3.1-70b-instruct", + "llama3-70b-instruct", + "codellama-70b", + "mixtral-8x7b-instruct-v0.1", + "llama-3.1-nemotron-70b-instruct", + "llama-3.1-swallow-70b-instruct-v0.1", + "llama-3-swallow-70b-instruct-v0.1", + "llama-3-taiwan-70b-instruct", + "llama-3.1-8b-instruct", + "llama3-8b-instruct", + "mistral-7b-instruct-v0.3", + "mistral-nemo-12b-instruct", + "gemma-2-9b-it", + "phi-3-mini-4k-instruct", + "llama-3.1-swallow-8b-instruct-v0.1", + "qwen2.5-7b-instruct" + ] + }, + "4": { + "models": [ + "mixtral-8x22b-instruct-v0.1", + "llama-3.3-70b-instruct", + "llama-3.1-70b-instruct", + "llama3-70b-instruct", + "codellama-70b", + "mixtral-8x7b-instruct-v0.1", + "llama-3.1-nemotron-70b-instruct", + "llama-3.1-swallow-70b-instruct-v0.1", + "llama-3-swallow-70b-instruct-v0.1", + "llama-3-taiwan-70b-instruct", + "llama-3.1-8b-instruct", + "llama3-8b-instruct", + "mistral-7b-instruct-v0.3", + "mistral-nemo-12b-instruct", + "gemma-2-9b-it", + "phi-3-mini-4k-instruct", + "llama-3.1-swallow-8b-instruct-v0.1", + "qwen2.5-7b-instruct" + ] + }, + "8": { + "models": [ + "llama-3.1-405b-instruct", + "mixtral-8x22b-instruct-v0.1", + "llama-3.3-70b-instruct", + "llama-3.1-70b-instruct", + "llama3-70b-instruct", + 
"codellama-70b", + "mixtral-8x7b-instruct-v0.1", + "llama-3.1-nemotron-70b-instruct", + "llama-3.1-swallow-70b-instruct-v0.1", + "llama-3-swallow-70b-instruct-v0.1", + "llama-3-taiwan-70b-instruct", + "llama-3.1-8b-instruct", + "llama3-8b-instruct", + "mistral-7b-instruct-v0.3", + "mistral-nemo-12b-instruct", + "gemma-2-9b-it", + "phi-3-mini-4k-instruct", + "llama-3.1-swallow-8b-instruct-v0.1", + "qwen2.5-7b-instruct" + ] + }, + "16": { + "models": [ + "deepseek-r1", + "llama-3.1-405b-instruct", + "mixtral-8x22b-instruct-v0.1", + "llama-3.3-70b-instruct", + "llama-3.1-70b-instruct", + "llama3-70b-instruct", + "codellama-70b", + "mixtral-8x7b-instruct-v0.1", + "llama-3.1-nemotron-70b-instruct", + "llama-3.1-swallow-70b-instruct-v0.1", + "llama-3-swallow-70b-instruct-v0.1", + "llama-3-taiwan-70b-instruct", + "llama-3.1-8b-instruct", + "llama3-8b-instruct", + "mistral-7b-instruct-v0.3", + "mistral-nemo-12b-instruct", + "gemma-2-9b-it", + "phi-3-mini-4k-instruct", + "llama-3.1-swallow-8b-instruct-v0.1", + "qwen2.5-7b-instruct" + ] + } + }, + "A100 40GB": { + "1": { + "models": [ + "llama-3.1-8b-instruct", + "llama3-8b-instruct", + "mistral-7b-instruct-v0.3", + "mistral-nemo-12b-instruct", + "gemma-2-9b-it", + "phi-3-mini-4k-instruct", + "llama-3.1-swallow-8b-instruct-v0.1", + "qwen2.5-7b-instruct" + ] + }, + "2": { + "models": [ + "llama-3.1-8b-instruct", + "llama3-8b-instruct", + "mistral-7b-instruct-v0.3", + "mistral-nemo-12b-instruct", + "gemma-2-9b-it", + "phi-3-mini-4k-instruct", + "llama-3.1-swallow-8b-instruct-v0.1", + "qwen2.5-7b-instruct" + ] + }, + "4": { + "models": [ + "mixtral-8x7b-instruct-v0.1", + "llama-3.3-70b-instruct", + "llama-3.1-70b-instruct", + "llama3-70b-instruct", + "codellama-70b", + "mixtral-8x7b-instruct-v0.1", + "llama-3.1-nemotron-70b-instruct", + "llama-3.1-swallow-70b-instruct-v0.1", + "llama-3-swallow-70b-instruct-v0.1", + "llama-3-taiwan-70b-instruct", + "llama-3.1-8b-instruct", + "llama3-8b-instruct", + "mistral-7b-instruct-v0.3", + "mistral-nemo-12b-instruct", + "gemma-2-9b-it", + "phi-3-mini-4k-instruct", + "llama-3.1-swallow-8b-instruct-v0.1", + "qwen2.5-7b-instruct" + ] + }, + "8": { + "models": [ + "mixtral-8x22b-instruct-v0.1", + "llama-3.3-70b-instruct", + "llama-3.1-70b-instruct", + "llama3-70b-instruct", + "codellama-70b", + "mixtral-8x7b-instruct-v0.1", + "llama-3.1-nemotron-70b-instruct", + "llama-3.1-swallow-70b-instruct-v0.1", + "llama-3-swallow-70b-instruct-v0.1", + "llama-3-taiwan-70b-instruct", + "llama-3.1-8b-instruct", + "llama3-8b-instruct", + "mistral-7b-instruct-v0.3", + "mistral-nemo-12b-instruct", + "gemma-2-9b-it", + "phi-3-mini-4k-instruct", + "llama-3.1-swallow-8b-instruct-v0.1", + "qwen2.5-7b-instruct" + ] + }, + "16": { + "models": [ + "mixtral-8x22b-instruct-v0.1", + "llama-3.3-70b-instruct", + "llama-3.1-70b-instruct", + "llama3-70b-instruct", + "codellama-70b", + "mixtral-8x7b-instruct-v0.1", + "llama-3.1-nemotron-70b-instruct", + "llama-3.1-swallow-70b-instruct-v0.1", + "llama-3-swallow-70b-instruct-v0.1", + "llama-3-taiwan-70b-instruct", + "llama-3.1-8b-instruct", + "llama3-8b-instruct", + "mistral-7b-instruct-v0.3", + "mistral-nemo-12b-instruct", + "gemma-2-9b-it", + "phi-3-mini-4k-instruct", + "llama-3.1-swallow-8b-instruct-v0.1", + "qwen2.5-7b-instruct" + ] + } + }, + "L40S": { + "1": { + "models": [ + "llama-3.1-8b-instruct", + "llama3-8b-instruct", + "mistral-7b-instruct-v0.3", + "mistral-nemo-12b-instruct", + "gemma-2-9b-it", + "phi-3-mini-4k-instruct", + "llama-3.1-swallow-8b-instruct-v0.1", + "qwen2.5-7b-instruct" + ] 
+ }, + "2": { + "models": [ + "mixtral-8x7b-instruct-v0.1", + "llama-3.1-8b-instruct", + "llama3-8b-instruct", + "mistral-7b-instruct-v0.3", + "mistral-nemo-12b-instruct", + "gemma-2-9b-it", + "phi-3-mini-4k-instruct", + "llama-3.1-swallow-8b-instruct-v0.1", + "qwen2.5-7b-instruct" + ] + }, + "4": { + "models": [ + "llama-3.3-70b-instruct", + "llama-3.1-70b-instruct", + "llama3-70b-instruct", + "codellama-70b", + "mixtral-8x7b-instruct-v0.1", + "llama-3.1-nemotron-70b-instruct", + "llama-3.1-swallow-70b-instruct-v0.1", + "llama-3-swallow-70b-instruct-v0.1", + "llama-3-taiwan-70b-instruct", + "llama-3.1-8b-instruct", + "llama3-8b-instruct", + "mistral-7b-instruct-v0.3", + "mistral-nemo-12b-instruct", + "gemma-2-9b-it", + "phi-3-mini-4k-instruct", + "llama-3.1-swallow-8b-instruct-v0.1", + "qwen2.5-7b-instruct" + ] + }, + "8": { + "models": [ + "mixtral-8x22b-instruct-v0.1", + "llama-3.3-70b-instruct", + "llama-3.1-70b-instruct", + "llama3-70b-instruct", + "codellama-70b", + "mixtral-8x7b-instruct-v0.1", + "llama-3.1-nemotron-70b-instruct", + "llama-3.1-swallow-70b-instruct-v0.1", + "llama-3-swallow-70b-instruct-v0.1", + "llama-3-taiwan-70b-instruct", + "llama-3.1-8b-instruct", + "llama3-8b-instruct", + "mistral-7b-instruct-v0.3", + "mistral-nemo-12b-instruct", + "gemma-2-9b-it", + "phi-3-mini-4k-instruct", + "llama-3.1-swallow-8b-instruct-v0.1", + "qwen2.5-7b-instruct" + ] + }, + "16": { + "models": [ + "mixtral-8x22b-instruct-v0.1", + "llama-3.3-70b-instruct", + "llama-3.1-70b-instruct", + "llama3-70b-instruct", + "codellama-70b", + "mixtral-8x7b-instruct-v0.1", + "llama-3.1-nemotron-70b-instruct", + "llama-3.1-swallow-70b-instruct-v0.1", + "llama-3-swallow-70b-instruct-v0.1", + "llama-3-taiwan-70b-instruct", + "llama-3.1-8b-instruct", + "llama3-8b-instruct", + "mistral-7b-instruct-v0.3", + "mistral-nemo-12b-instruct", + "gemma-2-9b-it", + "phi-3-mini-4k-instruct", + "llama-3.1-swallow-8b-instruct-v0.1", + "qwen2.5-7b-instruct" + ] + } + }, + "A10G": { + "1": { + "models": [ + "llama-3.1-8b-instruct", + "llama3-8b-instruct", + "mistral-7b-instruct-v0.3", + "mistral-nemo-12b-instruct", + "gemma-2-9b-it", + "phi-3-mini-4k-instruct", + "llama-3.1-swallow-8b-instruct-v0.1", + "qwen2.5-7b-instruct" + ] + }, + "2": { + "models": [ + "llama-3.1-8b-instruct", + "llama3-8b-instruct", + "mistral-7b-instruct-v0.3", + "mistral-nemo-12b-instruct", + "gemma-2-9b-it", + "phi-3-mini-4k-instruct", + "llama-3.1-swallow-8b-instruct-v0.1", + "qwen2.5-7b-instruct" + ] + }, + "4": { + "models": [ + "llama-3.1-8b-instruct", + "llama3-8b-instruct", + "mistral-7b-instruct-v0.3", + "mistral-nemo-12b-instruct", + "gemma-2-9b-it", + "phi-3-mini-4k-instruct", + "llama-3.1-swallow-8b-instruct-v0.1", + "qwen2.5-7b-instruct" + ] + }, + "8": { + "models": [ + "llama-3.3-70b-instruct", + "llama-3.1-70b-instruct", + "llama3-70b-instruct", + "codellama-70b", + "mixtral-8x7b-instruct-v0.1", + "llama-3.1-nemotron-70b-instruct", + "llama-3.1-swallow-70b-instruct-v0.1", + "llama-3-swallow-70b-instruct-v0.1", + "llama-3-taiwan-70b-instruct", + "llama-3.1-8b-instruct", + "llama3-8b-instruct", + "mistral-7b-instruct-v0.3", + "mistral-nemo-12b-instruct", + "gemma-2-9b-it", + "phi-3-mini-4k-instruct", + "llama-3.1-swallow-8b-instruct-v0.1", + "qwen2.5-7b-instruct" + ] + }, + "16": { + "models": [ + "mixtral-8x22b-instruct-v0.1", + "llama-3.3-70b-instruct", + "llama-3.1-70b-instruct", + "llama3-70b-instruct", + "codellama-70b", + "mixtral-8x7b-instruct-v0.1", + "llama-3.1-nemotron-70b-instruct", + "llama-3.1-swallow-70b-instruct-v0.1", 
+ "llama-3-swallow-70b-instruct-v0.1", + "llama-3-taiwan-70b-instruct", + "llama-3.1-8b-instruct", + "llama3-8b-instruct", + "mistral-7b-instruct-v0.3", + "mistral-nemo-12b-instruct", + "gemma-2-9b-it", + "phi-3-mini-4k-instruct", + "llama-3.1-swallow-8b-instruct-v0.1", + "qwen2.5-7b-instruct" + ] + } + }, + "RTX 6000 Ada": { + "1": { + "models": [ + "llama-3.1-8b-instruct", + "llama3-8b-instruct", + "mistral-7b-instruct-v0.3", + "mistral-nemo-12b-instruct", + "gemma-2-9b-it", + "phi-3-mini-4k-instruct", + "llama-3.1-swallow-8b-instruct-v0.1", + "qwen2.5-7b-instruct" + ] + }, + "2": { + "models": [ + "mixtral-8x7b-instruct-v0.1", + "llama-3.1-8b-instruct", + "llama3-8b-instruct", + "mistral-7b-instruct-v0.3", + "mistral-nemo-12b-instruct", + "gemma-2-9b-it", + "phi-3-mini-4k-instruct", + "llama-3.1-swallow-8b-instruct-v0.1", + "qwen2.5-7b-instruct" + ] + }, + "4": { + "models": [ + "llama-3.3-70b-instruct", + "llama-3.1-70b-instruct", + "llama3-70b-instruct", + "codellama-70b", + "mixtral-8x7b-instruct-v0.1", + "llama-3.1-nemotron-70b-instruct", + "llama-3.1-swallow-70b-instruct-v0.1", + "llama-3-swallow-70b-instruct-v0.1", + "llama-3-taiwan-70b-instruct", + "llama-3.1-8b-instruct", + "llama3-8b-instruct", + "mistral-7b-instruct-v0.3", + "mistral-nemo-12b-instruct", + "gemma-2-9b-it", + "phi-3-mini-4k-instruct", + "llama-3.1-swallow-8b-instruct-v0.1", + "qwen2.5-7b-instruct" + ] + }, + "8": { + "models": [ + "mixtral-8x22b-instruct-v0.1", + "llama-3.3-70b-instruct", + "llama-3.1-70b-instruct", + "llama3-70b-instruct", + "codellama-70b", + "mixtral-8x7b-instruct-v0.1", + "llama-3.1-nemotron-70b-instruct", + "llama-3.1-swallow-70b-instruct-v0.1", + "llama-3-swallow-70b-instruct-v0.1", + "llama-3-taiwan-70b-instruct", + "llama-3.1-8b-instruct", + "llama3-8b-instruct", + "mistral-7b-instruct-v0.3", + "mistral-nemo-12b-instruct", + "gemma-2-9b-it", + "phi-3-mini-4k-instruct", + "llama-3.1-swallow-8b-instruct-v0.1", + "qwen2.5-7b-instruct" + ] + }, + "16": { + "models": [ + "mixtral-8x22b-instruct-v0.1", + "llama-3.3-70b-instruct", + "llama-3.1-70b-instruct", + "llama3-70b-instruct", + "codellama-70b", + "mixtral-8x7b-instruct-v0.1", + "llama-3.1-nemotron-70b-instruct", + "llama-3.1-swallow-70b-instruct-v0.1", + "llama-3-swallow-70b-instruct-v0.1", + "llama-3-taiwan-70b-instruct", + "llama-3.1-8b-instruct", + "llama3-8b-instruct", + "mistral-7b-instruct-v0.3", + "mistral-nemo-12b-instruct", + "gemma-2-9b-it", + "phi-3-mini-4k-instruct", + "llama-3.1-swallow-8b-instruct-v0.1", + "qwen2.5-7b-instruct" + ] + } + }, + "RTX 5090": { + "1": { + "models": [ + "llama-3.1-8b-instruct" + ] + }, + "2": { + "models": [ + "llama-3.1-8b-instruct" + ] + }, + "4": { + "models": [ + "llama-3.1-8b-instruct" + ] + }, + "8": { + "models": [ + "llama-3.1-8b-instruct" + ] + }, + "16": { + "models": [ + "llama-3.1-8b-instruct" + ] + } + }, + "RTX 5080": { + "1": { + "models": [ + "llama-3.1-8b-instruct" + ] + }, + "2": { + "models": [ + "llama-3.1-8b-instruct" + ] + }, + "4": { + "models": [ + "llama-3.1-8b-instruct" + ] + }, + "8": { + "models": [ + "llama-3.1-8b-instruct" + ] + }, + "16": { + "models": [ + "llama-3.1-8b-instruct" + ] + } + }, + "RTX 4090": { + "1": { + "models": [ + "llama-3.1-8b-instruct" + ] + }, + "2": { + "models": [ + "llama-3.1-8b-instruct" + ] + }, + "4": { + "models": [ + "llama-3.1-8b-instruct" + ] + }, + "8": { + "models": [ + "llama-3.1-8b-instruct" + ] + }, + "16": { + "models": [ + "llama-3.1-8b-instruct" + ] + } + }, + "RTX 4080": { + "1": { + "models": [ + "llama-3.1-8b-instruct" + ] 
+ }, + "2": { + "models": [ + "llama-3.1-8b-instruct" + ] + }, + "4": { + "models": [ + "llama-3.1-8b-instruct" + ] + }, + "8": { + "models": [ + "llama-3.1-8b-instruct" + ] + }, + "16": { + "models": [ + "llama-3.1-8b-instruct" + ] + } + } +} \ No newline at end of file diff --git a/code/output.log b/code/output.log index e69de29..3af4e0c 100644 --- a/code/output.log +++ b/code/output.log @@ -0,0 +1,6 @@ +http://localhost:8000 +Running on local URL: http://0.0.0.0:8080 + +To create a public link, set `share=True` in `launch()`. +IMPORTANT: You are using gradio version 4.15.0, however version 4.44.1 is available, please upgrade. +-------- diff --git a/variables.env b/variables.env index e436bde..e68210e 100644 --- a/variables.env +++ b/variables.env @@ -3,3 +3,5 @@ # NOTE: If you change this file while the project is running, you must restart the project container for changes to take effect. TENSORBOARD_LOGS_DIRECTORY=/data/tensorboard/logs/ +#INTERNAL_API---Value should be either blank to work with public endpoints or the appropriate prefix to move to internal ones. +INTERNAL_API=nvdev