diff --git a/experimental/knowledge_graph_rag/app.py b/experimental/knowledge_graph_rag/app.py index e19edc817..f6d387109 100644 --- a/experimental/knowledge_graph_rag/app.py +++ b/experimental/knowledge_graph_rag/app.py @@ -15,8 +15,8 @@ import os import streamlit as st -from llama_index.core import SimpleDirectoryReader, KnowledgeGraphIndex -from utils.preprocessor import extract_triples + +from utils.preprocessor import get_list_of_directories, has_pdf_files from llama_index.core import ServiceContext import multiprocessing import pandas as pd @@ -25,17 +25,8 @@ from vectorstore.search import SearchHandler from langchain_nvidia_ai_endpoints import ChatNVIDIA -def load_data(input_dir, num_workers): - reader = SimpleDirectoryReader(input_dir=input_dir) - documents = reader.load_data(num_workers=num_workers) - return documents - -def has_pdf_files(directory): - for file in os.listdir(directory): - if file.endswith(".pdf"): - return True - return False +st.set_page_config(page_title="Knowledge Graph RAG") st.title("Knowledge Graph RAG") st.subheader("Load Data from Files") @@ -52,19 +43,16 @@ def has_pdf_files(directory): llm = ChatNVIDIA(model=llm) def app(): - # Get the current working directory - cwd = os.getcwd() - # Get a list of visible directories in the current working directory - directories = [d for d in os.listdir(cwd) if os.path.isdir(os.path.join(cwd, d)) and not d.startswith('.') and '__' not in d] - + directories = get_list_of_directories() # Create a dropdown menu for directory selection - selected_dir = st.selectbox("Select a directory:", directories, index=0) + selected_dir = st.selectbox("Select a directory:", directories, index=None) # Construct the full path of the selected directory - directory = os.path.join(cwd, selected_dir) + if selected_dir: + directory = os.path.join(os.getcwd(), selected_dir) - if st.button("Process Documents"): + if st.button("Process Documents", disabled=(selected_dir is None)): # Check if the selected directory has PDF files res = has_pdf_files(directory) if not res: diff --git a/experimental/knowledge_graph_rag/pages/chat.py b/experimental/knowledge_graph_rag/pages/chat.py index 8e0225361..9a23e8253 100644 --- a/experimental/knowledge_graph_rag/pages/chat.py +++ b/experimental/knowledge_graph_rag/pages/chat.py @@ -20,7 +20,7 @@ import streamlit as st import json import networkx as nx -st.set_page_config(layout = "wide") +st.set_page_config(page_title="Knowledge Graph RAG", layout = "wide") from langchain_community.callbacks.streamlit import StreamlitCallbackHandler @@ -29,77 +29,82 @@ from vectorstore.search import SearchHandler -G = nx.read_graphml("knowledge_graph.graphml") -graph = NetworkxEntityGraph(G) - -models = ChatNVIDIA.get_available_models() -available_models = [model.id for model in models if model.model_type=="chat" and "instruct" in model.id] - -with st.sidebar: - llm = st.selectbox("Choose an LLM", available_models, index=available_models.index("mistralai/mixtral-8x7b-instruct-v0.1")) - st.write("You selected: ", llm) - llm = ChatNVIDIA(model=llm) - -st.subheader("Chat with your knowledge graph!") - -if "messages" not in st.session_state: - st.session_state.messages = [] - -for message in st.session_state.messages: - with st.chat_message(message["role"]): - st.markdown(message["content"]) - -with st.sidebar: - use_kg = st.toggle("Use knowledge graph") - -user_input = st.chat_input("Can you tell me how research helps users to solve problems?") - -graph_chain = GraphQAChain.from_llm(llm = llm, graph=graph, verbose=True) - -prompt_template = ChatPromptTemplate.from_messages( - [("system", "You are a helpful AI assistant named Envie. You will reply to questions only based on the context that you are provided. If something is out of context, you will refrain from replying and politely decline to respond to the user."), ("user", "{input}")] -) - -chain = prompt_template | llm | StrOutputParser() -search_handler = SearchHandler("hybrid_demo3", use_bge_m3=True, use_reranker=True) - -if user_input: - st.session_state.messages.append({"role": "user", "content": user_input}) - with st.chat_message("user"): - st.markdown(user_input) - - with st.chat_message("assistant"): - if use_kg: - entity_string = llm.invoke("""Return a JSON with a single key 'entities' and list of entities within this user query. Each element in your list MUST BE part of the user's query. Do not provide any explanation. If the returned list is not parseable in Python, you will be heavily penalized. For example, input: 'What is the difference between Apple and Google?' output: ['Apple', 'Google']. Always follow this output format. Here's the user query: """ + user_input) - try: - entities = json.loads(entity_string.content)['entities'] - with st.expander("Extracted triples"): - st.code(entities) - res = search_handler.search_and_rerank(user_input, k=5) - with st.expander("Retrieved and Reranked Sparse-Dense Hybrid Search"): - st.write(res) - context = "Here are the relevant passages from the knowledge base: \n\n" + "\n".join(item.text for item in res) - all_triplets = [] - for entity in entities: - all_triplets.extend(graph_chain.graph.get_entity_knowledge(entity, depth=2)) - context += "\n\nHere are the relationships from the knowledge graph: " + "\n".join(all_triplets) - with st.expander("All triplets"): - st.code(context) - except Exception as e: - st.write("Faced exception: ", e) - context = "No graph triples were available to extract from the knowledge graph. Always provide a disclaimer if you know the answer to the user's question, since it is not grounded in the knowledge you are provided from the graph." - message_placeholder = st.empty() - full_response = "" - - for response in chain.stream("Context: " + context + "\n\nUser query: " + user_input): - full_response += response - message_placeholder.markdown(full_response + "▌") - else: - message_placeholder = st.empty() - full_response = "" - for response in chain.stream(user_input): - full_response += response - message_placeholder.markdown(full_response + "▌") - message_placeholder.markdown(full_response) - - st.session_state.messages.append({"role": "assistant", "content": full_response}) + +try: + G = nx.read_graphml("knowledge_graph.graphml") +except: + st.subheader("Please upload documents to the knowledge base.") +else: + graph = NetworkxEntityGraph(G) + + models = ChatNVIDIA.get_available_models() + available_models = [model.id for model in models if model.model_type=="chat" and "instruct" in model.id] + + with st.sidebar: + llm = st.selectbox("Choose an LLM", available_models, index=available_models.index("mistralai/mixtral-8x7b-instruct-v0.1")) + st.write("You selected: ", llm) + llm = ChatNVIDIA(model=llm) + + st.subheader("Chat with your knowledge graph!") + + if "messages" not in st.session_state: + st.session_state.messages = [] + + for message in st.session_state.messages: + with st.chat_message(message["role"]): + st.markdown(message["content"]) + + with st.sidebar: + use_kg = st.toggle("Use knowledge graph") + + user_input = st.chat_input("Can you tell me how research helps users to solve problems?") + + graph_chain = GraphQAChain.from_llm(llm = llm, graph=graph, verbose=True) + + prompt_template = ChatPromptTemplate.from_messages( + [("system", "You are a helpful AI assistant named Envie. You will reply to questions only based on the context that you are provided. If something is out of context, you will refrain from replying and politely decline to respond to the user."), ("user", "{input}")] + ) + + chain = prompt_template | llm | StrOutputParser() + search_handler = SearchHandler("hybrid_demo3", use_bge_m3=True, use_reranker=True) + + if user_input: + st.session_state.messages.append({"role": "user", "content": user_input}) + with st.chat_message("user"): + st.markdown(user_input) + + with st.chat_message("assistant"): + if use_kg: + entity_string = llm.invoke("""Return a JSON with a single key 'entities' and list of entities within this user query. Each element in your list MUST BE part of the user's query. Do not provide any explanation. If the returned list is not parseable in Python, you will be heavily penalized. For example, input: 'What is the difference between Apple and Google?' output: ['Apple', 'Google']. Always follow this output format. Here's the user query: """ + user_input) + try: + entities = json.loads(entity_string.content)['entities'] + with st.expander("Extracted triples"): + st.code(entities) + res = search_handler.search_and_rerank(user_input, k=5) + with st.expander("Retrieved and Reranked Sparse-Dense Hybrid Search"): + st.write(res) + context = "Here are the relevant passages from the knowledge base: \n\n" + "\n".join(item.text for item in res) + all_triplets = [] + for entity in entities: + all_triplets.extend(graph_chain.graph.get_entity_knowledge(entity, depth=2)) + context += "\n\nHere are the relationships from the knowledge graph: " + "\n".join(all_triplets) + with st.expander("All triplets"): + st.code(context) + except Exception as e: + st.write("Faced exception: ", e) + context = "No graph triples were available to extract from the knowledge graph. Always provide a disclaimer if you know the answer to the user's question, since it is not grounded in the knowledge you are provided from the graph." + message_placeholder = st.empty() + full_response = "" + + for response in chain.stream("Context: " + context + "\n\nUser query: " + user_input): + full_response += response + message_placeholder.markdown(full_response + "▌") + else: + message_placeholder = st.empty() + full_response = "" + for response in chain.stream(user_input): + full_response += response + message_placeholder.markdown(full_response + "▌") + message_placeholder.markdown(full_response) + + st.session_state.messages.append({"role": "assistant", "content": full_response}) diff --git a/experimental/knowledge_graph_rag/pages/evaluation.py b/experimental/knowledge_graph_rag/pages/evaluation.py index 60b63031e..ebe77d010 100644 --- a/experimental/knowledge_graph_rag/pages/evaluation.py +++ b/experimental/knowledge_graph_rag/pages/evaluation.py @@ -13,30 +13,29 @@ # See the License for the specific language governing permissions and # limitations under the License. +import json import os +import random + import streamlit as st -from llama_index.core import SimpleDirectoryReader, KnowledgeGraphIndex -from utils.preprocessor import generate_qa_pair -from llama_index.core import ServiceContext -import multiprocessing +import altair as alt +import matplotlib.pyplot as plt import pandas as pd import networkx as nx -from utils.lc_graph import process_documents, save_triples_to_csvs -from vectorstore.search import SearchHandler -from langchain_nvidia_ai_endpoints import ChatNVIDIA -import random -import pandas as pd -import time -import json +from concurrent.futures import ThreadPoolExecutor +from langchain_community.graphs.networkx_graph import NetworkxEntityGraph from langchain_core.output_parsers import StrOutputParser -from langchain_community.graphs.networkx_graph import NetworkxEntityGraph, get_entities from langchain_core.prompts import ChatPromptTemplate +from langchain_nvidia_ai_endpoints import ChatNVIDIA +from openai import OpenAI from vectorstore.search import SearchHandler +from utils.lc_graph import process_documents +from utils.preprocessor import get_list_of_directories, has_pdf_files, generate_qa_pair -from concurrent.futures import ThreadPoolExecutor, as_completed -from openai import OpenAI +st.set_page_config(page_title="Knowledge Graph RAG") + reward_client = OpenAI( base_url = "https://integrate.api.nvidia.com/v1", api_key = os.environ["NVIDIA_API_KEY"] @@ -80,17 +79,6 @@ def process_question(question, answer): [("system", "You are a helpful AI assistant named Envie. You will reply to questions only based on the context that you are provided. If something is out of context, you will refrain from replying and politely decline to respond to the user."), ("user", "{input}")] ) -def load_data(input_dir, num_workers): - reader = SimpleDirectoryReader(input_dir=input_dir) - documents = reader.load_data(num_workers=num_workers) - return documents - -def has_pdf_files(directory): - for file in os.listdir(directory): - if file.endswith(".pdf"): - return True - return False - def get_text_RAG_response(question): chain = prompt_template | llm | StrOutputParser() @@ -154,18 +142,16 @@ def get_combined_RAG_response(question): num_data = st.slider("How many Q&A pairs to generate?", 10, 100, 50, step=10) def app(): - # Get the current working directory - cwd = os.getcwd() - - # Get a list of visible directories in the current working directory - directories = [d for d in os.listdir(cwd) if os.path.isdir(os.path.join(cwd, d)) and not d.startswith('.') and '__' not in d] - + # Get a list of visible directories in the current working directory + directories = get_list_of_directories() # Create a dropdown menu for directory selection - selected_dir = st.selectbox("Select a directory:", directories, index=0) + selected_dir = st.selectbox("Select a directory:", directories, index=None) # Construct the full path of the selected directory - directory = os.path.join(cwd, selected_dir) - if st.button("Process Documents"): + if selected_dir: + directory = os.path.join(os.getcwd(), selected_dir) + + if st.button("Process Documents", disabled=(selected_dir is None)): # Check if the selected directory has PDF files res = has_pdf_files(directory) if not res: @@ -261,5 +247,21 @@ def app(): st.write("First few rows of the updated data:") st.dataframe(combined_results.head()) + average_scores = combined_results.mean(axis=0, numeric_only=True) + rows = [] + for index, value in average_scores.items(): + metric, category = tuple(index.split("_")) + rows.append([metric, category, value]) + + final_df = pd.DataFrame(rows, columns=["metric", "category", "average score"]) + + gp_chart = alt.Chart(final_df).mark_bar().encode( + x="metric:N", + y="average score:Q", + xOffset="category:N", + color="category:N" + ) + st.altair_chart(gp_chart, use_container_width=True) + if __name__ == "__main__": app() \ No newline at end of file diff --git a/experimental/knowledge_graph_rag/pages/visualization.py b/experimental/knowledge_graph_rag/pages/visualization.py index 633c4b9d8..b94a7f8e3 100644 --- a/experimental/knowledge_graph_rag/pages/visualization.py +++ b/experimental/knowledge_graph_rag/pages/visualization.py @@ -16,7 +16,7 @@ import streamlit as st import streamlit.components.v1 as components -st.set_page_config(layout="wide") +st.set_page_config(page_title="Knowledge Graph RAG", layout="wide") def app(): st.title("Visualize the Knowledge Graph!") diff --git a/experimental/knowledge_graph_rag/requirements.txt b/experimental/knowledge_graph_rag/requirements.txt index 82a9a1410..a9d094b99 100644 --- a/experimental/knowledge_graph_rag/requirements.txt +++ b/experimental/knowledge_graph_rag/requirements.txt @@ -7,8 +7,10 @@ llama_index==0.10.50 networkx==3.2.1 numpy==1.24.1 pandas==2.2.2 +psutil==6.0.0 pymilvus==2.4.3 -Requests==2.32.3 +pymilvus[model] +Requests==2.31.0 streamlit==1.30.0 unstructured[all-docs] tqdm==4.66.1 diff --git a/experimental/knowledge_graph_rag/utils/download_papers.py b/experimental/knowledge_graph_rag/scripts/download_papers.py similarity index 93% rename from experimental/knowledge_graph_rag/utils/download_papers.py rename to experimental/knowledge_graph_rag/scripts/download_papers.py index 5030f3259..a47df7e8e 100644 --- a/experimental/knowledge_graph_rag/utils/download_papers.py +++ b/experimental/knowledge_graph_rag/scripts/download_papers.py @@ -58,20 +58,21 @@ def download_paper(result, download_dir, max_retries=3, retry_delay=5): def download_papers(search_terms, start_date, end_date, max_results=10, download_dir='papers', num_threads=4, max_retries=3, retry_delay=5): # Create the search query based on search terms and dates search_query = f"({search_terms}) AND submittedDate:[{start_date.strftime('%Y%m%d')} TO {end_date.strftime('%Y%m%d')}]" + client = arxiv.Client() search = arxiv.Search( query=search_query, max_results=max_results, sort_by=arxiv.SortCriterion.SubmittedDate, ) - # Create the download directory if it doesn't exist os.makedirs(download_dir, exist_ok=True) # Use a thread pool to download papers in parallel with ThreadPoolExecutor(max_workers=num_threads) as executor: futures = [] - for result in tqdm(search.results(), total=max_results, unit='paper'): + for result in tqdm(client.results(search), total=max_results, unit='paper'): + print(result.title) # Submit download tasks to the executor future = executor.submit(download_paper, result, download_dir, max_retries, retry_delay) futures.append(future) @@ -110,6 +111,7 @@ def download_papers(search_terms, start_date, end_date, max_results=10, download else: end_date = datetime.now() # Default to today + search_query = ' AND '.join([f'all:"{term}"' for term in args.search_terms.split(",")]) # Call the download_papers function with the provided arguments - download_papers(args.search_terms, start_date, end_date, args.max_results, args.download_dir, args.num_threads, args.max_retries, args.retry_delay) + download_papers(search_query, start_date, end_date, args.max_results, args.download_dir, args.num_threads, args.max_retries, args.retry_delay) diff --git a/experimental/knowledge_graph_rag/utils/lc_graph.py b/experimental/knowledge_graph_rag/utils/lc_graph.py index 5a5a81ac2..649296061 100644 --- a/experimental/knowledge_graph_rag/utils/lc_graph.py +++ b/experimental/knowledge_graph_rag/utils/lc_graph.py @@ -15,7 +15,6 @@ from langchain_nvidia_ai_endpoints import ChatNVIDIA import concurrent.futures -from preprocessor import extract_triples from tqdm import tqdm from langchain_community.document_loaders import DirectoryLoader from langchain.text_splitter import RecursiveCharacterTextSplitter @@ -23,6 +22,8 @@ import csv import streamlit as st +from utils.preprocessor import extract_triples + # function to process a single document (will run many of these processes in parallel) def process_document(doc, llm): try: @@ -77,6 +78,8 @@ def save_triples_to_csvs(triples): # Create the relations DataFrame relations_df = pd.DataFrame({'relation_id': range(len(triples_df['relation'].unique())), 'relation_name': triples_df['relation'].unique()}) + print(triples_df) + print(relations_df) # Get unique entities (subjects and objects) from triples_df entities = pd.concat([triples_df['subject'], triples_df['object']]).unique() diff --git a/experimental/knowledge_graph_rag/utils/preprocessor.py b/experimental/knowledge_graph_rag/utils/preprocessor.py index 3861c3067..3ce577be7 100644 --- a/experimental/knowledge_graph_rag/utils/preprocessor.py +++ b/experimental/knowledge_graph_rag/utils/preprocessor.py @@ -17,15 +17,42 @@ import os import json import ast + + +from langchain_core.output_parsers import StrOutputParser +from langchain_core.prompts import ChatPromptTemplate from langchain_nvidia_ai_endpoints import ChatNVIDIA +from llama_index.core import SimpleDirectoryReader + if not os.environ.get("NVIDIA_API_KEY", "").startswith("nvapi-"): nvapi_key = getpass.getpass("Enter your NVIDIA API key: ") assert nvapi_key.startswith("nvapi-"), f"{nvapi_key[:5]}... is not a valid key" os.environ["NVIDIA_API_KEY"] = nvapi_key -from langchain_core.output_parsers import StrOutputParser -from langchain_core.prompts import ChatPromptTemplate +dirs_to_exclude = ["pages", "utils", "vectorstore", "venv", "volumes", "scripts"] +required_exts = ["pdf"] + + +def get_list_of_directories(): + cwd = os.getcwd() + + # Get a list of visible directories in the current working directory + return [d for d in os.listdir(cwd) if os.path.isdir(os.path.join(cwd, d)) and not d.startswith('.') and '__' not in d and d not in dirs_to_exclude] + + +def load_data(input_dir, num_workers): + reader = SimpleDirectoryReader(input_dir=input_dir, recursive=True, required_exts=required_exts) + documents = reader.load_data(num_workers=num_workers) + return documents + + +def has_pdf_files(directory): + for file in os.listdir(directory): + if file.endswith(".pdf"): + return True + return False + def process_response(triplets_str): triplets_list = ast.literal_eval(triplets_str) @@ -92,4 +119,5 @@ def generate_qa_pair(text, llm): parsed_response = json.loads(response) return parsed_response except: - return None \ No newline at end of file + return None + \ No newline at end of file