From ff3fe68e11ba44d2c6d5f42b99756bf53858d12f Mon Sep 17 00:00:00 2001 From: roti Date: Thu, 1 Feb 2024 22:35:20 +0000 Subject: [PATCH 01/51] feat: cache huggingface models --- README.md | 11 ++++++++--- cache/.keep | 0 2 files changed, 8 insertions(+), 3 deletions(-) create mode 100644 cache/.keep diff --git a/README.md b/README.md index 2e2a009..c4dabd5 100644 --- a/README.md +++ b/README.md @@ -9,9 +9,14 @@ To build and run the container locally with hot reload on python files do: ``` DOCKER_BUILDKIT=1 docker build . -t gbnc -docker run -v "$(pwd)/gswikichat":/workspace/gswikichat \ - -p 8000:8000 --rm --name gbnc -it gbnc \ - -e HUGGING_FACE_HUB_TOKEN=$HUGGING_FACE_HUB_TOKEN +docker run \ + -v "$(pwd)/gswikichat":/workspace/gswikichat \ + -v "$(pwd)/cache":/root/.cache \ + -e HUGGING_FACE_HUB_TOKEN=$HUGGING_FACE_HUB_TOKEN + -p 8000:8000 \ + --rm -it \ + --name gbnc \ + gbnc ``` Point your browser to http://localhost:8000/ and use the frontend. diff --git a/cache/.keep b/cache/.keep new file mode 100644 index 0000000..e69de29 From 38a3bf950b2a0171161a02c00f854e96bc9a0229 Mon Sep 17 00:00:00 2001 From: roti Date: Thu, 1 Feb 2024 22:35:32 +0000 Subject: [PATCH 02/51] fix: sentence_transformers version --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index b03f924..7369c03 100644 --- a/requirements.txt +++ b/requirements.txt @@ -32,7 +32,7 @@ python-dotenv==1.0.1 pytz==2023.3.post1 PyYAML==6.0.1 requests==2.31.0 -sentence-transformers>=2.2.0 +sentence-transformers==2.2.0 six==1.16.0 sniffio==1.3.0 starlette==0.35.1 From 3fb6fd0f5d6f944a319637843bb1812495d713d1 Mon Sep 17 00:00:00 2001 From: roti Date: Thu, 1 Feb 2024 22:40:01 +0000 Subject: [PATCH 03/51] chore: remove custom model based on modelfile --- Dockerfile | 7 ------- Modelfile | 2 -- 2 files changed, 9 deletions(-) delete mode 100644 Modelfile diff --git a/Dockerfile b/Dockerfile index 6202b20..796ccc4 100644 --- a/Dockerfile +++ 
b/Dockerfile @@ -42,13 +42,6 @@ ARG MODEL=stablelm2:1.6b-zephyr ENV MODEL=${MODEL} RUN ollama serve & while ! curl http://localhost:11434; do sleep 1; done; ollama pull $MODEL -# Build a language model -# ARG MODEL=discolm -# ENV MODEL=${MODEL} -# WORKDIR /tmp/model -# COPY --chmod=644 Modelfile Modelfile -# RUN curl --location https://huggingface.co/TheBloke/DiscoLM_German_7b_v1-GGUF/resolve/main/discolm_german_7b_v1.Q5_K_S.gguf?download=true --output discolm_german_7b_v1.Q5_K_S.gguf; ollama serve & while ! curl http://localhost:11434; do sleep 1; done; ollama create ${MODEL} -f Modelfile && rm -rf /tmp/model - # Setup the custom API and frontend WORKDIR /workspace diff --git a/Modelfile b/Modelfile deleted file mode 100644 index e0c49cb..0000000 --- a/Modelfile +++ /dev/null @@ -1,2 +0,0 @@ -FROM ./discolm_german_7b_v1.Q5_K_S.gguf - From a4c729453f97c69e6378431ec8902ec50ca7c5d3 Mon Sep 17 00:00:00 2001 From: roti Date: Thu, 1 Feb 2024 22:50:34 +0000 Subject: [PATCH 04/51] fix(frontend): do not filter by score for now TBD --- frontend/src/components/field/FieldAnswer.vue | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frontend/src/components/field/FieldAnswer.vue b/frontend/src/components/field/FieldAnswer.vue index 5c78d55..99afac7 100644 --- a/frontend/src/components/field/FieldAnswer.vue +++ b/frontend/src/components/field/FieldAnswer.vue @@ -12,7 +12,7 @@
-
+
From d38c5f052d3a8a617438a5c126c7571e7413e6b1 Mon Sep 17 00:00:00 2001 From: roti Date: Thu, 1 Feb 2024 22:52:31 +0000 Subject: [PATCH 05/51] chore: remove debug/test code --- gswikichat/__init__.py | 1 - gswikichat/api.py | 11 ---- gswikichat/vector_store_interface.py | 77 ---------------------------- 3 files changed, 89 deletions(-) diff --git a/gswikichat/__init__.py b/gswikichat/__init__.py index eab6613..0a0e47b 100644 --- a/gswikichat/__init__.py +++ b/gswikichat/__init__.py @@ -1,2 +1 @@ from .api import * -# from .haystack2beta_tutorial_InMemoryEmbeddingRetriever import * diff --git a/gswikichat/api.py b/gswikichat/api.py index a05ff27..5dc6677 100644 --- a/gswikichat/api.py +++ b/gswikichat/api.py @@ -2,7 +2,6 @@ from fastapi.staticfiles import StaticFiles from fastapi import FastAPI -# from .rag import rag_pipeline from .rag import embedder, retriever, prompt_builder, llm, answer_builder from haystack import Document @@ -23,9 +22,6 @@ async def root(): @app.get("/api") async def api(q): - embedder, retriever, prompt_builder, llm, answer_builder - - # query = "How many languages are there?" 
query = Document(content=q) result = embedder.run([query]) @@ -37,18 +33,11 @@ async def api(q): scale_score=None, return_embedding=None ) - # .run( - # result['documents'][0].embedding - # ) prompt = prompt_builder.run(documents=results['documents'])['prompt'] response = llm.run(prompt=prompt, generation_kwargs=None) - # reply = response['replies'][0] - # rag_pipeline.connect("llm.replies", "answer_builder.replies") - # rag_pipeline.connect("llm.metadata", "answer_builder.meta") - # rag_pipeline.connect("retriever", "answer_builder.documents") results = answer_builder.run( query=q, diff --git a/gswikichat/vector_store_interface.py b/gswikichat/vector_store_interface.py index bc99b9d..106d050 100644 --- a/gswikichat/vector_store_interface.py +++ b/gswikichat/vector_store_interface.py @@ -1,16 +1,12 @@ import os import json -# from sentence_transformers import SentenceTransformer from tqdm import tqdm from haystack import Document # , Pipeline from haystack.components.embedders import SentenceTransformersDocumentEmbedder -# from haystack.components.embedders import SentenceTransformersTextEmbedder from haystack.document_stores.in_memory import InMemoryDocumentStore -# from haystack.components.retrievers.in_memory import InMemoryBM25Retriever from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever -# from haystack.components.writers import DocumentWriter from haystack.document_stores.types.policy import DuplicatePolicy HUGGING_FACE_HUB_TOKEN = os.environ.get('HUGGING_FACE_HUB_TOKEN') @@ -64,54 +60,9 @@ # embedding_dim=768, # duplicate_documents="overwrite" ) -# document_store.write_documents(input_documents) - -# TODO Introduce Jina.AI from HuggingFace. Establish env-variable for trust_... 
- -# basic_transformer_models = [ -# "all-MiniLM-L6-v2", -# "xlm-clm-ende-1024", -# "xlm-mlm-ende-1024", -# "bert-base-german-cased", -# "bert-base-german-dbmdz-cased", -# "bert-base-german-dbmdz-uncased", -# "distilbert-base-german-cased", -# "xlm-roberta-large-finetuned-conll03-german", -# "deutsche-telekom/gbert-large-paraphrase-cosine" -# ] - -# https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2 -# sentence_transformer_model = "all-MiniLM-L6-v2" -# 3 minutes to batch 82 - -# https://huggingface.co/deutsche-telekom/gbert-large-paraphrase-cosine -# sentence_transformer_model = 'deutsche-telekom/gbert-large-paraphrase-cosine' -# 76 minutes to batch 82 - -# https://huggingface.co/jinaai/jina-embeddings-v2-base-de -# sentence_transformer_model = 'jinaai/jina-embeddings-v2-base-de' -# Cannot find or load the embedding model -# Unknown minutes to batch 82 - -# https://huggingface.co/aari1995/German_Semantic_STS_V2 -# sentence_transformer_model = 'aari1995/German_Semantic_STS_V2' -# 75 minutes to batch 82 - -# https://huggingface.co/Sahajtomar/German-semantic -# sentence_transformer_model = 'Sahajtomar/German-semantic' -# 72 minutes to batch 82 # https://huggingface.co/svalabs/german-gpl-adapted-covid sentence_transformer_model = 'svalabs/german-gpl-adapted-covid' -# 2 minutes to batch 82 - -# https://huggingface.co/PM-AI/bi-encoder_msmarco_bert-base_german -# sentence_transformer_model = 'PM-AI/bi-encoder_msmarco_bert-base_german' -# 14 minutes to batch 82 - -# https://huggingface.co/JoBeer/german-semantic-base -# sentence_transformer_model = 'JoBeer/german-semantic-base' -# 22 minutes to batch 82 print(f'Sentence Transformer Name:{sentence_transformer_model}') @@ -122,26 +73,9 @@ # token=HUGGING_FACE_HUB_TOKEN ) -# hg_embedder = SentenceTransformer( -# "jinaai/jina-embeddings-v2-base-de", -# token=HUGGING_FACE_HUB_TOKEN -# ) - embedder.warm_up() documents_with_embeddings = embedder.run(input_documents) -# documents_with_embeddings = 
embedder.encode(input_documents) - - -# print('\n\n') -# # print(documents_with_embeddings['documents']) -# print(type(documents_with_embeddings['documents'])) -# print(len(documents_with_embeddings['documents'])) -# print(dir(documents_with_embeddings['documents'][0])) -# print('\n\n') -# print(type(embedder.model)) -# print('\n\n') -# # print(dir(hg_embedder)) document_store.write_documents( @@ -155,14 +89,3 @@ top_k=top_k ) -# writer = DocumentWriter(document_store=document_store) - -# indexing_pipeline = Pipeline() -# indexing_pipeline.add_component("embedder", embedder) -# indexing_pipeline.add_component("writer", writer) -# indexing_pipeline.connect("embedder", "writer") -# indexing_pipeline.run( -# { -# "embedder": {"documents": input_documents} -# } -# ) From dc4501a2042743cc5b6f275014a85d66de20d375 Mon Sep 17 00:00:00 2001 From: roti Date: Thu, 1 Feb 2024 22:52:59 +0000 Subject: [PATCH 06/51] fix: required sentence_transformers version was actually > 2.2.0 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 7369c03..723011a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -32,7 +32,7 @@ python-dotenv==1.0.1 pytz==2023.3.post1 PyYAML==6.0.1 requests==2.31.0 -sentence-transformers==2.2.0 +sentence-transformers==2.3.1 six==1.16.0 sniffio==1.3.0 starlette==0.35.1 From 42cdcc5e5cf4847e4bfe7c4db2dbf4b7237bc1d5 Mon Sep 17 00:00:00 2001 From: roti Date: Thu, 1 Feb 2024 22:55:25 +0000 Subject: [PATCH 07/51] docs: add notes about embedding models to readme --- README.md | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/README.md b/README.md index c4dabd5..528edaf 100644 --- a/README.md +++ b/README.md @@ -49,3 +49,52 @@ A [FastAPI](https://fastapi.tiangolo.com/) server is running in the container. I ### Frontend A minimal frontend lets the user input a question and renders the response from the system. 
+ +## Sentence Transformers Statistics + +``` +basic_transformer_models = [ + "all-MiniLM-L6-v2", + "xlm-clm-ende-1024", + "xlm-mlm-ende-1024", + "bert-base-german-cased", + "bert-base-german-dbmdz-cased", + "bert-base-german-dbmdz-uncased", + "distilbert-base-german-cased", + "xlm-roberta-large-finetuned-conll03-german", + "deutsche-telekom/gbert-large-paraphrase-cosine" +] + +https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2 +sentence_transformer_model = "all-MiniLM-L6-v2" +3 minutes to batch 82 + +https://huggingface.co/deutsche-telekom/gbert-large-paraphrase-cosine +sentence_transformer_model = 'deutsche-telekom/gbert-large-paraphrase-cosine' +76 minutes to batch 82 + +https://huggingface.co/jinaai/jina-embeddings-v2-base-de +sentence_transformer_model = 'jinaai/jina-embeddings-v2-base-de' +Cannot find or load the embedding model +Unknown minutes to batch 82 + +https://huggingface.co/aari1995/German_Semantic_STS_V2 +sentence_transformer_model = 'aari1995/German_Semantic_STS_V2' +75 minutes to batch 82 + +https://huggingface.co/Sahajtomar/German-semantic +sentence_transformer_model = 'Sahajtomar/German-semantic' +72 minutes to batch 82 + +https://huggingface.co/svalabs/german-gpl-adapted-covid +ntence_transformer_model = 'svalabs/german-gpl-adapted-covid' +2 minutes to batch 82 + +https://huggingface.co/PM-AI/bi-encoder_msmarco_bert-base_german +sentence_transformer_model = 'PM-AI/bi-encoder_msmarco_bert-base_german' +14 minutes to batch 82 + +https://huggingface.co/JoBeer/german-semantic-base +sentence_transformer_model = 'JoBeer/german-semantic-base' +22 minutes to batch 82 +``` From 13bc12eb1ca457a3c6ce65cb386dd41675922bfa Mon Sep 17 00:00:00 2001 From: roti Date: Thu, 1 Feb 2024 23:02:11 +0000 Subject: [PATCH 08/51] chore: add debug output to api.py --- gswikichat/api.py | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/gswikichat/api.py b/gswikichat/api.py index 5dc6677..b6f49f9 100644 --- 
a/gswikichat/api.py +++ b/gswikichat/api.py @@ -21,40 +21,49 @@ async def root(): @app.get("/api") async def api(q): + print("query: ", q) query = Document(content=q) - result = embedder.run([query]) + queryEmbedded = embedder.run([query]) + queryEmbedding = queryEmbedded['documents'][0].embedding - results = retriever.run( - query_embedding=list(result['documents'][0].embedding), + retrieverResults = retriever.run( + query_embedding=list(queryEmbedding), filters=None, top_k=None, scale_score=None, return_embedding=None ) - prompt = prompt_builder.run(documents=results['documents'])['prompt'] + print("retriever results:") + for retrieverResult in retrieverResults: + print(retrieverResult) - response = llm.run(prompt=prompt, generation_kwargs=None) + promptBuild = prompt_builder.run(documents=retrieverResults['documents']) + prompt = promptBuild['prompt'] + + print("prompt: ", prompt) + response = llm.run(prompt=prompt, generation_kwargs=None) - results = answer_builder.run( + answerBuild = answer_builder.run( query=q, replies=response['replies'], meta=response['meta'], - documents=results['documents'], + documents=retrieverResults['documents'], pattern=None, reference_pattern=None ) + print("answerBuild", answerBuild) + + answer = answerBuild['answers'][0] + + sources= [{ "src": d.meta['src'], "content": d.content, "score": d.score } for d in answer.documents] - answer = results['answers'][0] + print("answer", answer) return { "answer": answer.data, - "sources": [{ - "src": d.meta['src'], - "content": d.content, - "score": d.score - } for d in answer.documents] + "sources": sources } From 4933a9a89facafa6dc2b0bc69ed8252a63813682 Mon Sep 17 00:00:00 2001 From: roti Date: Thu, 1 Feb 2024 23:03:10 +0000 Subject: [PATCH 09/51] fix: question in prompt --- gswikichat/api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gswikichat/api.py b/gswikichat/api.py index b6f49f9..7a0d28c 100644 --- a/gswikichat/api.py +++ b/gswikichat/api.py @@ -40,7 +40,7 @@ 
async def api(q): for retrieverResult in retrieverResults: print(retrieverResult) - promptBuild = prompt_builder.run(documents=retrieverResults['documents']) + promptBuild = prompt_builder.run(question=q, documents=retrieverResults['documents']) prompt = promptBuild['prompt'] print("prompt: ", prompt) From b23833b720975b7a763cb224e84b7d0ca53e2e4d Mon Sep 17 00:00:00 2001 From: roti Date: Thu, 1 Feb 2024 23:03:20 +0000 Subject: [PATCH 10/51] chore: top_k 3 results for now --- gswikichat/api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gswikichat/api.py b/gswikichat/api.py index 7a0d28c..ce2a144 100644 --- a/gswikichat/api.py +++ b/gswikichat/api.py @@ -31,7 +31,7 @@ async def api(q): retrieverResults = retriever.run( query_embedding=list(queryEmbedding), filters=None, - top_k=None, + top_k=3, scale_score=None, return_embedding=None ) From da1017b3da12bce60af1e2cc9a315ab207079f3e Mon Sep 17 00:00:00 2001 From: roti Date: Thu, 1 Feb 2024 23:04:12 +0000 Subject: [PATCH 11/51] wip: embeddings cache --- gswikichat/vector_store_interface.py | 37 ++++++++++++++++++---------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/gswikichat/vector_store_interface.py b/gswikichat/vector_store_interface.py index 106d050..d55d230 100644 --- a/gswikichat/vector_store_interface.py +++ b/gswikichat/vector_store_interface.py @@ -10,6 +10,8 @@ from haystack.document_stores.types.policy import DuplicatePolicy HUGGING_FACE_HUB_TOKEN = os.environ.get('HUGGING_FACE_HUB_TOKEN') +EMBEDDING_CACHE_FILE = '/tmp/gbnc_embeddings.json' + top_k = 5 input_documents = [] @@ -63,28 +65,37 @@ # https://huggingface.co/svalabs/german-gpl-adapted-covid sentence_transformer_model = 'svalabs/german-gpl-adapted-covid' - -print(f'Sentence Transformer Name:{sentence_transformer_model}') +print(f'Sentence Transformer Name: {sentence_transformer_model}') embedder = SentenceTransformersDocumentEmbedder( model=sentence_transformer_model, - # 
model="T-Systems-onsite/german-roberta-sentence-transformer-v2", - # model="jinaai/jina-embeddings-v2-base-de", - # token=HUGGING_FACE_HUB_TOKEN ) - embedder.warm_up() -documents_with_embeddings = embedder.run(input_documents) - -document_store.write_documents( - documents=documents_with_embeddings['documents'], - policy=DuplicatePolicy.OVERWRITE -) +# if os.path.isfile(EMBEDDING_CACHE_FILE): +# print("[INFO] Loading embeddings from cache") +# +# with open(EMBEDDING_CACHE_FILE, 'r') as f: +# documentsDict = json.load(f) +# document_store.write_documents( +# documents=[Document.from_dict(d) for d in documentsDict], +# policy=DuplicatePolicy.OVERWRITE +# ) +# +# else: +if True: + embedded = embedder.run(input_documents) + document_store.write_documents( + documents=embedded['documents'], + policy=DuplicatePolicy.OVERWRITE + ) + + with open(EMBEDDING_CACHE_FILE, 'w') as f: + documentsDict = [Document.to_dict(d) for d in embedded['documents']] + json.dump(documentsDict, f) retriever = InMemoryEmbeddingRetriever( - # embedding_model="sentence-transformers/all-MiniLM-L6-v2", document_store=document_store, top_k=top_k ) From 41ff046faa849431a8e83ed2e4caff78ae6e6522 Mon Sep 17 00:00:00 2001 From: roti Date: Thu, 1 Feb 2024 23:04:27 +0000 Subject: [PATCH 12/51] feat: document splitter --- gswikichat/vector_store_interface.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/gswikichat/vector_store_interface.py b/gswikichat/vector_store_interface.py index d55d230..a470a0a 100644 --- a/gswikichat/vector_store_interface.py +++ b/gswikichat/vector_store_interface.py @@ -8,6 +8,8 @@ from haystack.document_stores.in_memory import InMemoryDocumentStore from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever from haystack.document_stores.types.policy import DuplicatePolicy +from haystack.components.preprocessors import DocumentSplitter +from haystack.components.preprocessors import DocumentCleaner HUGGING_FACE_HUB_TOKEN = 
os.environ.get('HUGGING_FACE_HUB_TOKEN') EMBEDDING_CACHE_FILE = '/tmp/gbnc_embeddings.json' @@ -16,7 +18,8 @@ input_documents = [] json_dir = 'json_input' -json_fname = 'excellent-articles_10_paragraphs.json' +json_fname = 'excellent-articles_10.json' + json_fpath = os.path.join(json_dir, json_fname) if os.path.isfile(json_fpath): @@ -28,11 +31,11 @@ for k, v in tqdm(json_obj.items()): print(f"Loading {k}") input_documents.append(Document(content=v, meta={"src": k})) + elif isinstance(json_obj, list): for obj_ in tqdm(json_obj): url = obj_['meta'] content = obj_['content'] - input_documents.append( Document( content=content, @@ -55,7 +58,14 @@ ), ] -# Write documents to InMemoryDocumentStore +# cleaner = DocumentCleaner( +# remove_empty_lines=True, +# remove_extra_whitespaces=True, +# remove_repeated_substrings=False) +# input_documents = cleaner.run(input_documents)['documents'] + +splitter = DocumentSplitter(split_by="sentence", split_length=20, split_overlap=0) +input_documents = splitter.run(input_documents)['documents'] document_store = InMemoryDocumentStore( embedding_similarity_function="cosine", From 4e69697a74e16c965c65d13968bf1fdc7dd9c51d Mon Sep 17 00:00:00 2001 From: Jonathan Fraine Date: Sat, 3 Feb 2024 00:13:21 +0100 Subject: [PATCH 13/51] Update .dockerignore --- .dockerignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.dockerignore b/.dockerignore index b6853c9..441b043 100644 --- a/.dockerignore +++ b/.dockerignore @@ -100,4 +100,4 @@ frontend/dist !src/ !package.json !yarn.lock -!.yarnrc \ No newline at end of file +!.yarnrc From 0ee6ed5d1a55b85a22d2d75a947f30d2e336da71 Mon Sep 17 00:00:00 2001 From: Robert Timm Date: Sun, 4 Feb 2024 13:25:21 +0000 Subject: [PATCH 14/51] docs: note on how to dev locally --- README.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/README.md b/README.md index 528edaf..bb619f5 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,19 @@ Point your browser to 
http://localhost:8000/ and use the frontend. The container works on [runpod.io](https://www.runpod.io/) GPU instances. A [template is available here](https://runpod.io/gsc?template=0w8z55rf19&ref=yfvyfa0s). +### Local development +#### Backend +``` +python -m venv .venv +. ./.venv/bin/activate +pip install -r requirements.txt +``` +#### Frontend +``` +cd frontend +yarn dev +``` + ## What's in the box ### Docker container From 7a2c9553f668d60aa964f84fae0f6ecc050a271f Mon Sep 17 00:00:00 2001 From: Robert Timm Date: Sun, 4 Feb 2024 13:26:44 +0000 Subject: [PATCH 15/51] docs: add research_log.md --- README.md | 48 ------------------------------------------------ research_log.md | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 48 deletions(-) create mode 100644 research_log.md diff --git a/README.md b/README.md index bb619f5..8aa376c 100644 --- a/README.md +++ b/README.md @@ -63,51 +63,3 @@ A [FastAPI](https://fastapi.tiangolo.com/) server is running in the container. I A minimal frontend lets the user input a question and renders the response from the system. 
-## Sentence Transformers Statistics - -``` -basic_transformer_models = [ - "all-MiniLM-L6-v2", - "xlm-clm-ende-1024", - "xlm-mlm-ende-1024", - "bert-base-german-cased", - "bert-base-german-dbmdz-cased", - "bert-base-german-dbmdz-uncased", - "distilbert-base-german-cased", - "xlm-roberta-large-finetuned-conll03-german", - "deutsche-telekom/gbert-large-paraphrase-cosine" -] - -https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2 -sentence_transformer_model = "all-MiniLM-L6-v2" -3 minutes to batch 82 - -https://huggingface.co/deutsche-telekom/gbert-large-paraphrase-cosine -sentence_transformer_model = 'deutsche-telekom/gbert-large-paraphrase-cosine' -76 minutes to batch 82 - -https://huggingface.co/jinaai/jina-embeddings-v2-base-de -sentence_transformer_model = 'jinaai/jina-embeddings-v2-base-de' -Cannot find or load the embedding model -Unknown minutes to batch 82 - -https://huggingface.co/aari1995/German_Semantic_STS_V2 -sentence_transformer_model = 'aari1995/German_Semantic_STS_V2' -75 minutes to batch 82 - -https://huggingface.co/Sahajtomar/German-semantic -sentence_transformer_model = 'Sahajtomar/German-semantic' -72 minutes to batch 82 - -https://huggingface.co/svalabs/german-gpl-adapted-covid -ntence_transformer_model = 'svalabs/german-gpl-adapted-covid' -2 minutes to batch 82 - -https://huggingface.co/PM-AI/bi-encoder_msmarco_bert-base_german -sentence_transformer_model = 'PM-AI/bi-encoder_msmarco_bert-base_german' -14 minutes to batch 82 - -https://huggingface.co/JoBeer/german-semantic-base -sentence_transformer_model = 'JoBeer/german-semantic-base' -22 minutes to batch 82 -``` diff --git a/research_log.md b/research_log.md new file mode 100644 index 0000000..a6d31ed --- /dev/null +++ b/research_log.md @@ -0,0 +1,48 @@ +## Sentence Transformers Statistics + +``` +basic_transformer_models = [ + "all-MiniLM-L6-v2", + "xlm-clm-ende-1024", + "xlm-mlm-ende-1024", + "bert-base-german-cased", + "bert-base-german-dbmdz-cased", + 
"bert-base-german-dbmdz-uncased", + "distilbert-base-german-cased", + "xlm-roberta-large-finetuned-conll03-german", + "deutsche-telekom/gbert-large-paraphrase-cosine" +] + +https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2 +sentence_transformer_model = "all-MiniLM-L6-v2" +3 minutes to batch 82 + +https://huggingface.co/deutsche-telekom/gbert-large-paraphrase-cosine +sentence_transformer_model = 'deutsche-telekom/gbert-large-paraphrase-cosine' +76 minutes to batch 82 + +https://huggingface.co/jinaai/jina-embeddings-v2-base-de +sentence_transformer_model = 'jinaai/jina-embeddings-v2-base-de' +Cannot find or load the embedding model +Unknown minutes to batch 82 + +https://huggingface.co/aari1995/German_Semantic_STS_V2 +sentence_transformer_model = 'aari1995/German_Semantic_STS_V2' +75 minutes to batch 82 + +https://huggingface.co/Sahajtomar/German-semantic +sentence_transformer_model = 'Sahajtomar/German-semantic' +72 minutes to batch 82 + +https://huggingface.co/svalabs/german-gpl-adapted-covid +sentence_transformer_model = 'svalabs/german-gpl-adapted-covid' +2 minutes to batch 82 + +https://huggingface.co/PM-AI/bi-encoder_msmarco_bert-base_german +sentence_transformer_model = 'PM-AI/bi-encoder_msmarco_bert-base_german' +14 minutes to batch 82 + +https://huggingface.co/JoBeer/german-semantic-base +sentence_transformer_model = 'JoBeer/german-semantic-base' +22 minutes to batch 82 +``` From 0a5e2be984558f494fed0b82f597cc73be0e5b62 Mon Sep 17 00:00:00 2001 From: Robert Timm Date: Mon, 5 Feb 2024 08:14:41 +0000 Subject: [PATCH 16/51] feat: set top_k via api --- gswikichat/api.py | 4 ++-- gswikichat/vector_store_interface.py | 5 +---- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/gswikichat/api.py b/gswikichat/api.py index ce2a144..8ea3097 100644 --- a/gswikichat/api.py +++ b/gswikichat/api.py @@ -20,7 +20,7 @@ async def root(): @app.get("/api") -async def api(q): +async def api(q, top_k = 3): print("query: ", q) query = Document(content=q) @@ 
-31,7 +31,7 @@ async def api(q): retrieverResults = retriever.run( query_embedding=list(queryEmbedding), filters=None, - top_k=3, + top_k=top_k, scale_score=None, return_embedding=None ) diff --git a/gswikichat/vector_store_interface.py b/gswikichat/vector_store_interface.py index a470a0a..8cb1b28 100644 --- a/gswikichat/vector_store_interface.py +++ b/gswikichat/vector_store_interface.py @@ -105,8 +105,5 @@ documentsDict = [Document.to_dict(d) for d in embedded['documents']] json.dump(documentsDict, f) -retriever = InMemoryEmbeddingRetriever( - document_store=document_store, - top_k=top_k -) +retriever = InMemoryEmbeddingRetriever(document_store=document_store) From 332e3dc17b3a64506f4de883d0180de126dd8421 Mon Sep 17 00:00:00 2001 From: Robert Timm Date: Mon, 5 Feb 2024 08:22:58 +0000 Subject: [PATCH 17/51] feat: support en and de on the api to switch prompts --- gswikichat/api.py | 14 ++++++++++---- gswikichat/prompt.py | 23 +++++++++++++++++++++-- 2 files changed, 31 insertions(+), 6 deletions(-) diff --git a/gswikichat/api.py b/gswikichat/api.py index 8ea3097..a68f829 100644 --- a/gswikichat/api.py +++ b/gswikichat/api.py @@ -20,8 +20,13 @@ async def root(): @app.get("/api") -async def api(q, top_k = 3): - print("query: ", q) +async def api(q, top_k = 3, lang = 'en'): + if not lang in ['en', 'de']: + raise Exception("language must be 'en' or 'de'") + + print(f"{q=}") + print(f"{top_k=}") + print(f"{lang=}") query = Document(content=q) @@ -40,10 +45,11 @@ async def api(q, top_k = 3): for retrieverResult in retrieverResults: print(retrieverResult) - promptBuild = prompt_builder.run(question=q, documents=retrieverResults['documents']) + promptBuilder = prompt_builder[lang] + promptBuild = promptBuilder.run(question=q, documents=retrieverResults['documents']) prompt = promptBuild['prompt'] - print("prompt: ", prompt) + print(f"{prompt=}") response = llm.run(prompt=prompt, generation_kwargs=None) diff --git a/gswikichat/prompt.py b/gswikichat/prompt.py index 
36a6ebb..a9ea71b 100644 --- a/gswikichat/prompt.py +++ b/gswikichat/prompt.py @@ -9,7 +9,7 @@ # {% endfor %} # """ -prompt_template = """ +prompt_template_en = """ <|system|> You are a helpful assistant. You answer questions based on the given documents. Answer based on the documents only. If the information is not in the documents, @@ -25,6 +25,22 @@ <|assistant|> """ +prompt_template_de = """ +<|system|> +Du bist ein hilfreicher Assistent. Du beantwortest Fragen basierend auf den vorliegenden Dokumenten. +Beantworte basierend auf den Dokumenten nur. Wenn die Information nicht in den Dokumenten ist, +sage, dass du sie nicht finden kannst. +<|endoftext|> +<|user|> +Dokumente: +{% for doc in documents %} + {{ doc.content }} +{% endfor %} +Mit diesen Dokumenten, beantworte die folgende Frage: {{question}} +<|endoftext|> +<|assistant|> +""" + # prompt_template = """ # Given these documents, answer the question. Answer in a full sentence. Give the response only, no explanation. Don't mention the documents. 
# Documents: @@ -33,4 +49,7 @@ # {% endfor %} # """ -prompt_builder = PromptBuilder(template=prompt_template) +prompt_builder = { + 'en': PromptBuilder(template=prompt_template_en), + 'de': PromptBuilder(template=prompt_template_de), +} From 6225fccfcc975f3ec2fa51b72bf914202c4488e4 Mon Sep 17 00:00:00 2001 From: Robert Timm Date: Mon, 5 Feb 2024 08:24:03 +0000 Subject: [PATCH 18/51] feat: cache embedding model during docker build --- Dockerfile | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Dockerfile b/Dockerfile index 796ccc4..9efb49c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -51,6 +51,11 @@ COPY --chmod=755 requirements.txt requirements.txt RUN pip install -r requirements.txt +# Load sentence-transformers model once in order to cache it in the image +# TODO: ARG / ENV for embedder model +RUN echo "from haystack.components.embedders import SentenceTransformersDocumentEmbedder\nSentenceTransformersDocumentEmbedder(model='svalabs/german-gpl-adapted-covid').warm_up()" | python3 + + # Install frontend dependencies COPY --chmod=755 frontend/package.json frontend/package.json COPY --chmod=755 frontend/yarn.lock frontend/yarn.lock From 4877807ef0212dc0ca531e6f0f7bdb5a88a7e82a Mon Sep 17 00:00:00 2001 From: Robert Timm Date: Mon, 5 Feb 2024 08:24:18 +0000 Subject: [PATCH 19/51] wip: smaller chunk size, 5 sentences for now --- gswikichat/vector_store_interface.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gswikichat/vector_store_interface.py b/gswikichat/vector_store_interface.py index 8cb1b28..3b1f4b0 100644 --- a/gswikichat/vector_store_interface.py +++ b/gswikichat/vector_store_interface.py @@ -64,7 +64,7 @@ # remove_repeated_substrings=False) # input_documents = cleaner.run(input_documents)['documents'] -splitter = DocumentSplitter(split_by="sentence", split_length=20, split_overlap=0) +splitter = DocumentSplitter(split_by="sentence", split_length=5, split_overlap=0) input_documents = splitter.run(input_documents)['documents'] 
document_store = InMemoryDocumentStore( From da9859dc2fd5db1e0d11be0d611dc32f697853bb Mon Sep 17 00:00:00 2001 From: Robert Timm Date: Mon, 5 Feb 2024 08:24:35 +0000 Subject: [PATCH 20/51] chore: remove comment --- gswikichat/llm_config.py | 1 - 1 file changed, 1 deletion(-) diff --git a/gswikichat/llm_config.py b/gswikichat/llm_config.py index 10ded24..1e86174 100644 --- a/gswikichat/llm_config.py +++ b/gswikichat/llm_config.py @@ -1,7 +1,6 @@ import os from haystack_integrations.components.generators.ollama import OllamaGenerator -# TODO: discolm prompt https://huggingface.co/DiscoResearch/DiscoLM_German_7b_v1 print(f"Setting up ollama with {os.getenv('MODEL')}") llm = OllamaGenerator( model=os.getenv("MODEL"), From 291aaaf703167a6ef3e14acdab8c4b8995b62522 Mon Sep 17 00:00:00 2001 From: Robert Timm Date: Fri, 9 Feb 2024 09:33:06 +0000 Subject: [PATCH 21/51] feat: enable embeddings cache (for developmnet) --- gswikichat/vector_store_interface.py | 34 ++++++++++++++++------------ 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/gswikichat/vector_store_interface.py b/gswikichat/vector_store_interface.py index 3b1f4b0..e348047 100644 --- a/gswikichat/vector_store_interface.py +++ b/gswikichat/vector_store_interface.py @@ -12,6 +12,8 @@ from haystack.components.preprocessors import DocumentCleaner HUGGING_FACE_HUB_TOKEN = os.environ.get('HUGGING_FACE_HUB_TOKEN') + +# disable this line to disable the embedding cache EMBEDDING_CACHE_FILE = '/tmp/gbnc_embeddings.json' top_k = 5 @@ -83,27 +85,29 @@ embedder.warm_up() -# if os.path.isfile(EMBEDDING_CACHE_FILE): -# print("[INFO] Loading embeddings from cache") -# -# with open(EMBEDDING_CACHE_FILE, 'r') as f: -# documentsDict = json.load(f) -# document_store.write_documents( -# documents=[Document.from_dict(d) for d in documentsDict], -# policy=DuplicatePolicy.OVERWRITE -# ) -# -# else: -if True: +if EMBEDDING_CACHE_FILE and os.path.isfile(EMBEDDING_CACHE_FILE): + print("[INFO] Loading embeddings from cache") + 
+ with open(EMBEDDING_CACHE_FILE, 'r') as f: + documentsDict = json.load(f) + document_store.write_documents( + documents=[Document.from_dict(d) for d in documentsDict], + policy=DuplicatePolicy.OVERWRITE + ) + +else: + print("[INFO] Generating embeddings") + embedded = embedder.run(input_documents) document_store.write_documents( documents=embedded['documents'], policy=DuplicatePolicy.OVERWRITE ) - with open(EMBEDDING_CACHE_FILE, 'w') as f: - documentsDict = [Document.to_dict(d) for d in embedded['documents']] - json.dump(documentsDict, f) + if EMBEDDING_CACHE_FILE: + with open(EMBEDDING_CACHE_FILE, 'w') as f: + documentsDict = [Document.to_dict(d) for d in embedded['documents']] + json.dump(documentsDict, f) retriever = InMemoryEmbeddingRetriever(document_store=document_store) From 936d83ebe802349f5bdde67ae80180b733e52ae7 Mon Sep 17 00:00:00 2001 From: Robert Timm Date: Fri, 9 Feb 2024 09:33:19 +0000 Subject: [PATCH 22/51] feat: add document cleaner --- gswikichat/vector_store_interface.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/gswikichat/vector_store_interface.py b/gswikichat/vector_store_interface.py index e348047..36f0760 100644 --- a/gswikichat/vector_store_interface.py +++ b/gswikichat/vector_store_interface.py @@ -60,15 +60,16 @@ ), ] -# cleaner = DocumentCleaner( -# remove_empty_lines=True, -# remove_extra_whitespaces=True, -# remove_repeated_substrings=False) -# input_documents = cleaner.run(input_documents)['documents'] - splitter = DocumentSplitter(split_by="sentence", split_length=5, split_overlap=0) input_documents = splitter.run(input_documents)['documents'] +cleaner = DocumentCleaner( + remove_empty_lines=True, + remove_extra_whitespaces=True, + remove_repeated_substrings=False) +input_documents = cleaner.run(input_documents)['documents'] + + document_store = InMemoryDocumentStore( embedding_similarity_function="cosine", # embedding_dim=768, From 3e0b8f4957072a26c71dde12d05cb5dccd4cd418 Mon Sep 17 00:00:00 
2001 From: Robert Timm Date: Fri, 9 Feb 2024 09:36:21 +0000 Subject: [PATCH 23/51] docs: long docker run options --- README.md | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 8aa376c..d30cc74 100644 --- a/README.md +++ b/README.md @@ -10,11 +10,13 @@ To build and run the container locally with hot reload on python files do: ``` DOCKER_BUILDKIT=1 docker build . -t gbnc docker run \ - -v "$(pwd)/gswikichat":/workspace/gswikichat \ - -v "$(pwd)/cache":/root/.cache \ - -e HUGGING_FACE_HUB_TOKEN=$HUGGING_FACE_HUB_TOKEN - -p 8000:8000 \ - --rm -it \ + --env HUGGING_FACE_HUB_TOKEN=$HUGGING_FACE_HUB_TOKEN \ + --volume "$(pwd)/gswikichat":/workspace/gswikichat \ + --volume "$(pwd)/cache":/root/.cache \ + --publish 8000:8000 \ + --rm \ + --interactive \ + --tty \ --name gbnc \ gbnc ``` From edf5eb279b6747ecf6efc7a3120fdc16f54e1fc1 Mon Sep 17 00:00:00 2001 From: Robert Timm Date: Fri, 9 Feb 2024 09:40:34 +0000 Subject: [PATCH 24/51] fix: access mode --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 9efb49c..0b6cf1c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -67,7 +67,7 @@ COPY --chmod=755 json_input json_input # Copy backend for production -COPY --chmod=644 gswikichat gswikichat +COPY --chmod=755 gswikichat gswikichat # Copy and build frontend for production (into the frontend/dist folder) From 63baf2b09d2c135ec8283f66b28cd0c9f683fd29 Mon Sep 17 00:00:00 2001 From: Robert Timm Date: Fri, 9 Feb 2024 10:47:46 +0000 Subject: [PATCH 25/51] fix: redraw loading animation on subsequent searches --- frontend/src/views/ChatView.vue | 1 + 1 file changed, 1 insertion(+) diff --git a/frontend/src/views/ChatView.vue b/frontend/src/views/ChatView.vue index 518a88f..9d79ddb 100644 --- a/frontend/src/views/ChatView.vue +++ b/frontend/src/views/ChatView.vue @@ -95,6 +95,7 @@ const inputFocused = ref(false) // } function search() { + response.value = undefined; 
displayResponse.value = true fetch(`/api?q=${inputText.value}`) .then((response) => response.json()) From 56a7b8c1d747515c1ae546a5022283b27c1ad5ca Mon Sep 17 00:00:00 2001 From: Robert Timm Date: Fri, 9 Feb 2024 10:49:56 +0000 Subject: [PATCH 26/51] wip: workaround for runpod.io http port forwarding --- gswikichat/api.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gswikichat/api.py b/gswikichat/api.py index a68f829..be2e4f9 100644 --- a/gswikichat/api.py +++ b/gswikichat/api.py @@ -16,7 +16,8 @@ @app.get("/") async def root(): - return RedirectResponse(url="/frontend/dist", status_code=302) + # return RedirectResponse(url="/frontend/dist", status_code=308) + return {} @app.get("/api") From 8e05473e3aeb82795baa6db2d9f315f58860b54d Mon Sep 17 00:00:00 2001 From: Robert Timm Date: Fri, 9 Feb 2024 10:51:39 +0000 Subject: [PATCH 27/51] feat: switch to openchat 7b model --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 6202b20..b180bf9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -38,7 +38,7 @@ ENV PATH="/usr/local/ollama/bin:${PATH}" # Pull a language model (see LICENSE_STABLELM2.txt) -ARG MODEL=stablelm2:1.6b-zephyr +ARG MODEL=openchat ENV MODEL=${MODEL} RUN ollama serve & while ! 
curl http://localhost:11434; do sleep 1; done; ollama pull $MODEL From 22b04d0ad0d2f77281063e317e2e6b1bbd6eea66 Mon Sep 17 00:00:00 2001 From: Jonathan Fraine Date: Fri, 9 Feb 2024 17:41:44 +0100 Subject: [PATCH 28/51] added logging via logger with Handler to api.py; PEP8 formatted api.py --- gswikichat/api.py | 97 ++++++++++++++++++++++++++++------------ gswikichat/llm_config.py | 3 ++ 2 files changed, 72 insertions(+), 28 deletions(-) diff --git a/gswikichat/api.py b/gswikichat/api.py index be2e4f9..d7f84ce 100644 --- a/gswikichat/api.py +++ b/gswikichat/api.py @@ -5,70 +5,111 @@ from .rag import embedder, retriever, prompt_builder, llm, answer_builder from haystack import Document +# TODO: Test if this can be included in the `__init__.py` file +import logging + +logging.basicConfig( + filename='gbnc.log', + encoding='utf-8', + level=logging.DEBUG +) + +logger = logging.getLogger() +logger.setLevel(logging.DEBUG) + +handler = logging.StreamHandler(sys.stdout) +handler.setLevel(logging.DEBUG) +formatter = logging.Formatter( + '%(asctime)s - %(name)s - %(levelname)s - %(message)s' +) +handler.setFormatter(formatter) +logger.addHandler(handler) + +# End Logging Handler Formatting + +homepage = "/frontend/dist" app = FastAPI() app.mount( - "/frontend/dist", - StaticFiles(directory="frontend/dist", html=True), + homepage, + StaticFiles( + directory=homepage, + html=True + ), name="frontend" ) @app.get("/") async def root(): - # return RedirectResponse(url="/frontend/dist", status_code=308) - return {} + return RedirectResponse( + url=homepage, + status_code=308 + ) + # return {} @app.get("/api") -async def api(q, top_k = 3, lang = 'en'): +async def api(query, top_k=3, lang='en'): if not lang in ['en', 'de']: - raise Exception("language must be 'en' or 'de'") + raise Exception("language must be 'en' or 'de'") - print(f"{q=}") - print(f"{top_k=}") - print(f"{lang=}") + logger.debug(f'{query=}') # Assuming we change the input name + logger.debug(f'{top_k=}') + 
logger.debug(f'{top_k=}') - query = Document(content=q) + query = Document(content=query) - queryEmbedded = embedder.run([query]) - queryEmbedding = queryEmbedded['documents'][0].embedding + query_embedded = embedder.run([query]) + query_embedding = query_embedded['documents'][0].embedding - retrieverResults = retriever.run( - query_embedding=list(queryEmbedding), + retreiver_results = retriever.run( + query_embedding=list(query_embedding), filters=None, top_k=top_k, scale_score=None, return_embedding=None ) - print("retriever results:") - for retrieverResult in retrieverResults: - print(retrieverResult) + logger.debug('retriever results:') + for retriever_result in retriever_results: + logger.debug(retriever_result_) + + prompt_builder = prompt_builders[lang] - promptBuilder = prompt_builder[lang] - promptBuild = promptBuilder.run(question=q, documents=retrieverResults['documents']) - prompt = promptBuild['prompt'] + prompt_build = prompt_builder.run( + question=query.content, # As a Document instance, .content returns a string + documents=retriever_results['documents'] + ) + + prompt = prompt_build['prompt'] - print(f"{prompt=}") + logger.debug(f'{prompt=}') response = llm.run(prompt=prompt, generation_kwargs=None) - answerBuild = answer_builder.run( - query=q, + answer_build = answer_builder.run( + query=query.content, # As a Document class, .content returns the string replies=response['replies'], meta=response['meta'], - documents=retrieverResults['documents'], + documents=retriever_results['documents'], pattern=None, reference_pattern=None ) - print("answerBuild", answerBuild) - answer = answerBuild['answers'][0] + logger.debug(f'{answer_build=}') + + answer = answer_build['answers'][0] - sources= [{ "src": d.meta['src'], "content": d.content, "score": d.score } for d in answer.documents] + sources = [ + { + "src": d_.meta['src'], + "content": d_.content, + "score": d_.score + } for d_ in answer.documents + ] - print("answer", answer) + 
logger.debug(f'{answer=}') return { "answer": answer.data, diff --git a/gswikichat/llm_config.py b/gswikichat/llm_config.py index 1e86174..6f1a732 100644 --- a/gswikichat/llm_config.py +++ b/gswikichat/llm_config.py @@ -1,6 +1,9 @@ import os from haystack_integrations.components.generators.ollama import OllamaGenerator +# import logging +# logger = logging.getLogger() + print(f"Setting up ollama with {os.getenv('MODEL')}") llm = OllamaGenerator( model=os.getenv("MODEL"), From 10f6b2191f6bc2d0ac660023eb83f66519ba599e Mon Sep 17 00:00:00 2001 From: Jonathan Fraine Date: Fri, 9 Feb 2024 17:45:17 +0100 Subject: [PATCH 29/51] debugging use of homepage instead of hard coded endpoint values --- gswikichat/api.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gswikichat/api.py b/gswikichat/api.py index d7f84ce..bb27a22 100644 --- a/gswikichat/api.py +++ b/gswikichat/api.py @@ -7,6 +7,7 @@ # TODO: Test if this can be included in the `__init__.py` file import logging +import sys logging.basicConfig( filename='gbnc.log', From bfbd245f1db885c08187a7cbf76355ece17ff223 Mon Sep 17 00:00:00 2001 From: Jonathan Fraine Date: Fri, 9 Feb 2024 18:11:05 +0100 Subject: [PATCH 30/51] returning to previous to restart without errors --- gswikichat/api.py | 32 +++----------------------------- 1 file changed, 3 insertions(+), 29 deletions(-) diff --git a/gswikichat/api.py b/gswikichat/api.py index bb27a22..44ff6b3 100644 --- a/gswikichat/api.py +++ b/gswikichat/api.py @@ -5,38 +5,12 @@ from .rag import embedder, retriever, prompt_builder, llm, answer_builder from haystack import Document -# TODO: Test if this can be included in the `__init__.py` file -import logging -import sys - -logging.basicConfig( - filename='gbnc.log', - encoding='utf-8', - level=logging.DEBUG -) - -logger = logging.getLogger() -logger.setLevel(logging.DEBUG) - -handler = logging.StreamHandler(sys.stdout) -handler.setLevel(logging.DEBUG) -formatter = logging.Formatter( - '%(asctime)s - %(name)s - %(levelname)s - 
%(message)s' -) -handler.setFormatter(formatter) -logger.addHandler(handler) - -# End Logging Handler Formatting - homepage = "/frontend/dist" app = FastAPI() app.mount( - homepage, - StaticFiles( - directory=homepage, - html=True - ), + "/frontend/dist", + StaticFiles(directory="frontend/dist", html=True), name="frontend" ) @@ -44,7 +18,7 @@ @app.get("/") async def root(): return RedirectResponse( - url=homepage, + url="/frontend/dist", status_code=308 ) # return {} From 7b6ba0a29b6ea725b37b11af88620d8d995699ec Mon Sep 17 00:00:00 2001 From: Jonathan Fraine Date: Fri, 9 Feb 2024 18:57:10 +0100 Subject: [PATCH 31/51] renewed app.mount; bug fixed PEP8 changes in api.py; reformatted rag.py inputs; tests timed out --- frontend/src/views/ChatView.vue | 2 +- gswikichat/api.py | 26 ++++++++++++++++++++++---- gswikichat/prompt.py | 6 +++--- gswikichat/rag.py | 4 ---- 4 files changed, 26 insertions(+), 12 deletions(-) diff --git a/frontend/src/views/ChatView.vue b/frontend/src/views/ChatView.vue index 9d79ddb..981d2d7 100644 --- a/frontend/src/views/ChatView.vue +++ b/frontend/src/views/ChatView.vue @@ -97,7 +97,7 @@ const inputFocused = ref(false) function search() { response.value = undefined; displayResponse.value = true - fetch(`/api?q=${inputText.value}`) + fetch(`/api?query=${inputText.value}`) .then((response) => response.json()) .then((data) => { response.value = data diff --git a/gswikichat/api.py b/gswikichat/api.py index 44ff6b3..d820b75 100644 --- a/gswikichat/api.py +++ b/gswikichat/api.py @@ -2,10 +2,28 @@ from fastapi.staticfiles import StaticFiles from fastapi import FastAPI -from .rag import embedder, retriever, prompt_builder, llm, answer_builder +from .rag import answer_builder +from .llm_config import llm +from .prompt import prompt_builders +from .vector_store_interface import embedder, retriever, input_documents + from haystack import Document -homepage = "/frontend/dist" +import logging +import sys + +logger = logging.getLogger() 
+logger.setLevel(logging.DEBUG) + +handler = logging.StreamHandler(sys.stdout) +handler.setLevel(logging.DEBUG) +formatter = logging.Formatter( + '%(asctime)s - %(name)s - %(levelname)s - %(message)s') +handler.setFormatter(formatter) +logger.addHandler(handler) + +static_dir = 'frontend/dist' +homepage = f'/{static_dir}' app = FastAPI() app.mount( @@ -38,7 +56,7 @@ async def api(query, top_k=3, lang='en'): query_embedded = embedder.run([query]) query_embedding = query_embedded['documents'][0].embedding - retreiver_results = retriever.run( + retriever_results = retriever.run( query_embedding=list(query_embedding), filters=None, top_k=top_k, @@ -47,7 +65,7 @@ async def api(query, top_k=3, lang='en'): ) logger.debug('retriever results:') - for retriever_result in retriever_results: + for retriever_result_ in retriever_results: logger.debug(retriever_result_) prompt_builder = prompt_builders[lang] diff --git a/gswikichat/prompt.py b/gswikichat/prompt.py index a9ea71b..a2d82af 100644 --- a/gswikichat/prompt.py +++ b/gswikichat/prompt.py @@ -49,7 +49,7 @@ # {% endfor %} # """ -prompt_builder = { - 'en': PromptBuilder(template=prompt_template_en), - 'de': PromptBuilder(template=prompt_template_de), +prompt_builders = { + 'en': PromptBuilder(template=prompt_template_en), + 'de': PromptBuilder(template=prompt_template_de), } diff --git a/gswikichat/rag.py b/gswikichat/rag.py index 8e198f0..05d7c2b 100644 --- a/gswikichat/rag.py +++ b/gswikichat/rag.py @@ -2,10 +2,6 @@ from haystack import Pipeline from haystack.components.builders.answer_builder import AnswerBuilder -from .llm_config import llm -from .prompt import prompt_builder -from .vector_store_interface import embedder, retriever, input_documents - answer_builder = AnswerBuilder() # rag_pipeline = Pipeline() From 0428f871c16c6a042f5dda41b18a098eeab34e4e Mon Sep 17 00:00:00 2001 From: Jonathan Fraine Date: Fri, 9 Feb 2024 19:10:03 +0100 Subject: [PATCH 32/51] returned to stablelm2 model for testing purposes. 
PEP8 upgrades in api.py included; logger in api.py functional --- Dockerfile | 3 ++- gswikichat/api.py | 18 ++++++++++++++++-- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index 86fc2ac..6d4fa8e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -38,7 +38,8 @@ ENV PATH="/usr/local/ollama/bin:${PATH}" # Pull a language model (see LICENSE_STABLELM2.txt) -ARG MODEL=openchat +# ARG MODEL=openchat +ARG MODEL=stablelm2:1.6b-zephyr ENV MODEL=${MODEL} RUN ollama serve & while ! curl http://localhost:11434; do sleep 1; done; ollama pull $MODEL diff --git a/gswikichat/api.py b/gswikichat/api.py index d820b75..1eaf348 100644 --- a/gswikichat/api.py +++ b/gswikichat/api.py @@ -12,15 +12,29 @@ import logging import sys -logger = logging.getLogger() +# TODO: Test if this can be added to the `__init__.py` file +# TODO: Add volume to Dockerfile for `gbnc_api.log` file +# Source: https://docs.python.org/3/howto/logging.html +logging.basicConfig( + filename='gbnc_api.log', + encoding='utf-8', + level=logging.DEBUG +) + +# Source: https://stackoverflow.com/questions/14058453/ +# making-python-loggers-output-all-messages-to-stdout-in-addition-to-log-file +logger = logging.getLogger('gswikicat api') logger.setLevel(logging.DEBUG) handler = logging.StreamHandler(sys.stdout) handler.setLevel(logging.DEBUG) formatter = logging.Formatter( - '%(asctime)s - %(name)s - %(levelname)s - %(message)s') + '%(asctime)s - %(name)s - %(levelname)s - %(message)s' +) handler.setFormatter(formatter) logger.addHandler(handler) +# End of logging logger configuration + static_dir = 'frontend/dist' homepage = f'/{static_dir}' From 8104dde885b26cc249481c717894e07c99bda895 Mon Sep 17 00:00:00 2001 From: Jonathan Fraine Date: Fri, 9 Feb 2024 19:52:46 +0100 Subject: [PATCH 33/51] added OLLAMA_MODEL_NAME and OLLAMA_URL as environment variables; called them in llm_config.py --- Dockerfile | 13 ++++++++----- gswikichat/api.py | 10 +++++----- gswikichat/llm_config.py | 20 
+++++++++++++++----- gswikichat/rag.py | 3 +-- 4 files changed, 29 insertions(+), 17 deletions(-) diff --git a/Dockerfile b/Dockerfile index 6d4fa8e..438ac1e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -36,12 +36,15 @@ RUN npm install -g yarn COPY --from=ollama /usr/bin/ollama /usr/local/ollama/bin/ollama ENV PATH="/usr/local/ollama/bin:${PATH}" - # Pull a language model (see LICENSE_STABLELM2.txt) -# ARG MODEL=openchat -ARG MODEL=stablelm2:1.6b-zephyr -ENV MODEL=${MODEL} -RUN ollama serve & while ! curl http://localhost:11434; do sleep 1; done; ollama pull $MODEL +# ARG OLLAMA_MODEL_NAME=openchat +ARG OLLAMA_MODEL_NAME=stablelm2:1.6b-zephyr +ARG OLLAMA_URL=http://localhost:11434 + +ENV OLLAMA_MODEL_NAME=${OLLAMA_MODEL_NAME} +ENV OLLAMA_URL=${OLLAMA_URL} + +RUN ollama serve & while ! curl ${OLLAMA_URL}; do sleep 1; done; ollama pull $OLLAMA_MODEL_NAME # Setup the custom API and frontend diff --git a/gswikichat/api.py b/gswikichat/api.py index 1eaf348..f2f6e50 100644 --- a/gswikichat/api.py +++ b/gswikichat/api.py @@ -36,13 +36,13 @@ # End of logging logger configuration -static_dir = 'frontend/dist' -homepage = f'/{static_dir}' +STATIC_DIR = 'frontend/dist' +LANDING_PAGE = f'/{STATIC_DIR}' app = FastAPI() app.mount( - "/frontend/dist", - StaticFiles(directory="frontend/dist", html=True), + LANDING_PAGE, + StaticFiles(directory=STATIC_DIR, html=True), name="frontend" ) @@ -50,7 +50,7 @@ @app.get("/") async def root(): return RedirectResponse( - url="/frontend/dist", + url=LANDING_PAGE, status_code=308 ) # return {} diff --git a/gswikichat/llm_config.py b/gswikichat/llm_config.py index 6f1a732..0179bc1 100644 --- a/gswikichat/llm_config.py +++ b/gswikichat/llm_config.py @@ -1,11 +1,21 @@ import os from haystack_integrations.components.generators.ollama import OllamaGenerator -# import logging -# logger = logging.getLogger() +import logging +logger = logging.getLogger() -print(f"Setting up ollama with {os.getenv('MODEL')}") +OLLAMA_MODEL_NAME = 
os.environ.get("OLLAMA_MODEL_NAME") +OLLAMA_URL = os.environ.get("OLLAMA_URL") +OLLAMA_GENERATE_URL = f"{OLLAMA_URL}/api/generate" + +logger.info(f'Using {OLLAMA_MODEL_NAME=}') +logger.info(f'Endpoint: {OLLAMA_URL=}') +logger.info(f'Generate: {OLLAMA_GENERATE_URL=}') + +logger.debug(f'I AM HERE') + +print(f"Setting up ollama with {OLLAMA_MODEL_NAME}") llm = OllamaGenerator( - model=os.getenv("MODEL"), - url="http://localhost:11434/api/generate" + model=OLLAMA_MODEL_NAME, + url=OLLAMA_GENERATE_URL ) diff --git a/gswikichat/rag.py b/gswikichat/rag.py index 05d7c2b..b9bb392 100644 --- a/gswikichat/rag.py +++ b/gswikichat/rag.py @@ -1,5 +1,4 @@ - -from haystack import Pipeline +# from haystack import Pipeline from haystack.components.builders.answer_builder import AnswerBuilder answer_builder = AnswerBuilder() From fbc45916cddbfc54241d94bf5967048d48570880 Mon Sep 17 00:00:00 2001 From: Jonathan Fraine Date: Fri, 9 Feb 2024 20:05:09 +0100 Subject: [PATCH 34/51] created logger.py to serve get_logger to all modules --- gswikichat/__init__.py | 1 + gswikichat/api.py | 29 +++-------------------------- gswikichat/llm_config.py | 10 +++++++--- gswikichat/logger.py | 30 ++++++++++++++++++++++++++++++ 4 files changed, 41 insertions(+), 29 deletions(-) create mode 100644 gswikichat/logger.py diff --git a/gswikichat/__init__.py b/gswikichat/__init__.py index 0a0e47b..a127f79 100644 --- a/gswikichat/__init__.py +++ b/gswikichat/__init__.py @@ -1 +1,2 @@ +# from .logger import logger from .api import * diff --git a/gswikichat/api.py b/gswikichat/api.py index f2f6e50..882ce1a 100644 --- a/gswikichat/api.py +++ b/gswikichat/api.py @@ -8,33 +8,10 @@ from .vector_store_interface import embedder, retriever, input_documents from haystack import Document +from .logger import get_logger -import logging -import sys - -# TODO: Test if this can be added to the `__init__.py` file -# TODO: Add volume to Dockerfile for `gbnc_api.log` file -# Source: https://docs.python.org/3/howto/logging.html 
-logging.basicConfig( - filename='gbnc_api.log', - encoding='utf-8', - level=logging.DEBUG -) - -# Source: https://stackoverflow.com/questions/14058453/ -# making-python-loggers-output-all-messages-to-stdout-in-addition-to-log-file -logger = logging.getLogger('gswikicat api') -logger.setLevel(logging.DEBUG) - -handler = logging.StreamHandler(sys.stdout) -handler.setLevel(logging.DEBUG) -formatter = logging.Formatter( - '%(asctime)s - %(name)s - %(levelname)s - %(message)s' -) -handler.setFormatter(formatter) -logger.addHandler(handler) -# End of logging logger configuration - +# Create logger instance from base logger config in `logger.py` +logger = get_logger(__name__) STATIC_DIR = 'frontend/dist' LANDING_PAGE = f'/{STATIC_DIR}' diff --git a/gswikichat/llm_config.py b/gswikichat/llm_config.py index 0179bc1..fbcdf91 100644 --- a/gswikichat/llm_config.py +++ b/gswikichat/llm_config.py @@ -1,8 +1,11 @@ import os from haystack_integrations.components.generators.ollama import OllamaGenerator -import logging -logger = logging.getLogger() +from .logger import get_logger + +# Create logger instance from base logger config in `logger.py` +logger = get_logger(__name__) + OLLAMA_MODEL_NAME = os.environ.get("OLLAMA_MODEL_NAME") OLLAMA_URL = os.environ.get("OLLAMA_URL") @@ -14,7 +17,8 @@ logger.debug(f'I AM HERE') -print(f"Setting up ollama with {OLLAMA_MODEL_NAME}") +logger.info(f"Setting up ollama with {OLLAMA_MODEL_NAME}") + llm = OllamaGenerator( model=OLLAMA_MODEL_NAME, url=OLLAMA_GENERATE_URL diff --git a/gswikichat/logger.py b/gswikichat/logger.py new file mode 100644 index 0000000..5d89447 --- /dev/null +++ b/gswikichat/logger.py @@ -0,0 +1,30 @@ +import logging +import sys + + +def get_logger(name): + # Create a logger + # Source: https://docs.python.org/3/howto/logging.html + logging.basicConfig( + filename='gbnc_api.log', + encoding='utf-8', + level=logging.DEBUG + ) + + logger = logging.getLogger(name) + logger.setLevel(logging.DEBUG) # Set the logging level + + # 
Source: stackoverflow.com/questions/14058453/ + # making-python-loggers-output-all-messages- + # to-stdout-in-addition-to-log-file + + # Create console handler and set level to debug + handler = logging.StreamHandler(sys.stdout) + handler.setLevel(logging.DEBUG) + formatter = logging.Formatter( + '%(asctime)s - %(name)s - %(levelname)s - %(message)s' + ) + handler.setFormatter(formatter) + logger.addHandler(handler) + + return logger From caecfd19a33be1ed94b423a5f8d8e3e73a3ac032 Mon Sep 17 00:00:00 2001 From: Jonathan Fraine Date: Fri, 9 Feb 2024 20:17:08 +0100 Subject: [PATCH 35/51] created a rag_pipeline in the rag.py based on the usage in api.py; removed rag_piipeline from api.py; introduced rag_pipeline from rag.py into api.py --- gswikichat/__init__.py | 1 - gswikichat/api.py | 50 +++---------------------- gswikichat/rag.py | 83 ++++++++++++++++++++++++++++++------------ 3 files changed, 65 insertions(+), 69 deletions(-) diff --git a/gswikichat/__init__.py b/gswikichat/__init__.py index a127f79..0a0e47b 100644 --- a/gswikichat/__init__.py +++ b/gswikichat/__init__.py @@ -1,2 +1 @@ -# from .logger import logger from .api import * diff --git a/gswikichat/api.py b/gswikichat/api.py index 882ce1a..e965841 100644 --- a/gswikichat/api.py +++ b/gswikichat/api.py @@ -2,10 +2,7 @@ from fastapi.staticfiles import StaticFiles from fastapi import FastAPI -from .rag import answer_builder -from .llm_config import llm -from .prompt import prompt_builders -from .vector_store_interface import embedder, retriever, input_documents +from .rag import rag_pipeline from haystack import Document from .logger import get_logger @@ -40,51 +37,14 @@ async def api(query, top_k=3, lang='en'): logger.debug(f'{query=}') # Assuming we change the input name logger.debug(f'{top_k=}') - logger.debug(f'{top_k=}') - - query = Document(content=query) + logger.debug(f'{lang=}') - query_embedded = embedder.run([query]) - query_embedding = query_embedded['documents'][0].embedding - - retriever_results 
= retriever.run( - query_embedding=list(query_embedding), - filters=None, + answer = rag_pipeline( + query=query, top_k=top_k, - scale_score=None, - return_embedding=None - ) - - logger.debug('retriever results:') - for retriever_result_ in retriever_results: - logger.debug(retriever_result_) - - prompt_builder = prompt_builders[lang] - - prompt_build = prompt_builder.run( - question=query.content, # As a Document instance, .content returns a string - documents=retriever_results['documents'] + lang=lang ) - prompt = prompt_build['prompt'] - - logger.debug(f'{prompt=}') - - response = llm.run(prompt=prompt, generation_kwargs=None) - - answer_build = answer_builder.run( - query=query.content, # As a Document class, .content returns the string - replies=response['replies'], - meta=response['meta'], - documents=retriever_results['documents'], - pattern=None, - reference_pattern=None - ) - - logger.debug(f'{answer_build=}') - - answer = answer_build['answers'][0] - sources = [ { "src": d_.meta['src'], diff --git a/gswikichat/rag.py b/gswikichat/rag.py index b9bb392..44c7e2b 100644 --- a/gswikichat/rag.py +++ b/gswikichat/rag.py @@ -1,26 +1,63 @@ # from haystack import Pipeline +from haystack import Document from haystack.components.builders.answer_builder import AnswerBuilder -answer_builder = AnswerBuilder() - -# rag_pipeline = Pipeline() -# rag_pipeline.add_component("text_embedder", embedder) -# rag_pipeline.add_component("retriever", retriever) -# # rag_pipeline.add_component("writer", writer) -# rag_pipeline.add_component("prompt_builder", prompt_builder) -# rag_pipeline.add_component("llm", llm) -# rag_pipeline.add_component("answer_builder", answer_builder) - -# # rag_pipeline.connect("embedder", "writer") -# rag_pipeline.connect("retriever.documents", "text_embedder") -# rag_pipeline.connect("retriever", "prompt_builder.documents") -# rag_pipeline.connect("prompt_builder", "llm") -# rag_pipeline.connect("llm.replies", "answer_builder.replies") -# 
rag_pipeline.connect("llm.metadata", "answer_builder.meta") -# rag_pipeline.connect("retriever", "answer_builder.documents") - -# rag_pipeline.run( -# { -# "text_embedder": {"documents": input_documents} -# } -# ) +from .llm_config import llm +from .logger import get_logger +from .prompt import prompt_builders +from .vector_store_interface import embedder, retriever, input_documents + +# Create logger instance from base logger config in `logger.py` +logger = get_logger(__name__) + + +def rag_pipeline(query: str = None, top_k: int = 3, lang: str = 'de'): + + assert (query is not None) + + if isinstance(query, str): + query = Document(content=query) + + assert (isinstance(query, Document)) + + query_embedded = embedder.run([query]) + query_embedding = query_embedded['documents'][0].embedding + + retriever_results = retriever.run( + query_embedding=list(query_embedding), + filters=None, + top_k=top_k, + scale_score=None, + return_embedding=None + ) + + logger.debug('retriever results:') + for retriever_result_ in retriever_results: + logger.debug(retriever_result_) + + prompt_builder = prompt_builders[lang] + + prompt_build = prompt_builder.run( + question=query.content, # As a Document instance, .content returns a string + documents=retriever_results['documents'] + ) + + prompt = prompt_build['prompt'] + + logger.debug(f'{prompt=}') + + response = llm.run(prompt=prompt, generation_kwargs=None) + + answer_builder = AnswerBuilder() + answer_build = answer_builder.run( + query=query.content, # As a Document class, .content returns the string + replies=response['replies'], + meta=response['meta'], + documents=retriever_results['documents'], + pattern=None, + reference_pattern=None + ) + + logger.debug(f'{answer_build=}') + + return answer_build['answers'][0] From 5c0b4d0de1a9a610922a457d97fa928024775e41 Mon Sep 17 00:00:00 2001 From: Jonathan Fraine Date: Fri, 9 Feb 2024 20:55:35 +0100 Subject: [PATCH 36/51] UPdated with PEP8 formatting in vector_store_interface.py --- 
gswikichat/llm_config.py | 1 - gswikichat/prompt.py | 25 ++-------- gswikichat/vector_store_interface.py | 69 +++++++++++++++++----------- 3 files changed, 46 insertions(+), 49 deletions(-) diff --git a/gswikichat/llm_config.py b/gswikichat/llm_config.py index fbcdf91..0ada3ce 100644 --- a/gswikichat/llm_config.py +++ b/gswikichat/llm_config.py @@ -6,7 +6,6 @@ # Create logger instance from base logger config in `logger.py` logger = get_logger(__name__) - OLLAMA_MODEL_NAME = os.environ.get("OLLAMA_MODEL_NAME") OLLAMA_URL = os.environ.get("OLLAMA_URL") OLLAMA_GENERATE_URL = f"{OLLAMA_URL}/api/generate" diff --git a/gswikichat/prompt.py b/gswikichat/prompt.py index a2d82af..d74d3e4 100644 --- a/gswikichat/prompt.py +++ b/gswikichat/prompt.py @@ -1,14 +1,5 @@ from haystack.components.builders.prompt_builder import PromptBuilder -# prompt_template = """ -# Given these documents, answer the question. Answer in a full sentence. Give the response only, no explanation. Don't mention the documents. -# Documents: -# {% for doc in documents %} -# If {{ doc.content }} answers the Question: {{question}} -# Then return {{ doc.meta["src"] }} -# {% endfor %} -# """ - prompt_template_en = """ <|system|> You are a helpful assistant. You answer questions based on the given documents. @@ -17,8 +8,8 @@ <|endoftext|> <|user|> Documents: -{% for doc in documents %} - {{ doc.content }} +{% for doc_ in documents %} + {{ doc_.content }} {% endfor %} With this documents, answer the following question: {{question}} <|endoftext|> @@ -33,22 +24,14 @@ <|endoftext|> <|user|> Dokumente: -{% for doc in documents %} - {{ doc.content }} +{% for doc_ in documents %} + {{ doc_.content }} {% endfor %} Mit diesen Dokumenten, beantworte die folgende Frage: {{question}} <|endoftext|> <|assistant|> """ -# prompt_template = """ -# Given these documents, answer the question. Answer in a full sentence. Give the response only, no explanation. Don't mention the documents. 
-# Documents: -# If {{ doc.content }} answers the Question: {{question}} -# Then only return {{ doc.meta["src"] }} and nothing at all. -# {% endfor %} -# """ - prompt_builders = { 'en': PromptBuilder(template=prompt_template_en), 'de': PromptBuilder(template=prompt_template_de), diff --git a/gswikichat/vector_store_interface.py b/gswikichat/vector_store_interface.py index 36f0760..1aab187 100644 --- a/gswikichat/vector_store_interface.py +++ b/gswikichat/vector_store_interface.py @@ -11,6 +11,12 @@ from haystack.components.preprocessors import DocumentSplitter from haystack.components.preprocessors import DocumentCleaner + +from .logger import get_logger + +# Create logger instance from base logger config in `logger.py` +logger = get_logger(__name__) + HUGGING_FACE_HUB_TOKEN = os.environ.get('HUGGING_FACE_HUB_TOKEN') # disable this line to disable the embedding cache @@ -19,31 +25,33 @@ top_k = 5 input_documents = [] +# TODO: Add the json strings as env variables json_dir = 'json_input' json_fname = 'excellent-articles_10.json' json_fpath = os.path.join(json_dir, json_fname) if os.path.isfile(json_fpath): - print(f'[INFO] Loading data from {json_fpath}') + logger.info(f'Loading data from {json_fpath}') with open(json_fpath, 'r') as finn: json_obj = json.load(finn) if isinstance(json_obj, dict): - for k, v in tqdm(json_obj.items()): - print(f"Loading {k}") - input_documents.append(Document(content=v, meta={"src": k})) - + input_documents = [ + Document( + content=content_, + meta={"src": url_} + ) + for url_, content_ in tqdm(json_obj.items()) + ] elif isinstance(json_obj, list): - for obj_ in tqdm(json_obj): - url = obj_['meta'] - content = obj_['content'] - input_documents.append( - Document( - content=content, - meta={'src': url} - ) + input_documents = [ + Document( + content=obj_['content'], + meta={'src': obj_['meta']} ) + for obj_ in tqdm(json_obj) + ] else: input_documents = [ Document( @@ -60,13 +68,18 @@ ), ] -splitter = 
DocumentSplitter(split_by="sentence", split_length=5, split_overlap=0) +splitter = DocumentSplitter( + split_by="sentence", + split_length=5, + split_overlap=0 +) input_documents = splitter.run(input_documents)['documents'] cleaner = DocumentCleaner( - remove_empty_lines=True, - remove_extra_whitespaces=True, - remove_repeated_substrings=False) + remove_empty_lines=True, + remove_extra_whitespaces=True, + remove_repeated_substrings=False +) input_documents = cleaner.run(input_documents)['documents'] @@ -78,7 +91,7 @@ # https://huggingface.co/svalabs/german-gpl-adapted-covid sentence_transformer_model = 'svalabs/german-gpl-adapted-covid' -print(f'Sentence Transformer Name: {sentence_transformer_model}') +logger.info(f'Sentence Transformer Name: {sentence_transformer_model}') embedder = SentenceTransformersDocumentEmbedder( model=sentence_transformer_model, @@ -87,17 +100,17 @@ if EMBEDDING_CACHE_FILE and os.path.isfile(EMBEDDING_CACHE_FILE): - print("[INFO] Loading embeddings from cache") + logger.info('Loading embeddings from cache') - with open(EMBEDDING_CACHE_FILE, 'r') as f: - documentsDict = json.load(f) + with open(EMBEDDING_CACHE_FILE, 'r') as f_in: + documents_dict = json.load(f_in) document_store.write_documents( - documents=[Document.from_dict(d) for d in documentsDict], + documents=[Document.from_dict(d_) for d_ in documents_dict], policy=DuplicatePolicy.OVERWRITE ) else: - print("[INFO] Generating embeddings") + logger.debug("Generating embeddings") embedded = embedder.run(input_documents) document_store.write_documents( @@ -106,9 +119,11 @@ ) if EMBEDDING_CACHE_FILE: - with open(EMBEDDING_CACHE_FILE, 'w') as f: - documentsDict = [Document.to_dict(d) for d in embedded['documents']] - json.dump(documentsDict, f) + with open(EMBEDDING_CACHE_FILE, 'w') as f_out: + documents_dict = [ + Document.to_dict(d_) + for d_ in embedded['documents'] + ] + json.dump(documents_dict, f_out) retriever = InMemoryEmbeddingRetriever(document_store=document_store) - From 
8833af79e2e771b6578f0b34ae8a74d56edc0e8a Mon Sep 17 00:00:00 2001 From: roti Date: Mon, 12 Feb 2024 07:58:03 +0000 Subject: [PATCH 37/51] chore(Dockerfile): install python deps early To prevent huge redownloads on llm change. --- Dockerfile | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/Dockerfile b/Dockerfile index 438ac1e..dc8d862 100644 --- a/Dockerfile +++ b/Dockerfile @@ -36,6 +36,15 @@ RUN npm install -g yarn COPY --from=ollama /usr/bin/ollama /usr/local/ollama/bin/ollama ENV PATH="/usr/local/ollama/bin:${PATH}" + +# Setup the app in workspace +WORKDIR /workspace + +# Install backend dependencies +COPY --chmod=755 requirements.txt requirements.txt +RUN pip install -r requirements.txt + + # Pull a language model (see LICENSE_STABLELM2.txt) # ARG OLLAMA_MODEL_NAME=openchat ARG OLLAMA_MODEL_NAME=stablelm2:1.6b-zephyr @@ -47,14 +56,6 @@ ENV OLLAMA_URL=${OLLAMA_URL} RUN ollama serve & while ! curl ${OLLAMA_URL}; do sleep 1; done; ollama pull $OLLAMA_MODEL_NAME -# Setup the custom API and frontend -WORKDIR /workspace - -# Install backend dependencies -COPY --chmod=755 requirements.txt requirements.txt -RUN pip install -r requirements.txt - - # Load sentence-transformers model once in order to cache it in the image # TODO: ARG / ENV for embedder model RUN echo "from haystack.components.embedders import SentenceTransformersDocumentEmbedder\nSentenceTransformersDocumentEmbedder(model='svalabs/german-gpl-adapted-covid').warm_up()" | python3 From 9ee8a32b3f417e04c1c1fd51bee97e3ee8997cc6 Mon Sep 17 00:00:00 2001 From: roti Date: Mon, 12 Feb 2024 12:55:12 +0000 Subject: [PATCH 38/51] fix(sentence-transformers): use cuda if available --- gswikichat/vector_store_interface.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/gswikichat/vector_store_interface.py b/gswikichat/vector_store_interface.py index 1aab187..5cb5e66 100644 --- a/gswikichat/vector_store_interface.py +++ b/gswikichat/vector_store_interface.py @@ -11,6 +11,7 
@@ from haystack.components.preprocessors import DocumentSplitter from haystack.components.preprocessors import DocumentCleaner +import torch from .logger import get_logger @@ -25,6 +26,13 @@ top_k = 5 input_documents = [] +device = "cpu" + +if torch.cuda.is_available(): + logger.info('GPU is available.') + device = "cuda" + + # TODO: Add the json strings as env variables json_dir = 'json_input' json_fname = 'excellent-articles_10.json' @@ -95,6 +103,7 @@ embedder = SentenceTransformersDocumentEmbedder( model=sentence_transformer_model, + device=device ) embedder.warm_up() From b2357e3f0bd8611115975ec82d7138e1b555f4a8 Mon Sep 17 00:00:00 2001 From: roti Date: Mon, 12 Feb 2024 16:00:07 +0000 Subject: [PATCH 39/51] fix(frontend): run from webserver root --- frontend/vite.config.ts | 2 +- gswikichat/api.py | 22 ++++++++++------------ 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/frontend/vite.config.ts b/frontend/vite.config.ts index c29f416..0f4d1b0 100644 --- a/frontend/vite.config.ts +++ b/frontend/vite.config.ts @@ -5,7 +5,7 @@ import vue from '@vitejs/plugin-vue' // https://vitejs.dev/config/ export default defineConfig({ - base: '/frontend/dist', + base: '/', plugins: [ vue(), ], diff --git a/gswikichat/api.py b/gswikichat/api.py index e965841..749ffa7 100644 --- a/gswikichat/api.py +++ b/gswikichat/api.py @@ -1,4 +1,4 @@ -from fastapi.responses import RedirectResponse +from fastapi.responses import FileResponse from fastapi.staticfiles import StaticFiles from fastapi import FastAPI @@ -10,25 +10,23 @@ # Create logger instance from base logger config in `logger.py` logger = get_logger(__name__) -STATIC_DIR = 'frontend/dist' -LANDING_PAGE = f'/{STATIC_DIR}' +FRONTEND_STATIC_DIR = './frontend/dist' app = FastAPI() + app.mount( - LANDING_PAGE, - StaticFiles(directory=STATIC_DIR, html=True), - name="frontend" + "/assets", + StaticFiles(directory=f"{FRONTEND_STATIC_DIR}/assets"), + name="frontend-assets" ) - @app.get("/") async def root(): - return 
RedirectResponse( - url=LANDING_PAGE, - status_code=308 - ) - # return {} + return FileResponse(f"{FRONTEND_STATIC_DIR}/index.html") +@app.get("/favicon.ico") +async def favicon(): + return FileResponse(f"{FRONTEND_STATIC_DIR}/favicon.ico") @app.get("/api") async def api(query, top_k=3, lang='en'): From b518abf0813aea96db3663309b4bdc0023923847 Mon Sep 17 00:00:00 2001 From: roti Date: Mon, 12 Feb 2024 16:33:18 +0000 Subject: [PATCH 40/51] feat: store embedding cache in volume --- README.md | 2 +- gswikichat/vector_store_interface.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index d30cc74..d46be0d 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ DOCKER_BUILDKIT=1 docker build . -t gbnc docker run \ --env HUGGING_FACE_HUB_TOKEN=$HUGGING_FACE_HUB_TOKEN \ --volume "$(pwd)/gswikichat":/workspace/gswikichat \ - --volume "$(pwd)/cache":/root/.cache \ + --volume gbnc_cache:/root/.cache --publish 8000:8000 \ --rm \ --interactive \ diff --git a/gswikichat/vector_store_interface.py b/gswikichat/vector_store_interface.py index 5cb5e66..95d52db 100644 --- a/gswikichat/vector_store_interface.py +++ b/gswikichat/vector_store_interface.py @@ -21,7 +21,7 @@ HUGGING_FACE_HUB_TOKEN = os.environ.get('HUGGING_FACE_HUB_TOKEN') # disable this line to disable the embedding cache -EMBEDDING_CACHE_FILE = '/tmp/gbnc_embeddings.json' +EMBEDDING_CACHE_FILE = '/root/.cache/gbnc_embeddings.json' top_k = 5 input_documents = [] From 69800b079af08ec9c11187e519b201d8b36d2d56 Mon Sep 17 00:00:00 2001 From: roti Date: Mon, 12 Feb 2024 22:29:39 +0000 Subject: [PATCH 41/51] feat(start.sh): pull llm using ollama (if not built into container) closes #35 --- start.sh | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/start.sh b/start.sh index cf21953..b7a27f8 100644 --- a/start.sh +++ b/start.sh @@ -1,11 +1,13 @@ #!/bin/bash +set -e + if [[ $PUBLIC_KEY ]] then mkdir -p ~/.ssh chmod 700 ~/.ssh cd ~/.ssh - echo $PUBLIC_KEY 
>> authorized_keys + echo "$PUBLIC_KEY" >> authorized_keys chmod 700 -R ~/.ssh cd / service ssh start @@ -16,10 +18,17 @@ fi echo "Starting ollama" ollama serve & +while ! curl "$OLLAMA_URL"; do + sleep 1 +done + +echo "Pulling $OLLAMA_MODEL_NAME from ollama library" +ollama pull "$OLLAMA_MODEL_NAME" + cd /workspace echo "Starting api" uvicorn gswikichat:app --reload --host 0.0.0.0 --port 8000 & -echo "Sleeping..." +echo "Ready" sleep infinity From 7803649bb5c718bcfd9cfc795d0d400bc13f8ac9 Mon Sep 17 00:00:00 2001 From: roti Date: Mon, 12 Feb 2024 22:34:10 +0000 Subject: [PATCH 42/51] feat(ollama): use chat api to leverage prompt templates closes #34 --- gswikichat/api.py | 2 +- gswikichat/llm_config.py | 13 ++++++------- gswikichat/prompt.py | 24 +++++++----------------- gswikichat/rag.py | 40 +++++++++++++++++++++++----------------- 4 files changed, 37 insertions(+), 42 deletions(-) diff --git a/gswikichat/api.py b/gswikichat/api.py index 749ffa7..c97ff2a 100644 --- a/gswikichat/api.py +++ b/gswikichat/api.py @@ -54,6 +54,6 @@ async def api(query, top_k=3, lang='en'): logger.debug(f'{answer=}') return { - "answer": answer.data, + "answer": answer.data.content, "sources": sources } diff --git a/gswikichat/llm_config.py b/gswikichat/llm_config.py index 0ada3ce..5af6c1c 100644 --- a/gswikichat/llm_config.py +++ b/gswikichat/llm_config.py @@ -1,5 +1,5 @@ import os -from haystack_integrations.components.generators.ollama import OllamaGenerator +from haystack_integrations.components.generators.ollama import OllamaChatGenerator from .logger import get_logger @@ -8,17 +8,16 @@ OLLAMA_MODEL_NAME = os.environ.get("OLLAMA_MODEL_NAME") OLLAMA_URL = os.environ.get("OLLAMA_URL") -OLLAMA_GENERATE_URL = f"{OLLAMA_URL}/api/generate" +OLLAMA_CHAT_URL = f"{OLLAMA_URL}/api/chat" logger.info(f'Using {OLLAMA_MODEL_NAME=}') logger.info(f'Endpoint: {OLLAMA_URL=}') -logger.info(f'Generate: {OLLAMA_GENERATE_URL=}') - -logger.debug(f'I AM HERE') +logger.info(f'Generate: {OLLAMA_CHAT_URL=}') 
logger.info(f"Setting up ollama with {OLLAMA_MODEL_NAME}") -llm = OllamaGenerator( +llm = OllamaChatGenerator( model=OLLAMA_MODEL_NAME, - url=OLLAMA_GENERATE_URL + url=OLLAMA_CHAT_URL, + timeout=120 ) diff --git a/gswikichat/prompt.py b/gswikichat/prompt.py index d74d3e4..d3306fc 100644 --- a/gswikichat/prompt.py +++ b/gswikichat/prompt.py @@ -1,38 +1,28 @@ from haystack.components.builders.prompt_builder import PromptBuilder prompt_template_en = """ -<|system|> -You are a helpful assistant. You answer questions based on the given documents. -Answer based on the documents only. If the information is not in the documents, -say that you cannot find the information. -<|endoftext|> -<|user|> Documents: {% for doc_ in documents %} {{ doc_.content }} {% endfor %} With this documents, answer the following question: {{question}} -<|endoftext|> -<|assistant|> """ prompt_template_de = """ -<|system|> -Du bist ein hilfreicher Assistent. Du beantwortest Fragen basierend auf den vorliegenden Dokumenten. -Beantworte basierend auf den Dokumenten nur. Wenn die Information nicht in den Dokumenten ist, -sage, dass du sie nicht finden kannst. -<|endoftext|> -<|user|> Dokumente: {% for doc_ in documents %} {{ doc_.content }} {% endfor %} Mit diesen Dokumenten, beantworte die folgende Frage: {{question}} -<|endoftext|> -<|assistant|> """ -prompt_builders = { +system_prompts = { + 'en': 'You are a helpful assistant. You answer questions based on the given documents. Answer based on the documents only. If the information is not in the documents, say that you cannot find the information.', + 'de': 'Du bist ein hilfreicher Assistent. Du beantwortest Fragen basierend auf den vorliegenden Dokumenten. Beantworte basierend auf den Dokumenten nur. 
Wenn die Information nicht in den Dokumenten ist, sage, dass du sie nicht finden kannst.', +} + +user_prompt_builders = { 'en': PromptBuilder(template=prompt_template_en), 'de': PromptBuilder(template=prompt_template_de), } + diff --git a/gswikichat/rag.py b/gswikichat/rag.py index 44c7e2b..b916686 100644 --- a/gswikichat/rag.py +++ b/gswikichat/rag.py @@ -1,26 +1,21 @@ # from haystack import Pipeline from haystack import Document from haystack.components.builders.answer_builder import AnswerBuilder +from haystack.dataclasses import ChatMessage from .llm_config import llm from .logger import get_logger -from .prompt import prompt_builders +from .prompt import user_prompt_builders, system_prompts from .vector_store_interface import embedder, retriever, input_documents # Create logger instance from base logger config in `logger.py` logger = get_logger(__name__) -def rag_pipeline(query: str = None, top_k: int = 3, lang: str = 'de'): +def rag_pipeline(query: str, top_k: int = 3, lang: str = 'de'): - assert (query is not None) - - if isinstance(query, str): - query = Document(content=query) - - assert (isinstance(query, Document)) - - query_embedded = embedder.run([query]) + query_document = Document(content=query) + query_embedded = embedder.run([query_document]) query_embedding = query_embedded['documents'][0].embedding retriever_results = retriever.run( @@ -35,24 +30,35 @@ def rag_pipeline(query: str = None, top_k: int = 3, lang: str = 'de'): for retriever_result_ in retriever_results: logger.debug(retriever_result_) - prompt_builder = prompt_builders[lang] + system_prompt = system_prompts[lang] + user_prompt_builder = user_prompt_builders[lang] - prompt_build = prompt_builder.run( - question=query.content, # As a Document instance, .content returns a string + user_prompt_build = user_prompt_builder.run( + question=query_document.content, documents=retriever_results['documents'] ) - prompt = prompt_build['prompt'] + prompt = user_prompt_build['prompt'] 
logger.debug(f'{prompt=}') - response = llm.run(prompt=prompt, generation_kwargs=None) + messages = [ + ChatMessage.from_system(system_prompt), + ChatMessage.from_user(prompt), + ] + + response = llm.run( + messages, + # generation_kwargs={"temperature": 0.2} + ) + + logger.debug(response) answer_builder = AnswerBuilder() answer_build = answer_builder.run( - query=query.content, # As a Document class, .content returns the string + query=query_document.content, replies=response['replies'], - meta=response['meta'], + meta=[r.meta for r in response['replies']], documents=retriever_results['documents'], pattern=None, reference_pattern=None From ff1fcab50294382484db5ce33a7f5a144ad76f1b Mon Sep 17 00:00:00 2001 From: Robert Timm Date: Mon, 19 Feb 2024 21:29:38 +0000 Subject: [PATCH 43/51] docs: fix run cmd --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d46be0d..6f04ed3 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ DOCKER_BUILDKIT=1 docker build . 
-t gbnc docker run \ --env HUGGING_FACE_HUB_TOKEN=$HUGGING_FACE_HUB_TOKEN \ --volume "$(pwd)/gswikichat":/workspace/gswikichat \ - --volume gbnc_cache:/root/.cache + --volume gbnc_cache:/root/.cache \ --publish 8000:8000 \ --rm \ --interactive \ From 8c0a2cb6f928967a0015e5b0fa217b2a5e5e5e0b Mon Sep 17 00:00:00 2001 From: Robert Timm Date: Sun, 25 Feb 2024 22:45:07 +0000 Subject: [PATCH 44/51] wip: postgres vecto.rs db, s/haystack/langchain --- Dockerfile | 19 +++- gswikichat/rag.py | 27 +++--- gswikichat/vector_store_interface.py | 136 ++++++++++----------------- requirements.txt | 6 ++ start.sh | 31 ++++++ 5 files changed, 113 insertions(+), 106 deletions(-) diff --git a/Dockerfile b/Dockerfile index dc8d862..d9eeac1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -12,18 +12,31 @@ FROM $CUDA_FROM ENV PATH="/usr/local/cuda/bin:${PATH}" +# Install unattendedly +ENV DEBIAN_FRONTEND=noninteractive + +# Force a config for tzdata package, otherwise it will interactively ask during install +RUN ln -fs /usr/share/zoneinfo/UTC /etc/localtime + # Install essential packages from ubuntu repository RUN apt-get update -y && \ apt-get install -y --no-install-recommends openssh-server openssh-client git git-lfs && \ apt-get install -y curl && \ apt-get install -y python3 python3-pip python3-venv && \ + apt-get install -y postgresql-14 && \ + apt-get install -y jq && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* +# Install vecto.rs extension to postgres +RUN curl -L -O https://github.com/tensorchord/pgvecto.rs/releases/download/v0.2.0/vectors-pg14_0.2.0_amd64.deb +RUN dpkg -i vectors-pg14_0.2.0_amd64.deb + + # Install node from upstream, ubuntu packages are too old -RUN curl -sL https://deb.nodesource.com/setup_18.x | bash -RUN apt-get install -y nodejs && \ +RUN curl -sL https://deb.nodesource.com/setup_18.x | bash && \ + apt-get install -y nodejs && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* @@ -53,11 +66,13 @@ ARG OLLAMA_URL=http://localhost:11434 ENV 
OLLAMA_MODEL_NAME=${OLLAMA_MODEL_NAME} ENV OLLAMA_URL=${OLLAMA_URL} +# TODO: cache path RUN ollama serve & while ! curl ${OLLAMA_URL}; do sleep 1; done; ollama pull $OLLAMA_MODEL_NAME # Load sentence-transformers model once in order to cache it in the image # TODO: ARG / ENV for embedder model +# TODO: SENTENCE_TRANSFORMERS_HOME for cache path RUN echo "from haystack.components.embedders import SentenceTransformersDocumentEmbedder\nSentenceTransformersDocumentEmbedder(model='svalabs/german-gpl-adapted-covid').warm_up()" | python3 diff --git a/gswikichat/rag.py b/gswikichat/rag.py index b916686..cb84e06 100644 --- a/gswikichat/rag.py +++ b/gswikichat/rag.py @@ -6,7 +6,7 @@ from .llm_config import llm from .logger import get_logger from .prompt import user_prompt_builders, system_prompts -from .vector_store_interface import embedder, retriever, input_documents +from .vector_store_interface import db # Create logger instance from base logger config in `logger.py` logger = get_logger(__name__) @@ -14,28 +14,23 @@ def rag_pipeline(query: str, top_k: int = 3, lang: str = 'de'): - query_document = Document(content=query) - query_embedded = embedder.run([query_document]) - query_embedding = query_embedded['documents'][0].embedding + docs_with_score = db.similarity_search_with_score(query, top_k) - retriever_results = retriever.run( - query_embedding=list(query_embedding), - filters=None, - top_k=top_k, - scale_score=None, - return_embedding=None - ) + for doc, score in docs_with_score: + print("-" * 80) + print("Score: ", score) + print(doc.page_content) + print("-" * 80) - logger.debug('retriever results:') - for retriever_result_ in retriever_results: - logger.debug(retriever_result_) + # langchain doc to haystack doc + docs = [Document.from_dict({"content":d.page_content,"meta":d.metadata}) for d,_ in docs_with_score] system_prompt = system_prompts[lang] user_prompt_builder = user_prompt_builders[lang] user_prompt_build = user_prompt_builder.run( - 
question=query_document.content, - documents=retriever_results['documents'] + question=query, + documents=docs ) prompt = user_prompt_build['prompt'] diff --git a/gswikichat/vector_store_interface.py b/gswikichat/vector_store_interface.py index 95d52db..e329b50 100644 --- a/gswikichat/vector_store_interface.py +++ b/gswikichat/vector_store_interface.py @@ -2,14 +2,13 @@ import json from tqdm import tqdm +from pprint import pprint -from haystack import Document # , Pipeline -from haystack.components.embedders import SentenceTransformersDocumentEmbedder -from haystack.document_stores.in_memory import InMemoryDocumentStore -from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever -from haystack.document_stores.types.policy import DuplicatePolicy -from haystack.components.preprocessors import DocumentSplitter -from haystack.components.preprocessors import DocumentCleaner +from langchain.docstore.document import Document +from langchain.text_splitter import CharacterTextSplitter +from langchain_community.document_loaders import JSONLoader +from langchain_community.embeddings.fake import FakeEmbeddings +from langchain_community.embeddings import HuggingFaceEmbeddings import torch @@ -35,104 +34,65 @@ # TODO: Add the json strings as env variables json_dir = 'json_input' -json_fname = 'excellent-articles_10.json' +json_fname = 'data.json' json_fpath = os.path.join(json_dir, json_fname) -if os.path.isfile(json_fpath): - logger.info(f'Loading data from {json_fpath}') - with open(json_fpath, 'r') as finn: - json_obj = json.load(finn) - - if isinstance(json_obj, dict): - input_documents = [ - Document( - content=content_, - meta={"src": url_} - ) - for url_, content_ in tqdm(json_obj.items()) - ] - elif isinstance(json_obj, list): - input_documents = [ - Document( - content=obj_['content'], - meta={'src': obj_['meta']} - ) - for obj_ in tqdm(json_obj) - ] -else: - input_documents = [ - Document( - content="My name is Asra, I live in Paris.", - 
meta={"src": "doc_1"} - ), - Document( - content="My name is Lee, I live in Berlin.", - meta={"src": "doc2"} - ), - Document( - content="My name is Giorgio, I live in Rome.", - meta={"src": "doc_3"} - ), - ] - -splitter = DocumentSplitter( - split_by="sentence", - split_length=5, - split_overlap=0 -) -input_documents = splitter.run(input_documents)['documents'] +def metadata_func(record: dict, metadata: dict) -> dict: + metadata["source"] = record.get("meta").get("source") + return metadata -cleaner = DocumentCleaner( - remove_empty_lines=True, - remove_extra_whitespaces=True, - remove_repeated_substrings=False +# Create the JSONLoader instance +loader = JSONLoader( + file_path=json_fpath, + jq_schema='.[]', + content_key="content", + metadata_func=metadata_func ) -input_documents = cleaner.run(input_documents)['documents'] +documents = loader.load() +pprint(documents[0]) -document_store = InMemoryDocumentStore( - embedding_similarity_function="cosine", - # embedding_dim=768, - # duplicate_documents="overwrite" -) +text_splitter = CharacterTextSplitter(chunk_size=250, chunk_overlap=0) +docs = text_splitter.split_documents(documents) # https://huggingface.co/svalabs/german-gpl-adapted-covid sentence_transformer_model = 'svalabs/german-gpl-adapted-covid' logger.info(f'Sentence Transformer Name: {sentence_transformer_model}') -embedder = SentenceTransformersDocumentEmbedder( - model=sentence_transformer_model, - device=device +embeddings = HuggingFaceEmbeddings( + model_name=sentence_transformer_model, + model_kwargs={'device': device}, + show_progress=True, ) -embedder.warm_up() +from langchain_community.vectorstores.pgvecto_rs import PGVecto_rs -if EMBEDDING_CACHE_FILE and os.path.isfile(EMBEDDING_CACHE_FILE): - logger.info('Loading embeddings from cache') +import os - with open(EMBEDDING_CACHE_FILE, 'r') as f_in: - documents_dict = json.load(f_in) - document_store.write_documents( - documents=[Document.from_dict(d_) for d_ in documents_dict], - 
policy=DuplicatePolicy.OVERWRITE - ) +PORT = os.getenv("DB_PORT", 5432) +HOST = os.getenv("DB_HOST", "127.0.0.1") +USER = os.getenv("DB_USER", "gbnc") +PASS = os.getenv("DB_PASS", "") +DB_NAME = os.getenv("DB_NAME", "gbnc") -else: - logger.debug("Generating embeddings") - embedded = embedder.run(input_documents) - document_store.write_documents( - documents=embedded['documents'], - policy=DuplicatePolicy.OVERWRITE - ) +URL = "postgresql+psycopg://{username}:{password}@{host}:{port}/{db_name}".format( + port=PORT, + host=HOST, + username=USER, + password=PASS, + db_name=DB_NAME, +) - if EMBEDDING_CACHE_FILE: - with open(EMBEDDING_CACHE_FILE, 'w') as f_out: - documents_dict = [ - Document.to_dict(d_) - for d_ in embedded['documents'] - ] - json.dump(documents_dict, f_out) -retriever = InMemoryEmbeddingRetriever(document_store=document_store) +logger.info(f"Inserting {len(docs)} documents") + +db = PGVecto_rs.from_documents( + documents=docs, + embedding=embeddings, + db_url=URL, + collection_name="gbnc", +) + +logger.info('done') diff --git a/requirements.txt b/requirements.txt index 723011a..37cac52 100644 --- a/requirements.txt +++ b/requirements.txt @@ -45,3 +45,9 @@ uvicorn==0.27.0 uvloop==0.19.0 watchfiles==0.21.0 websockets==12.0 +pgvecto-rs==0.1.4 +langchain==0.1.9 +langchain-community==0.0.24 +langchain-core==0.1.26 +jq==1.6.0 +psycopg==3.1.18 diff --git a/start.sh b/start.sh index b7a27f8..17c54ab 100644 --- a/start.sh +++ b/start.sh @@ -1,6 +1,7 @@ #!/bin/bash set -e +set -x if [[ $PUBLIC_KEY ]] then @@ -15,6 +16,32 @@ else echo "No public key provided, skipping ssh setup" fi + +echo "Setting up postgres database server with vecto.rs extension" + +service postgresql start + +su postgres <<'EOF' +psql -c 'ALTER SYSTEM SET shared_preload_libraries = "vectors.so"' +psql -c 'ALTER SYSTEM SET search_path TO "$user", public, vectors' +EOF + +service postgresql restart + +DB_USER=postgres +DB_PASS=$(openssl rand -base64 32) +DB_NAME=gbnc +export DB_USER +export 
DB_PASS +export DB_NAME + +su --preserve-environment postgres <<'EOF' +psql -c "CREATE EXTENSION vectors;" +psql -c "ALTER USER $DB_USER WITH PASSWORD '$DB_PASS';" +psql -c "CREATE DATABASE $DB_NAME OWNER $DB_USER;" +EOF + + echo "Starting ollama" ollama serve & @@ -27,6 +54,10 @@ ollama pull "$OLLAMA_MODEL_NAME" cd /workspace +echo "Preparing data" +cat json_input/excellent-articles_10.json | jq 'to_entries | map({content: .value, meta: {source: .key}})' > json_input/data.json + + echo "Starting api" uvicorn gswikichat:app --reload --host 0.0.0.0 --port 8000 & From a0e5a11513ec8e660b6458e15b82c3642010046d Mon Sep 17 00:00:00 2001 From: Robert Timm Date: Sun, 25 Feb 2024 22:57:28 +0000 Subject: [PATCH 45/51] fix: vars and fields --- gswikichat/api.py | 2 +- gswikichat/rag.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/gswikichat/api.py b/gswikichat/api.py index c97ff2a..ed212e0 100644 --- a/gswikichat/api.py +++ b/gswikichat/api.py @@ -45,7 +45,7 @@ async def api(query, top_k=3, lang='en'): sources = [ { - "src": d_.meta['src'], + "src": d_.meta['source'], "content": d_.content, "score": d_.score } for d_ in answer.documents diff --git a/gswikichat/rag.py b/gswikichat/rag.py index cb84e06..734d6f0 100644 --- a/gswikichat/rag.py +++ b/gswikichat/rag.py @@ -51,10 +51,10 @@ def rag_pipeline(query: str, top_k: int = 3, lang: str = 'de'): answer_builder = AnswerBuilder() answer_build = answer_builder.run( - query=query_document.content, + query=query, replies=response['replies'], meta=[r.meta for r in response['replies']], - documents=retriever_results['documents'], + documents=docs, pattern=None, reference_pattern=None ) From 507f7e3c3736dd13528cd18b518ebffeb648c2de Mon Sep 17 00:00:00 2001 From: Robert Timm Date: Mon, 26 Feb 2024 22:08:07 +0000 Subject: [PATCH 46/51] feat: separate data import --- README.md | 12 +- frontend/src/components/field/FieldAnswer.vue | 4 +- frontend/src/types/source.d.ts | 2 +- gswikichat/api.py | 45 ++++---- 
gswikichat/db.py | 107 ++++++++++++++++++ gswikichat/rag.py | 53 +++++---- gswikichat/vector_store_interface.py | 98 ---------------- start.sh | 10 +- 8 files changed, 176 insertions(+), 155 deletions(-) create mode 100644 gswikichat/db.py delete mode 100644 gswikichat/vector_store_interface.py diff --git a/README.md b/README.md index 6f04ed3..5dcb05f 100644 --- a/README.md +++ b/README.md @@ -8,8 +8,8 @@ To build and run the container locally with hot reload on python files do: ``` -DOCKER_BUILDKIT=1 docker build . -t gbnc -docker run \ +$ DOCKER_BUILDKIT=1 docker build . -t gbnc +$ docker run \ --env HUGGING_FACE_HUB_TOKEN=$HUGGING_FACE_HUB_TOKEN \ --volume "$(pwd)/gswikichat":/workspace/gswikichat \ --volume gbnc_cache:/root/.cache \ @@ -22,6 +22,14 @@ docker run \ ``` Point your browser to http://localhost:8000/ and use the frontend. +To import data run: + +``` +$ docker exec -it gbnc bash +# cat json_input/excellent-articles_10.json | jq 'to_entries | map({content: .value, meta: {source: .key}})' > import.json +# python3 -m gswikichat.db import.json +``` + ### Runpod.io The container works on [runpod.io](https://www.runpod.io/) GPU instances. A [template is available here](https://runpod.io/gsc?template=0w8z55rf19&ref=yfvyfa0s). diff --git a/frontend/src/components/field/FieldAnswer.vue b/frontend/src/components/field/FieldAnswer.vue index 99afac7..04f1f95 100644 --- a/frontend/src/components/field/FieldAnswer.vue +++ b/frontend/src/components/field/FieldAnswer.vue @@ -17,8 +17,8 @@ class="text-sm cursor-pointer text-light-distinct-text dark:text-dark-distinct-text" > - {{ $t('source') }} ({{ s.score.toFixed(1) }}/5): - {{ s.src }} + {{ $t('source') }} ({{ s.score.toFixed(1) }}): + {{ s.source }}

{{ s.content }}

diff --git a/frontend/src/types/source.d.ts b/frontend/src/types/source.d.ts index 4e3cd7e..8d0de16 100644 --- a/frontend/src/types/source.d.ts +++ b/frontend/src/types/source.d.ts @@ -1,6 +1,6 @@ export type Source = { id: number - src: string + source: string content: string score: number } diff --git a/gswikichat/api.py b/gswikichat/api.py index ed212e0..ceae499 100644 --- a/gswikichat/api.py +++ b/gswikichat/api.py @@ -2,58 +2,53 @@ from fastapi.staticfiles import StaticFiles from fastapi import FastAPI -from .rag import rag_pipeline - -from haystack import Document from .logger import get_logger +from .rag import rag_pipeline # Create logger instance from base logger config in `logger.py` logger = get_logger(__name__) -FRONTEND_STATIC_DIR = './frontend/dist' +FRONTEND_STATIC_DIR = "./frontend/dist" app = FastAPI() app.mount( "/assets", StaticFiles(directory=f"{FRONTEND_STATIC_DIR}/assets"), - name="frontend-assets" + name="frontend-assets", ) + @app.get("/") async def root(): return FileResponse(f"{FRONTEND_STATIC_DIR}/index.html") + @app.get("/favicon.ico") async def favicon(): return FileResponse(f"{FRONTEND_STATIC_DIR}/favicon.ico") + @app.get("/api") -async def api(query, top_k=3, lang='en'): - if not lang in ['en', 'de']: +async def api(query, top_k=3, lang="en"): + if not lang in ["en", "de"]: raise Exception("language must be 'en' or 'de'") - logger.debug(f'{query=}') # Assuming we change the input name - logger.debug(f'{top_k=}') - logger.debug(f'{lang=}') + logger.debug(f"{query=}") + logger.debug(f"{top_k=}") + logger.debug(f"{lang=}") + + answer = rag_pipeline(query=query, top_k=top_k, lang=lang) - answer = rag_pipeline( - query=query, - top_k=top_k, - lang=lang - ) + if not answer: + return {} sources = [ - { - "src": d_.meta['source'], - "content": d_.content, - "score": d_.score - } for d_ in answer.documents + {"id": d_.id, "source": d_.meta["source"], "content": d_.content, "score": d_.score} + for d_ in answer.documents ] - 
logger.debug(f'{answer=}') + logger.debug(f"{answer.data=}") + logger.debug(f"{answer.documents=}") - return { - "answer": answer.data.content, - "sources": sources - } + return {"answer": answer.data.content, "sources": sources} diff --git a/gswikichat/db.py b/gswikichat/db.py new file mode 100644 index 0000000..cc0b3f2 --- /dev/null +++ b/gswikichat/db.py @@ -0,0 +1,107 @@ +import os + +import torch + +from langchain.text_splitter import CharacterTextSplitter +from langchain_community.document_loaders import JSONLoader +from langchain_community.embeddings import HuggingFaceEmbeddings +from langchain_community.vectorstores.pgvecto_rs import PGVecto_rs + +from .logger import get_logger + + +SENTENCE_TRANSFORMER_MODEL = "svalabs/german-gpl-adapted-covid" + +logger = get_logger(__name__) + + +def get_device(): + device = "cpu" + if torch.cuda.is_available(): + logger.info("GPU is available.") + device = "cuda" + return device + + +def get_embedding_model(): + # https://huggingface.co/svalabs/german-gpl-adapted-covid + logger.info(f"Embedding model: {SENTENCE_TRANSFORMER_MODEL}") + + return HuggingFaceEmbeddings( + model_name=SENTENCE_TRANSFORMER_MODEL, + model_kwargs={"device": get_device()}, + show_progress=True, + ) + + +def get_db(): + PORT = os.getenv("DB_PORT", 5432) + HOST = os.getenv("DB_HOST", "127.0.0.1") + USER = os.getenv("DB_USER", "gbnc") + PASS = os.getenv("DB_PASS", "") + DB_NAME = os.getenv("DB_NAME", "gbnc") + + URL = "postgresql+psycopg://{username}:{password}@{host}:{port}/{db_name}".format( + port=PORT, + host=HOST, + username=USER, + password=PASS, + db_name=DB_NAME, + ) + + return PGVecto_rs.from_collection_name( + embedding=get_embedding_model(), + db_url=URL, + collection_name="gbnc", + ) + + +def import_data(file): + def metadata_func(record: dict, metadata: dict) -> dict: + metadata["source"] = record.get("meta", {}).get("source") + return metadata + + loader = JSONLoader( + file_path=file, + jq_schema=".[]", + content_key="content", + 
metadata_func=metadata_func, + ) + + documents = loader.load() + + logger.debug(f"Loaded {len(documents)} documents.") + + text_splitter = CharacterTextSplitter(chunk_size=250, chunk_overlap=0) + chunks = text_splitter.split_documents(documents) + logger.debug(f"Split documents into {len(chunks)} chunks.") + + logger.debug(f"Importing into database.") + get_db().add_documents(chunks) + + +if __name__ == "__main__": + import sys + + if len(sys.argv) > 1: + file = sys.argv[1] + import_data(file) + + else: + logger.error( + """Provide JSON file with the following structure as first parameter + [ + { + "content":"document content one", "meta":{ + "source": "https://source.url/one" + } + }, + { + "content":"document content two", "meta":{ + "source": "https://source.url/two" + } + } + ] + """ + ) + sys.exit(1) diff --git a/gswikichat/rag.py b/gswikichat/rag.py index 734d6f0..d158fc4 100644 --- a/gswikichat/rag.py +++ b/gswikichat/rag.py @@ -3,39 +3,46 @@ from haystack.components.builders.answer_builder import AnswerBuilder from haystack.dataclasses import ChatMessage +from .db import get_db from .llm_config import llm from .logger import get_logger from .prompt import user_prompt_builders, system_prompts -from .vector_store_interface import db # Create logger instance from base logger config in `logger.py` logger = get_logger(__name__) -def rag_pipeline(query: str, top_k: int = 3, lang: str = 'de'): +def langchain_to_haystack_doc(langchain_document, score): + return Document.from_dict( + { + "content": langchain_document.page_content, + "meta": langchain_document.metadata, + "score": score, + } + ) + + +def rag_pipeline(query: str, top_k: int, lang: str): + docs_with_score = get_db().similarity_search_with_score(query, top_k) + docs_with_score.reverse() # best first - docs_with_score = db.similarity_search_with_score(query, top_k) + if len(docs_with_score) == 0: + return None + logger.debug("Matching documents: ") for doc, score in docs_with_score: - print("-" * 80) - 
print("Score: ", score) - print(doc.page_content) - print("-" * 80) + logger.debug("-" * 80) + logger.debug(f"Score: {score}") + logger.debug(doc.page_content) - # langchain doc to haystack doc - docs = [Document.from_dict({"content":d.page_content,"meta":d.metadata}) for d,_ in docs_with_score] + docs = [langchain_to_haystack_doc(d, s) for d, s in docs_with_score] system_prompt = system_prompts[lang] user_prompt_builder = user_prompt_builders[lang] + user_prompt_build = user_prompt_builder.run(question=query, documents=docs) + prompt = user_prompt_build["prompt"] - user_prompt_build = user_prompt_builder.run( - question=query, - documents=docs - ) - - prompt = user_prompt_build['prompt'] - - logger.debug(f'{prompt=}') + logger.debug(f"{prompt=}") messages = [ ChatMessage.from_system(system_prompt), @@ -43,7 +50,7 @@ def rag_pipeline(query: str, top_k: int = 3, lang: str = 'de'): ] response = llm.run( - messages, + messages, # generation_kwargs={"temperature": 0.2} ) @@ -52,13 +59,13 @@ def rag_pipeline(query: str, top_k: int = 3, lang: str = 'de'): answer_builder = AnswerBuilder() answer_build = answer_builder.run( query=query, - replies=response['replies'], - meta=[r.meta for r in response['replies']], + replies=response["replies"], + meta=[r.meta for r in response["replies"]], documents=docs, pattern=None, - reference_pattern=None + reference_pattern=None, ) - logger.debug(f'{answer_build=}') + logger.debug(f"{answer_build=}") - return answer_build['answers'][0] + return answer_build["answers"][0] diff --git a/gswikichat/vector_store_interface.py b/gswikichat/vector_store_interface.py deleted file mode 100644 index e329b50..0000000 --- a/gswikichat/vector_store_interface.py +++ /dev/null @@ -1,98 +0,0 @@ -import os -import json - -from tqdm import tqdm -from pprint import pprint - -from langchain.docstore.document import Document -from langchain.text_splitter import CharacterTextSplitter -from langchain_community.document_loaders import JSONLoader -from 
langchain_community.embeddings.fake import FakeEmbeddings -from langchain_community.embeddings import HuggingFaceEmbeddings - -import torch - -from .logger import get_logger - -# Create logger instance from base logger config in `logger.py` -logger = get_logger(__name__) - -HUGGING_FACE_HUB_TOKEN = os.environ.get('HUGGING_FACE_HUB_TOKEN') - -# disable this line to disable the embedding cache -EMBEDDING_CACHE_FILE = '/root/.cache/gbnc_embeddings.json' - -top_k = 5 -input_documents = [] - -device = "cpu" - -if torch.cuda.is_available(): - logger.info('GPU is available.') - device = "cuda" - - -# TODO: Add the json strings as env variables -json_dir = 'json_input' -json_fname = 'data.json' - -json_fpath = os.path.join(json_dir, json_fname) - -def metadata_func(record: dict, metadata: dict) -> dict: - metadata["source"] = record.get("meta").get("source") - return metadata - -# Create the JSONLoader instance -loader = JSONLoader( - file_path=json_fpath, - jq_schema='.[]', - content_key="content", - metadata_func=metadata_func -) - -documents = loader.load() -pprint(documents[0]) - -text_splitter = CharacterTextSplitter(chunk_size=250, chunk_overlap=0) -docs = text_splitter.split_documents(documents) - -# https://huggingface.co/svalabs/german-gpl-adapted-covid -sentence_transformer_model = 'svalabs/german-gpl-adapted-covid' -logger.info(f'Sentence Transformer Name: {sentence_transformer_model}') - -embeddings = HuggingFaceEmbeddings( - model_name=sentence_transformer_model, - model_kwargs={'device': device}, - show_progress=True, -) - -from langchain_community.vectorstores.pgvecto_rs import PGVecto_rs - -import os - -PORT = os.getenv("DB_PORT", 5432) -HOST = os.getenv("DB_HOST", "127.0.0.1") -USER = os.getenv("DB_USER", "gbnc") -PASS = os.getenv("DB_PASS", "") -DB_NAME = os.getenv("DB_NAME", "gbnc") - - -URL = "postgresql+psycopg://{username}:{password}@{host}:{port}/{db_name}".format( - port=PORT, - host=HOST, - username=USER, - password=PASS, - db_name=DB_NAME, -) - - 
-logger.info(f"Inserting {len(docs)} documents") - -db = PGVecto_rs.from_documents( - documents=docs, - embedding=embeddings, - db_url=URL, - collection_name="gbnc", -) - -logger.info('done') diff --git a/start.sh b/start.sh index 17c54ab..40d6f81 100644 --- a/start.sh +++ b/start.sh @@ -28,12 +28,18 @@ EOF service postgresql restart +cat > ~/.env <> ~/.bashrc su --preserve-environment postgres <<'EOF' psql -c "CREATE EXTENSION vectors;" @@ -54,10 +60,6 @@ ollama pull "$OLLAMA_MODEL_NAME" cd /workspace -echo "Preparing data" -cat json_input/excellent-articles_10.json | jq 'to_entries | map({content: .value, meta: {source: .key}})' > json_input/data.json - - echo "Starting api" uvicorn gswikichat:app --reload --host 0.0.0.0 --port 8000 & From 57ec054eb375cdb9504d939eec6132e93d0ab40b Mon Sep 17 00:00:00 2001 From: Silvan Date: Wed, 21 Feb 2024 23:03:06 +0100 Subject: [PATCH 47/51] auto fetch gs-wiki articles --- .gitignore | 3 + README.md | 3 + gswikichat/fetch_articles.py | 111 +++++++++++++++++++++++++++ gswikichat/vector_store_interface.py | 70 +++++++---------- json_input/gs-wiki.json | 73 ++++++++++++++++++ json_input/wp-policies.json | 30 ++++++++ requirements.txt | 2 + 7 files changed, 250 insertions(+), 42 deletions(-) create mode 100644 gswikichat/fetch_articles.py create mode 100644 json_input/gs-wiki.json create mode 100644 json_input/wp-policies.json diff --git a/.gitignore b/.gitignore index bc7d212..c41a78a 100644 --- a/.gitignore +++ b/.gitignore @@ -27,3 +27,6 @@ __pycache__/ # macOS .DS_Store + +# logs +*.log diff --git a/README.md b/README.md index 6f04ed3..ce15ae2 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,9 @@ To build and run the container locally with hot reload on python files do: ``` DOCKER_BUILDKIT=1 docker build . 
-t gbnc docker run \ + --env DOCUMENTS_TOC=json_input/gs-wiki.json \ + --env GSWIKI_USER= \ + --env GSWIKI_PW= \ --env HUGGING_FACE_HUB_TOKEN=$HUGGING_FACE_HUB_TOKEN \ --volume "$(pwd)/gswikichat":/workspace/gswikichat \ --volume gbnc_cache:/root/.cache \ diff --git a/gswikichat/fetch_articles.py b/gswikichat/fetch_articles.py new file mode 100644 index 0000000..8a6c3df --- /dev/null +++ b/gswikichat/fetch_articles.py @@ -0,0 +1,111 @@ +import os +import re +import json +import requests +import configparser + +from bs4 import BeautifulSoup + +GSWIKI_USER = os.environ.get('GSWIKI_USER') +GSWIKI_PW = os.environ.get('GSWIKI_PW') + +HTML_FILTERS = { + 'div': ['navbox','navbox-styles','spoken-wikipedia', 'noprint', 'hatnote', 'rt-tooltip', 'reflist'], + 'span': ['mw-ext-cite-error'], + 'table': ['noprint','ombox'], + 'ol': ['breadcrumb-nav-container', 'references'], + 'sup': ['reference'] +} +SECTION_FILTERS = [ 'Siehe auch', 'See also', 'Weblinks', 'Anmerkungen', 'Notes' ] +REGEX_FILTERS = { + 'p': '→.*ersion' +} + +def filterHtml(soup): + for figure in soup.find_all('figure'): + figure.decompose() + + for tag, classes in HTML_FILTERS.items(): + for className in classes: + for div in soup.find_all(tag, {'class': className}): + div.decompose() + + for tag, regex in REGEX_FILTERS.items(): + for element in soup.find_all(tag): + if(re.search(regex, str(element)) != None): + element.decompose() + + return soup + +def fetchFromWiki(url, titles, loginRequired): + if(loginRequired == True): + session = loginToWiki(url) + else: + session = requests.Session() + + articles = {} + for title in titles: + sections = fetchSections(url, title, session.cookies) + print("fetching {} sections for article {}".format(len(sections), title)) + for section in [ { 'index' : 0, 'line': 'Intro', 'linkAnchor' : '', 'anchor' : '' } ] + sections : + if section['index'] == '' or section['line'] in SECTION_FILTERS: + continue + + query = { + 'action': 'parse', + 'page': title, + 'format': 'json', + 
'prop':'text', + 'disabletoc': True, + 'disablelimitreport': True, + 'disableeditsection': True, + 'section': section['index'] + } + section_html = requests.get(url,params=query,cookies=session.cookies).json()['parse']['text']['*'] + section_soup = BeautifulSoup(section_html, 'lxml') + articles[title + '#' + section['anchor']] = filterHtml(section_soup).get_text() + + return articles + + +def fetchSections(url, title, cookies=None): + query = { + 'action':'parse', + 'page':title, + 'format':'json', + 'prop':'sections' + } + sectionsResponse = requests.get(url,params=query, cookies=cookies) + toplevelSections = [ section for section in sectionsResponse.json()['parse']['sections'] if section['toclevel'] == 1 ] + return toplevelSections + +def loginToWiki(url): + session = requests.Session() + + tokenQuery = { 'action': 'query', 'meta': 'tokens', 'type': 'login', 'format': 'json' } + token = session.get(url, params=tokenQuery).json()['query']['tokens']['logintoken'] + loginData = { + 'lgname': GSWIKI_USER, + 'lgpassword': GSWIKI_PW, + 'lgtoken': token, + 'action': 'login', + 'format': 'json' + } + response = session.post(url, data=loginData, headers={ 'Content-Type' : 'application/x-www-form-urlencoded' }) + #TODO: error handling in case of login failure + return session + +def fetch_articles(toc): + articles = [] + for wiki in toc: + url = wiki['host'] + wiki['api_path'] + wikiArticles = fetchFromWiki(url, wiki['titles'], wiki['login']) + + articles.append( { + 'wiki': wiki['name'], + 'url': wiki['host'], + 'lang': wiki['lang'], + 'articles': wikiArticles + } ) + return articles + diff --git a/gswikichat/vector_store_interface.py b/gswikichat/vector_store_interface.py index 95d52db..7febf39 100644 --- a/gswikichat/vector_store_interface.py +++ b/gswikichat/vector_store_interface.py @@ -14,13 +14,17 @@ import torch from .logger import get_logger +from .fetch_articles import fetch_articles + # Create logger instance from base logger config in `logger.py` logger = 
get_logger(__name__) HUGGING_FACE_HUB_TOKEN = os.environ.get('HUGGING_FACE_HUB_TOKEN') +DOCUMENTS_TOC = os.environ.get('DOCUMENTS_TOC') -# disable this line to disable the embedding cache +# disable these lines to disable the cache +DOCUMENTS_CACHE_FILE = '/root/.cache/gbnc_documents.json' EMBEDDING_CACHE_FILE = '/root/.cache/gbnc_embeddings.json' top_k = 5 @@ -32,49 +36,31 @@ logger.info('GPU is available.') device = "cuda" +if DOCUMENTS_CACHE_FILE and os.path.isfile(DOCUMENTS_CACHE_FILE): + logger.info('Loading documents from cache') + + with open(DOCUMENTS_CACHE_FILE, 'r') as f_in: + documents = json.load(f_in) -# TODO: Add the json strings as env variables -json_dir = 'json_input' -json_fname = 'excellent-articles_10.json' - -json_fpath = os.path.join(json_dir, json_fname) - -if os.path.isfile(json_fpath): - logger.info(f'Loading data from {json_fpath}') - with open(json_fpath, 'r') as finn: - json_obj = json.load(finn) - - if isinstance(json_obj, dict): - input_documents = [ - Document( - content=content_, - meta={"src": url_} - ) - for url_, content_ in tqdm(json_obj.items()) - ] - elif isinstance(json_obj, list): - input_documents = [ - Document( - content=obj_['content'], - meta={'src': obj_['meta']} - ) - for obj_ in tqdm(json_obj) - ] else: - input_documents = [ - Document( - content="My name is Asra, I live in Paris.", - meta={"src": "doc_1"} - ), - Document( - content="My name is Lee, I live in Berlin.", - meta={"src": "doc2"} - ), - Document( - content="My name is Giorgio, I live in Rome.", - meta={"src": "doc_3"} - ), - ] + logger.debug("fetch documents from wiki") + with open(DOCUMENTS_TOC, 'r') as tocFile: + toc = json.load(tocFile) + articles = fetch_articles(toc) + documents = {} + for wiki in articles: + documents.update(wiki['articles']) + if DOCUMENTS_CACHE_FILE: + with open(DOCUMENTS_CACHE_FILE, 'w') as f_out: + json.dump(documents, f_out) + +input_documents = [ + Document( + content=content_, + meta={"src": url_} + ) + for url_, content_ in 
tqdm(documents.items()) +] splitter = DocumentSplitter( split_by="sentence", diff --git a/json_input/gs-wiki.json b/json_input/gs-wiki.json new file mode 100644 index 0000000..25630df --- /dev/null +++ b/json_input/gs-wiki.json @@ -0,0 +1,73 @@ +[ + { + "name": "GS-Wiki de", + "host": "https://wiki.wikimedia.de/", + "api_path": "/api.php", + "lang": "de", + "login": true, + "titles" : [ + "Offboarding", + "Arbeitszeit", + "Beschwerdestelle_AGG", + "Betriebliche_Altersvorsorge", + "Betriebliches_Eingliederungsmanagement_(BEM)", + "Betriebsvereinbarung", + "Bildungszeit", + "COVID-19", + "Culture_Shock", + "Digitale_Gehaltsunterlagen_(Datev_Arbeitnehmer_Online)", + "Neue_Mitarbeiter", + "Elternzeit", + "Weiterbildung", + "Jubiläum", + "Krankmeldung", + "Mitarbeitendenjahresgespräch", + "Nebentätigkeit", + "Onboarding", + "Vorstandsbeschlüsse", + "Personio", + "Pme_Familienservice", + "Probezeit", + "Stellenausschreibungen", + "Überstunden", + "WMDE:Urlaub", + "Weiterbildung", + "Werkstudierende" + ] + }, + { + "name": "GS-Wiki en", + "host": "https://wiki.wikimedia.de/", + "api_path": "/api.php", + "lang": "en", + "login": true, + "titles" : [ + "Jubiläum/en", + "Betriebsvereinbarung", + "Company_pension_plan", + "COVID-19EN", + "Culture_Shock", + "Digital_Payslip", + "Beschwerdestelle_AGG/en", + "Betriebliches_Eingliederungsmanagement_(BEM)/en", + "Offboarding/en", + "Onboarding/en", + "Decisions_of_the_ED", + "Overtime", + "Bildungszeit/en", + "Parental_leave", + "Personio/en", + "Pme_Counselling_Service", + "Probationary_Period", + "Quartalsgespräche/en", + "Decisions_of_the_ED", + "Secondary_employment", + "Sick_leave", + "Fortbildung/en", + "Stellenausschreibungen/en", + "WMDE:Urlaub/en", + "Arbeitszeit/en", + "Werkstudierende/en" + ] + } +] \ No newline at end of file diff --git a/json_input/wp-policies.json b/json_input/wp-policies.json new file mode 100644 index 0000000..a0cfcd0 --- /dev/null +++ b/json_input/wp-policies.json @@ -0,0 +1,30 @@ +[ + { + "name": 
"German Wikipedia Policies", + "host": "https://de.wikipedia.org", + "api_path": "/w/api.php", + "lang": "de", + "login": false, + "titles" : [ + "Wikipedia:Grundprinzipien", + "Wikipedia:Was_Wikipedia_nicht_ist", + "Wikipedia:Neutraler_Standpunkt", + "Wikipedia:Urheberrechte_beachten", + "Wikipedia:Wikiquette" + ] + }, + { + "name": "English Wikipedia Policies", + "host": "https://en.wikipedia.org", + "api_path": "/w/api.php", + "lang": "en", + "login": false, + "titles" : [ + "Wikipedia:Five_pillars", + "Wikipedia:What_Wikipedia_is_not", + "Wikipedia:Neutral_point_of_view", + "Wikipedia:Copyrights", + "Wikipedia:Civility" + ] + } +] \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 723011a..1cef685 100644 --- a/requirements.txt +++ b/requirements.txt @@ -45,3 +45,5 @@ uvicorn==0.27.0 uvloop==0.19.0 watchfiles==0.21.0 websockets==12.0 +beautifulsoup4==4.12.3 +lxml==5.1.0 \ No newline at end of file From 33db5ab3c8f1100bd31a148fd8e4abbb0e757572 Mon Sep 17 00:00:00 2001 From: Robert Timm Date: Wed, 28 Feb 2024 16:29:18 +0000 Subject: [PATCH 48/51] chore: get_llm --- gswikichat/llm_config.py | 20 ++++++++++---------- gswikichat/rag.py | 4 ++-- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/gswikichat/llm_config.py b/gswikichat/llm_config.py index 5af6c1c..67459c2 100644 --- a/gswikichat/llm_config.py +++ b/gswikichat/llm_config.py @@ -10,14 +10,14 @@ OLLAMA_URL = os.environ.get("OLLAMA_URL") OLLAMA_CHAT_URL = f"{OLLAMA_URL}/api/chat" -logger.info(f'Using {OLLAMA_MODEL_NAME=}') -logger.info(f'Endpoint: {OLLAMA_URL=}') -logger.info(f'Generate: {OLLAMA_CHAT_URL=}') +def get_llm(): + logger.info(f'Using {OLLAMA_MODEL_NAME=}') + logger.info(f'Endpoint: {OLLAMA_URL=}') + logger.info(f'Generate: {OLLAMA_CHAT_URL=}') + logger.info(f"Setting up ollama with {OLLAMA_MODEL_NAME}") -logger.info(f"Setting up ollama with {OLLAMA_MODEL_NAME}") - -llm = OllamaChatGenerator( - model=OLLAMA_MODEL_NAME, - url=OLLAMA_CHAT_URL, - 
timeout=120 -) + return OllamaChatGenerator( + model=OLLAMA_MODEL_NAME, + url=OLLAMA_CHAT_URL, + timeout=120 + ) diff --git a/gswikichat/rag.py b/gswikichat/rag.py index d158fc4..d9e33f1 100644 --- a/gswikichat/rag.py +++ b/gswikichat/rag.py @@ -4,7 +4,7 @@ from haystack.dataclasses import ChatMessage from .db import get_db -from .llm_config import llm +from .llm_config import get_llm from .logger import get_logger from .prompt import user_prompt_builders, system_prompts @@ -49,7 +49,7 @@ def rag_pipeline(query: str, top_k: int, lang: str): ChatMessage.from_user(prompt), ] - response = llm.run( + response = get_llm().run( messages, # generation_kwargs={"temperature": 0.2} ) From b6fee21c74bc909d30f8bf787a42d8b1da4ac4a4 Mon Sep 17 00:00:00 2001 From: Robert Timm Date: Wed, 28 Feb 2024 16:29:45 +0000 Subject: [PATCH 49/51] feat: use fetch_articles from the command line --- README.md | 9 ++++- gswikichat/fetch_articles.py | 65 +++++++++++++++++++++++++++++++++--- 2 files changed, 68 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 7c14dd0..902b631 100644 --- a/README.md +++ b/README.md @@ -25,8 +25,15 @@ docker run \ ``` Point your browser to http://localhost:8000/ and use the frontend. 
-To import data run: +To fetch data from a `toc.json` wiki fetching definition, run: +``` +$ docker exec -it gbnc bash +# export WIKI_USER=wikibotusername +# export WIKI_PW=yoursecretbotuserpassword +# python -m gswikichat.fetch_articles toc.json > articles.json +``` +To import data run: ``` $ docker exec -it gbnc bash # cat json_input/excellent-articles_10.json | jq 'to_entries | map({content: .value, meta: {source: .key}})' > import.json diff --git a/gswikichat/fetch_articles.py b/gswikichat/fetch_articles.py index 8a6c3df..457ee12 100644 --- a/gswikichat/fetch_articles.py +++ b/gswikichat/fetch_articles.py @@ -6,8 +6,11 @@ from bs4 import BeautifulSoup -GSWIKI_USER = os.environ.get('GSWIKI_USER') -GSWIKI_PW = os.environ.get('GSWIKI_PW') +from .logger import get_logger +logger = get_logger(__name__) + +WIKI_USER = os.environ.get('WIKI_USER') +WIKI_PW = os.environ.get('WIKI_PW') HTML_FILTERS = { 'div': ['navbox','navbox-styles','spoken-wikipedia', 'noprint', 'hatnote', 'rt-tooltip', 'reflist'], @@ -46,7 +49,7 @@ def fetchFromWiki(url, titles, loginRequired): articles = {} for title in titles: sections = fetchSections(url, title, session.cookies) - print("fetching {} sections for article {}".format(len(sections), title)) + print("fetching {} sections for article {}".format(len(sections), title), file=sys.stderr) for section in [ { 'index' : 0, 'line': 'Intro', 'linkAnchor' : '', 'anchor' : '' } ] + sections : if section['index'] == '' or section['line'] in SECTION_FILTERS: continue @@ -85,8 +88,8 @@ def loginToWiki(url): tokenQuery = { 'action': 'query', 'meta': 'tokens', 'type': 'login', 'format': 'json' } token = session.get(url, params=tokenQuery).json()['query']['tokens']['logintoken'] loginData = { - 'lgname': GSWIKI_USER, - 'lgpassword': GSWIKI_PW, + 'lgname': WIKI_USER, + 'lgpassword': WIKI_PW, 'lgtoken': token, 'action': 'login', 'format': 'json' @@ -107,5 +110,57 @@ def fetch_articles(toc): 'lang': wiki['lang'], 'articles': wikiArticles } ) + return 
articles +def transform_articles(articles): + output = {} + for wiki in articles: + url = wiki.get("url") + "/wiki/" + articles = wiki.get("articles") + for name, content in articles.items(): + output[url+name] = content + return output + +if __name__ == "__main__": + import sys + import json + + if len(sys.argv) > 1: + file = sys.argv[1] + with open(file) as f: + data = json.load(f) + + articles = fetch_articles(data) + print(json.dumps(transform_articles(articles), indent=4)) + + else: + logger.error( + """Provide JSON file with the following structure as first parameter + [ + { + "name": "Name of the wiki", + "host": "https://somewiki.org", + "api_path": "/w/api.php", + "lang": "en", + "login": false, + "titles" : [ + "Namespace:Page1", + "Namespace:Page2" + ] + }, + { + "name": "Name of the another wiki", + "host": "https://someotherwiki.org", + "api_path": "/w/api.php", + "lang": "de", + "login": false, + "titles" : [ + "Namespace:SeiteEins", + "Namespace:SeiteZwei" + ] + } + ] + """ + ) + sys.exit(1) From 8f7c49e12b26eeadc2d57562eefe717af7e5c50c Mon Sep 17 00:00:00 2001 From: Robert Timm Date: Wed, 28 Feb 2024 16:48:34 +0000 Subject: [PATCH 50/51] chore: simplify docker commands --- README.md | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 902b631..e71e4ac 100644 --- a/README.md +++ b/README.md @@ -10,12 +10,7 @@ To build and run the container locally with hot reload on python files do: ``` DOCKER_BUILDKIT=1 docker build . -t gbnc docker run \ - --env DOCUMENTS_TOC=json_input/gs-wiki.json \ - --env WIKI_USER= \ - --env WIKI_PW= \ - --env HUGGING_FACE_HUB_TOKEN=$HUGGING_FACE_HUB_TOKEN \ --volume "$(pwd)/gswikichat":/workspace/gswikichat \ - --volume gbnc_cache:/root/.cache \ --publish 8000:8000 \ --rm \ --interactive \ @@ -28,8 +23,8 @@ Point your browser to http://localhost:8000/ and use the frontend. 
To fetch data from a `toc.json` wiki fetching definition, run: ``` $ docker exec -it gbnc bash -# export WIKI_USER=wikibotusername -# export WIKI_PW=yoursecretbotuserpassword +# export WIKI_USER= +# export WIKI_PW= # python -m gswikichat.fetch_articles toc.json > articles.json ``` From eaabe700e17cacf07dea268dba70c7ecd69986d9 Mon Sep 17 00:00:00 2001 From: Robert Timm Date: Wed, 28 Feb 2024 16:57:13 +0000 Subject: [PATCH 51/51] docs: readme fix --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e71e4ac..d3a64a6 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,7 @@ To fetch data from a `toc.json` wiki fetching definition, run: $ docker exec -it gbnc bash # export WIKI_USER= # export WIKI_PW= -# python -m gswikichat.fetch_articles toc.json > articles.json +# python3 -m gswikichat.fetch_articles toc.json > articles.json ``` To import data run: