Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
.env
.vscode
virtualenv
*.pyc
*.pyc
*.bin
*.pdf
*.sqlite3
7 changes: 1 addition & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,12 +40,7 @@ API_TOKEN = "<your api token>"

### Running the script

Run the following commad after activating the virtual environment.

```
cd experiments
python main.py
```
See [experiments.md](experiments.md) for instructions on how to run the experimental scripts.

### Running FastAPI server

Expand Down
21 changes: 21 additions & 0 deletions experiments.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
## Running Experimental Scripts

Activate the Python virtual environment before running any of the scripts below.

### hfQnA.py
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You need to give the reader a hint about what these two different scripts are used for.


To run the hfQnA script, type the following command:

```
python hfQnA.py
```

### docQnA.py

To run the docQnA script, type the following command:

```
python docQnA.py <path to pdf file>
```

The path only needs to be provided when the vector data store for this file has not been created yet.
72 changes: 72 additions & 0 deletions experiments/docQnA.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from hfQnA import query, API_URL
import os
import sys

def load_and_split_doc(path: str) -> list:
    """Load a PDF and split its text into overlapping chunks.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        The list of chunk documents produced by
        ``RecursiveCharacterTextSplitter.split_documents`` (chunks of at
        most 1000 characters with a 200-character overlap).
    """
    loader = PyPDFLoader(path, extract_images=False)
    # Load the documents as-is and split exactly once below.
    # ``load_and_split()`` would first apply the loader's own default
    # splitter, so the chunks (and their start indexes) would be derived
    # from already-split text instead of from the loaded documents.
    pages = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
        add_start_index=True,
    )

    return text_splitter.split_documents(pages)

def load_chunks_to_vectorstore(
    chunks: list,
    persist_directory: str = "test-index",
) -> None:
    """Embed document chunks and store them in an on-disk Chroma index.

    Args:
        chunks: List of document chunks to index, e.g. the value returned
            by ``load_and_split_doc``.
        persist_directory: Directory in which the Chroma index is stored.
            Defaults to ``"test-index"``.
    """
    # Embeddings are computed with the default HuggingFaceEmbeddings model.
    embedding_model = HuggingFaceEmbeddings()

    db = Chroma.from_documents(
        chunks,
        embedding=embedding_model,
        persist_directory=persist_directory,
    )
    # NOTE(review): newer Chroma releases appear to persist automatically,
    # which would make this call redundant -- confirm before removing it.
    db.persist()

def format_docs(docs: list) -> str:
    """Join the text of several documents into a single context string.

    Args:
        docs: Objects exposing a ``page_content`` string attribute.

    Returns:
        The ``page_content`` values separated by one blank line, or an
        empty string when ``docs`` is empty.
    """
    return "\n\n".join(doc.page_content for doc in docs)

def retreive_context_vectorDB(
    question: str,
    persist_directory: str = "test-index",
    k: int = 3,
) -> str:
    """Fetch the stored chunks most similar to a question as one string.

    Args:
        question: The user's question, used as the similarity query.
        persist_directory: Directory holding the Chroma index. Defaults to
            ``"test-index"``.
        k: Number of chunks to retrieve. Defaults to 3.

    Returns:
        The retrieved chunks' text joined by ``format_docs``.
    """
    # Open the persisted index with the same embedding class used to build it.
    vectordb = Chroma(
        persist_directory=persist_directory,
        embedding_function=HuggingFaceEmbeddings(),
    )
    retriever = vectordb.as_retriever(
        search_type="similarity",
        search_kwargs={"k": k},
    )
    relevant_docs = retriever.invoke(question)
    return format_docs(relevant_docs)

def run_rag() -> None:
    """Run the interactive document question-answering flow.

    Optionally indexes the PDF given as the first command-line argument,
    then asks for one question, retrieves matching context from the vector
    store, passes both to ``query`` and prints the response.

    Raises:
        SystemExit: If indexing is requested but no PDF path was given on
            the command line, or the path is not an existing file.
    """
    # Prompt reworded per review feedback: the question is whether a new
    # document should be ingested, not whether an existing one is loaded.
    choice = input("Do you want to load a new document? (y/n): ")

    if choice.strip().lower() == "y":
        # Explicit checks instead of ``assert`` so the validation still
        # runs when Python is started with ``-O``.
        if len(sys.argv) < 2:
            sys.exit("Pdf file path not provided")
        pdf_path = sys.argv[1]
        # ``isfile`` also rejects directories, which ``exists`` would accept.
        if not os.path.isfile(pdf_path):
            sys.exit("File does not exist")
        chunks = load_and_split_doc(os.path.abspath(pdf_path))
        load_chunks_to_vectorstore(chunks)

    question = input("Ask any question from document: ")
    context = retreive_context_vectorDB(question)

    model = "distilbert-base-cased"
    payload = {
        "inputs": {
            "question": question,
            "context": context,
        }
    }

    response = query(payload=payload, api_url=API_URL[model])
    print(response)

# Run the interactive flow only when executed as a script, so importing
# this module has no side effects.
if __name__ == "__main__":
    run_rag()

3 changes: 2 additions & 1 deletion experiments/main.py → experiments/hfQnA.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,4 +56,5 @@ def start():

print(data)

start()
if __name__ == "__main__":
start()
131 changes: 131 additions & 0 deletions experiments/requirementsDocQnA.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
aiohappyeyeballs==2.4.0
aiohttp==3.10.5
aiosignal==1.3.1
annotated-types==0.7.0
anyio==4.4.0
asgiref==3.8.1
attrs==24.2.0
backoff==2.2.1
bcrypt==4.2.0
build==1.2.1
cachetools==5.5.0
certifi==2024.7.4
charset-normalizer==3.3.2
chroma-hnswlib==0.7.6
chromadb==0.5.5
click==8.1.7
colorama==0.4.6
coloredlogs==15.0.1
dataclasses-json==0.6.7
Deprecated==1.2.14
dnspython==2.6.1
email_validator==2.2.0
fastapi==0.112.2
fastapi-cli==0.0.5
filelock==3.15.4
flatbuffers==24.3.25
frozenlist==1.4.1
fsspec==2024.6.1
google-auth==2.34.0
googleapis-common-protos==1.65.0
greenlet==3.0.3
grpcio==1.66.1
h11==0.14.0
httpcore==1.0.5
httptools==0.6.1
httpx==0.27.0
huggingface-hub==0.24.6
humanfriendly==10.0
idna==3.7
importlib_metadata==8.4.0
importlib_resources==6.4.4
iniconfig==2.0.0
Jinja2==3.1.4
joblib==1.4.2
jsonpatch==1.33
jsonpointer==3.0.0
kubernetes==30.1.0
langchain==0.2.15
langchain-community==0.2.15
langchain-core==0.2.37
langchain-huggingface==0.0.3
langchain-text-splitters==0.2.2
langsmith==0.1.108
markdown-it-py==3.0.0
MarkupSafe==2.1.5
marshmallow==3.22.0
mdurl==0.1.2
mmh3==4.1.0
monotonic==1.6
mpmath==1.3.0
multidict==6.0.5
mypy-extensions==1.0.0
networkx==3.3
numpy==1.26.4
oauthlib==3.2.2
onnxruntime==1.19.0
opentelemetry-api==1.27.0
opentelemetry-exporter-otlp-proto-common==1.27.0
opentelemetry-exporter-otlp-proto-grpc==1.27.0
opentelemetry-instrumentation==0.48b0
opentelemetry-instrumentation-asgi==0.48b0
opentelemetry-instrumentation-fastapi==0.48b0
opentelemetry-proto==1.27.0
opentelemetry-sdk==1.27.0
opentelemetry-semantic-conventions==0.48b0
opentelemetry-util-http==0.48b0
orjson==3.10.7
overrides==7.7.0
packaging==24.1
pillow==10.4.0
pluggy==1.5.0
posthog==3.6.0
protobuf==4.25.4
pyasn1==0.6.0
pyasn1_modules==0.4.0
pydantic==2.8.2
pydantic-settings==2.4.0
pydantic_core==2.20.1
Pygments==2.18.0
pypdf==4.3.1
PyPika==0.48.9
pyproject_hooks==1.1.0
pyreadline3==3.4.1
pytest==8.3.2
python-dateutil==2.9.0.post0
python-dotenv==1.0.1
python-multipart==0.0.9
PyYAML==6.0.2
regex==2024.7.24
requests==2.32.3
requests-oauthlib==2.0.0
rich==13.7.1
rsa==4.9
safetensors==0.4.4
scikit-learn==1.5.1
scipy==1.14.1
sentence-transformers==3.0.1
setuptools==74.0.0
shellingham==1.5.4
six==1.16.0
sniffio==1.3.1
SQLAlchemy==2.0.32
starlette==0.38.2
sympy==1.13.2
tenacity==8.5.0
threadpoolctl==3.5.0
tokenizers==0.19.1
torch==2.4.0
tqdm==4.66.5
transformers==4.44.2
typer==0.12.5
typing-inspect==0.9.0
typing_extensions==4.12.2
urllib3==2.2.2
uvicorn==0.30.6
watchfiles==0.23.0
websocket-client==1.8.0
websockets==13.0
wrapt==1.16.0
yarl==1.9.6
zipp==3.20.1