-
Notifications
You must be signed in to change notification settings - Fork 93
Expand file tree
/
Copy pathhelper_utils.py
More file actions
92 lines (68 loc) · 2.5 KB
/
helper_utils.py
File metadata and controls
92 lines (68 loc) · 2.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# helper_utils.py
import numpy as np
import chromadb
import pandas as pd
from pypdf import PdfReader
import numpy as np
def project_embeddings(embeddings, umap_transform):
    """
    Map embeddings through an already-fitted UMAP transformer.

    Args:
        embeddings (numpy.ndarray): The embeddings to project; passed
            unchanged to ``umap_transform.transform``.
        umap_transform (umap.UMAP): The trained transformer; only its
            ``transform`` method is used.

    Returns:
        numpy.ndarray: The result of ``umap_transform.transform(embeddings)``.
    """
    return umap_transform.transform(embeddings)
def word_wrap(text, width=87):
    """
    Break text into lines of at most ``width`` characters.

    The text is cut every ``width`` characters and the pieces are joined
    with a newline character. The cut is purely positional: word
    boundaries and newlines already present in ``text`` are not taken
    into account.

    Args:
        text (str): The text to wrap.
        width (int): Maximum number of characters per line; must be >= 1.

    Returns:
        str: The wrapped text. An empty ``text`` yields an empty string.

    Raises:
        ValueError: If ``width`` is less than 1.
    """
    # A negative step would make the range empty and silently drop the whole
    # text; a zero step fails inside range() with an unrelated-looking message.
    if width < 1:
        raise ValueError(f"width must be a positive integer, got {width!r}")
    return "\n".join(
        text[start : start + width] for start in range(0, len(text), width)
    )
def extract_text_from_pdf(file_path):
    """
    Read a PDF file and return the text of all its pages.

    Args:
        file_path (str): The path to the PDF file.

    Returns:
        str: The text extracted from each page, in page order, joined
        with newline characters.
    """
    with open(file_path, "rb") as pdf_file:
        reader = PdfReader(pdf_file)
        # Pages are extracted while the file handle is still open.
        page_texts = [
            reader.get_page(index).extract_text()
            for index in range(reader.get_num_pages())
        ]
    return "\n".join(page_texts)
def load_chroma(filename, collection_name, embedding_function):
    """
    Build a Chroma collection from the text of a PDF.

    The PDF text is split on blank lines, every non-blank chunk is embedded
    with ``embedding_function``, and all chunks are stored in a newly
    created collection under the string IDs "0", "1", "2", ...

    Args:
        filename (str): The path to the PDF file.
        collection_name (str): The name of the Chroma collection to create.
        embedding_function (callable): Called once per chunk with the chunk
            text; its return value is stored as that chunk's embedding.

    Returns:
        The newly created Chroma collection holding the chunks and their
        embeddings. It is left empty when the PDF yields no non-blank chunk.
    """
    # Extract text from the PDF
    text = extract_text_from_pdf(filename)

    # Split on blank lines; whitespace-only chunks have nothing to embed.
    paragraphs = [chunk for chunk in text.split("\n\n") if chunk.strip()]

    # Generate one embedding per chunk
    embeddings = [embedding_function(paragraph) for paragraph in paragraphs]

    # Create the collection in a fresh in-process Chroma client
    collection = chromadb.Client().create_collection(collection_name)

    # Skip the add when there is nothing to store (an empty batch is
    # presumably rejected by Chroma -- confirm against the installed version).
    if paragraphs:
        # Chroma IDs must be strings, so the positional index is stringified.
        # A single batched add replaces one call per chunk.
        collection.add(
            ids=[str(index) for index in range(len(paragraphs))],
            documents=paragraphs,
            embeddings=embeddings,
        )
    return collection