-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathload-process.py
More file actions
43 lines (32 loc) · 1.13 KB
/
load-process.py
File metadata and controls
43 lines (32 loc) · 1.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import os
# Folder that holds the plain-text source files.
directory = "sources/"

# Read every ".txt" file in that folder, trim surrounding whitespace, and keep
# the text together with the name of the file it came from.
documents = []
for filename in os.listdir(directory):
    if not filename.endswith(".txt"):
        continue  # ignore anything that is not a .txt file
    path = os.path.join(directory, filename)
    with open(path, 'r', encoding='utf-8') as handle:
        stripped = handle.read().strip()
    documents.append({"filename": filename, "content": stripped})
from sentence_transformers import SentenceTransformer
# Load the pretrained SentenceTransformer model by name.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode each document's text, carrying its filename and content along with
# the resulting embedding so later steps have everything in one record.
embeddings = []
for doc in documents:
    content = doc["content"]
    embeddings.append({
        "filename": doc["filename"],
        "content": content,
        "embedding": model.encode(content),
    })
import chromadb
# Start a Chroma client and create the collection that will hold the vectors.
client = chromadb.Client()
collection = client.create_collection("example_collection")

# Insert one record per document: the filename is the record id and the
# original text is stored in the record's metadata.
for doc in embeddings:
    vector = doc["embedding"]
    # The encoder hands back a NumPy array by default, and some chromadb
    # releases reject arrays nested inside a list ("Expected each embedding in
    # the embeddings to be a list"). Plain Python floats are accepted by all
    # releases, so convert when the value supports it.
    if hasattr(vector, "tolist"):
        vector = vector.tolist()
    collection.add(
        ids=[doc["filename"]],
        embeddings=[vector],
        metadatas=[{"content": doc["content"]}],
    )
# Embed the question with the same model that embedded the documents.
query_embedding = model.encode("What is the FCC?")

# Some chromadb releases reject a NumPy array nested inside a list, so pass
# the query vector as plain Python floats; query_embedding itself is left
# unchanged.
if hasattr(query_embedding, "tolist"):
    query_vector = query_embedding.tolist()
else:
    query_vector = query_embedding

# Query ChromaDB for 5 results and print the raw response.
results = collection.query(query_embeddings=[query_vector], n_results=5)
print(results)