-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathvectorrize.py
More file actions
27 lines (21 loc) · 772 Bytes
/
vectorrize.py
File metadata and controls
27 lines (21 loc) · 772 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import json
# Build a FAISS similarity index over the first user utterance of every
# conversation in the JSONL dataset, and save those utterances next to the
# index so that search hits can be mapped back to text.
INPUT_PATH = "gyeongsang.jsonl"
INDEX_PATH = "gyeongsang_dialect.index"
TEXTS_PATH = "gyeongsang_texts.npy"

# Each non-blank line is parsed as one JSON object with a "messages" list whose
# entries carry "role" and "content" keys. Records missing those keys raise
# KeyError on purpose: a malformed dataset should fail loudly, not be skipped.
texts = []
with open(INPUT_PATH, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue  # tolerate blank lines between records
        item = json.loads(line)
        user_messages = [m["content"] for m in item["messages"] if m["role"] == "user"]
        if user_messages:
            # Only the first user turn of each conversation is indexed.
            texts.append(user_messages[0])

# Fail early with a clear message, before the (slow) model load: with no
# texts, the later `embeddings.shape[1]` lookup would be expected to fail with
# an uninformative IndexError instead.
if not texts:
    raise ValueError(f"no user messages found in {INPUT_PATH!r}; nothing to index")

model = SentenceTransformer("jhgan/ko-sroberta-multitask")  # Ko-SBERT

embeddings = model.encode(texts, convert_to_numpy=True)
# FAISS operates on C-contiguous float32 matrices; this is a no-op when the
# encoder already returns that layout (presumably the usual case).
embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
# Vectors are added in the same order as `texts`, so the ids FAISS assigns
# should line up with positions in the saved text array.
index.add(embeddings)

faiss.write_index(index, INDEX_PATH)
np.save(TEXTS_PATH, np.array(texts))