-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path: embedder.py
More file actions
29 lines (26 loc) · 889 Bytes
/
embedder.py
File metadata and controls
29 lines (26 loc) · 889 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
from bertopic.backend import BaseEmbedder
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
class ParsBERTEmbedder(BaseEmbedder):
    """BERTopic embedding backend based on ParsBERT (Persian BERT).

    Encodes each document as the final-layer hidden state of the [CLS]
    token from ``HooshvareLab/bert-base-parsbert-uncased`` (768 dims).

    Parameters
    ----------
    batch_size : int, optional
        Number of documents encoded per forward pass (default 32).
        The previous implementation ran one forward pass per document,
        padded to the full 512 tokens; batching with dynamic padding is
        numerically equivalent (padding is attention-masked) and far faster.
    """

    _MODEL_NAME = "HooshvareLab/bert-base-parsbert-uncased"

    def __init__(self, batch_size: int = 32):
        super().__init__()  # initialize BaseEmbedder state
        self.batch_size = batch_size
        self.tokenizer = AutoTokenizer.from_pretrained(self._MODEL_NAME)
        self.model = AutoModel.from_pretrained(self._MODEL_NAME)
        # from_pretrained already returns eval mode; make it explicit so
        # dropout stays disabled even if a caller toggled training mode.
        self.model.eval()

    def embed(self, docs, verbose: bool = False) -> np.ndarray:
        """Embed ``docs`` (a sequence of strings) into an (n, 768) array.

        Returns an empty (0, hidden_size) array for empty input so the
        result shape is consistent for downstream consumers.
        """
        docs = list(docs)
        chunks = []
        with torch.no_grad():
            for start in range(0, len(docs), self.batch_size):
                batch = docs[start:start + self.batch_size]
                inputs = self.tokenizer(
                    batch,
                    return_tensors="pt",
                    truncation=True,
                    padding=True,  # dynamic padding to longest doc in batch
                    max_length=512,
                )
                out = self.model(**inputs)
                # [CLS] token hidden state: shape (batch, hidden_size)
                chunks.append(out.last_hidden_state[:, 0, :].cpu().numpy())
        if not chunks:
            return np.empty((0, self.model.config.hidden_size))
        return np.concatenate(chunks, axis=0)