-
-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathkitoken.pyi
More file actions
192 lines (151 loc) · 5.33 KB
/
kitoken.pyi
File metadata and controls
192 lines (151 loc) · 5.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
from typing import Any, Optional
class Kitoken:
    """
    Kitoken tokenizer.
    A fast and versatile tokenizer for language models.
    """
    def __init__(self, data: bytes) -> None:
        """
        Initializes the tokenizer from a serialized `kitoken` definition.
        :param data: The serialized definition.
        """
        ...
    @staticmethod
    def from_file(path: str) -> Kitoken:
        """
        Deserializes the tokenizer definition from a file and initializes the tokenizer.
        :param path: The path to the file.
        """
        ...
    def to_file(self, path: str) -> None:
        """
        Creates a definition from this tokenizer and serializes it to a file.
        :param path: The path to the file.
        """
        ...
    def to_bytes(self) -> bytes:
        """
        Creates a definition from this tokenizer and serializes it to bytes.
        """
        ...
    def encode(self, text: str, encode_specials: Optional[bool] = False) -> list[int]:
        """
        Encodes the given text into a sequence of tokens.
        If `encode_specials` is `True`, the text is first split around special tokens which are separately encoded with the special encoder.
        Returns a list of tokens, or an error if no token for a part exists in the encoder and no unknown token id is set in the configuration.
        :param text: The text to encode.
        :param encode_specials: Whether to encode special tokens.
        """
        ...
    def encode_all(
        self, text: list[str], encode_specials: Optional[bool] = False
    ) -> list[list[int]]:
        """
        Encodes the given texts into sequences of tokens.
        If `encode_specials` is `True`, the text is first split around special tokens which are separately encoded with the special encoder.
        Returns a list of lists of tokens, or an error if no token for a part exists in the encoder and no unknown token id is set in the configuration.
        :param text: The texts to encode.
        :param encode_specials: Whether to encode special tokens.
        """
        ...
    def decode(self, data: list[int], decode_specials: Optional[bool] = False) -> bytes:
        """
        Decodes the given sequence of tokens into text.
        Returns the decoded bytes, or an error if no byte sequence for a token exists in the decoder and no unknown token is set in the configuration.
        :param data: The sequence of tokens to decode.
        :param decode_specials: Whether to decode special tokens.
        """
        ...
    def decode_all(
        self, data: list[list[int]], decode_specials: Optional[bool] = False
    ) -> list[bytes]:
        """
        Decodes the given sequences of tokens into texts.
        Returns a list of byte strings (one per input sequence), or an error if no byte sequence for a token exists in the decoder and no unknown token is set in the configuration.
        :param data: The sequences of tokens to decode.
        :param decode_specials: Whether to decode special tokens.
        """
        ...
    def definition(self) -> Any:
        """
        Returns the definition of the tokenizer.
        """
        ...
    def set_definition(self, definition: Any) -> None:
        """
        Sets the definition of the tokenizer.
        :param definition: The new definition.
        """
        ...
    def config(self) -> Any:
        """
        Returns the configuration of the tokenizer.
        """
        ...
    def set_config(self, config: Any) -> None:
        """
        Sets the configuration of the tokenizer.
        :param config: The new configuration.
        """
        ...
    @staticmethod
    def from_sentencepiece(data: bytes) -> Kitoken:
        """
        Initializes the tokenizer from a serialized `sentencepiece` model.
        :param data: The serialized model.
        """
        ...
    @staticmethod
    def from_sentencepiece_file(path: str) -> Kitoken:
        """
        Initializes the tokenizer from a `sentencepiece` model file.
        :param path: The path to the file.
        """
        ...
    @staticmethod
    def from_tiktoken(data: bytes) -> Kitoken:
        """
        Initializes the tokenizer from a serialized `tiktoken` model.
        :param data: The serialized model.
        """
        ...
    @staticmethod
    def from_tiktoken_file(path: str) -> Kitoken:
        """
        Initializes the tokenizer from a `tiktoken` model file.
        :param path: The path to the file.
        """
        ...
    @staticmethod
    def from_tokenizers(data: bytes) -> Kitoken:
        """
        Initializes the tokenizer from a serialized `tokenizers` model.
        :param data: The serialized model.
        """
        ...
    @staticmethod
    def from_tokenizers_file(path: str) -> Kitoken:
        """
        Initializes the tokenizer from a `tokenizers` model file.
        :param path: The path to the file.
        """
        ...
    @staticmethod
    def from_tekken(data: bytes) -> Kitoken:
        """
        Initializes the tokenizer from a serialized `tekken` model.
        :param data: The serialized model.
        """
        ...
    @staticmethod
    def from_tekken_file(path: str) -> Kitoken:
        """
        Initializes the tokenizer from a `tekken` model file.
        :param path: The path to the file.
        """
        ...
    @staticmethod
    def from_web(url: str) -> Kitoken:
        """
        Initializes the tokenizer from a model URL.
        :param url: The URL to the file.
        """
        ...