-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtokenizer.ts
More file actions
27 lines (22 loc) · 714 Bytes
/
tokenizer.ts
File metadata and controls
27 lines (22 loc) · 714 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
import chalk from 'chalk'
import type { TokenId, Vocabulary } from './vocabulary'
/**
 * Converts input text into an array of token identifiers using the provided vocabulary.
 *
 * The text is lowercased and split on runs of whitespace. Empty fragments
 * (produced by empty input or by leading/trailing whitespace) are discarded,
 * so they are never handed to the vocabulary and never reported as unknown words.
 *
 * @param inputText - Raw text to tokenize.
 * @param vocabulary - Vocabulary used to resolve each word to a token identifier.
 * @param isTrainingMode - When `true`, each word is resolved via `vocabulary.add`;
 *   when `false`, via `vocabulary.encodeWordToToken`.
 * @returns One token identifier per word, in input order. The array is empty
 *   when the input contains no words.
 * @throws Error when the vocabulary call returns `undefined` for a word.
 */
export const tokenizeText = (
  inputText: string,
  vocabulary: Vocabulary,
  isTrainingMode: boolean,
): TokenId[] => {
  // ''.split(/\s+/) is [''] and ' a '.split(/\s+/) is ['', 'a', ''], so the
  // empty fragments must be filtered out before resolving tokens.
  const words = inputText
    .toLowerCase()
    .split(/\s+/)
    .filter((word) => word.length > 0)

  return words.map((word): TokenId => {
    const tokenId: TokenId | undefined = isTrainingMode
      ? vocabulary.add(word)
      : vocabulary.encodeWordToToken(word)
    if (tokenId === undefined) {
      throw new Error(chalk.red(`Unknown word: ${word}`))
    }
    return tokenId
  })
}