-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathtokenizer.lua
More file actions
65 lines (49 loc) · 1.32 KB
/
tokenizer.lua
File metadata and controls
65 lines (49 loc) · 1.32 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
-- Penlight lexer module.
-- NOTE(review): `lexer` is not referenced anywhere in the visible part of this
-- file; confirm whether this dependency is still needed before removing it.
local lexer = require "pl.lexer"
-- Local alias for coroutine.yield, used by the token-emitting helpers below.
local yield = coroutine.yield
-- Module table; the public functions are attached to it and it is returned at
-- the end of the file.
local M = {}
--- Hand a "word" token to whoever resumed the running coroutine.
-- @param token text of the token, passed through unchanged
-- @return whatever values are supplied when the coroutine is next resumed
local function word(token)
  local kind = "word"
  return yield(kind, token)
end
--- Hand a "quote" token to whoever resumed the running coroutine.
-- @param token text of the token, passed through unchanged
-- @return whatever values are supplied when the coroutine is next resumed
local function quote(token)
  local kind = "quote"
  return yield(kind, token)
end
--- Hand a "space" token to whoever resumed the running coroutine.
-- @param token text of the token, passed through unchanged
-- @return whatever values are supplied when the coroutine is next resumed
local function space(token)
  local kind = "space"
  return yield(kind, token)
end
--- Hand a "tag" token to whoever resumed the running coroutine.
-- @param token text of the token, passed through unchanged
-- @return whatever values are supplied when the coroutine is next resumed
local function tag(token)
  local kind = "tag"
  return yield(kind, token)
end
--- Hand a "punct" token to whoever resumed the running coroutine.
-- @param token text of the token, passed through unchanged
-- @return whatever values are supplied when the coroutine is next resumed
local function punct(token)
  local kind = "punct"
  return yield(kind, token)
end
--- Hand an "endpunct" token to whoever resumed the running coroutine.
-- @param token text of the token, passed through unchanged
-- @return whatever values are supplied when the coroutine is next resumed
local function endpunct(token)
  local kind = "endpunct"
  return yield(kind, token)
end
--- Hand an "unknown" token to whoever resumed the running coroutine.
-- @param token text of the token, passed through unchanged
-- @return whatever values are supplied when the coroutine is next resumed
local function unknown(token)
  local kind = "unknown"
  return yield(kind, token)
end
--- Split text into lowercase word and punctuation tokens.
--
-- Normalization applied before splitting:
--   * the text is lowercased;
--   * apostrophes are removed ("don't" becomes "dont");
--   * hyphens become spaces ("well-known" yields two tokens);
--   * each of , . : ; ? ! is padded with spaces so that it comes out as a
--     token of its own.
-- Newlines count as ordinary whitespace and never appear as tokens.
--
-- @tparam string text the text to tokenize
-- @treturn function an iterator (from string.gmatch) that produces one token
--   per call and nil once the text is exhausted
-- @raise if text is not a string
function M.tokenize(text)
  if type(text) ~= "string" then
    error("tokenize: string expected, got " .. type(text), 2)
  end

  local normalized = text:lower():gsub("'", "")
  -- "-" is a magic character in Lua patterns; escape it to match it literally.
  normalized = normalized:gsub("%-", " ")
  -- A single pass over a character class pads every punctuation mark; %0 is
  -- the matched character itself.
  normalized = normalized:gsub("[,%.:;%?!]", " %0 ")

  -- Tokens are the maximal runs of non-whitespace characters.
  return normalized:gmatch("%S+")
end
--- Join tokens back into a sentence-like string.
--
-- The tokens are concatenated with single spaces, then:
--   * a leading lowercase letter is capitalized;
--   * an apostrophe standing alone between two spaces is glued to its
--     neighbours (" ' " becomes "'");
--   * the space in front of , : ; - . ? ! is removed.
--
-- @param words either a sequence table of strings, or an iterator function
--   such as the one returned by M.tokenize (it is drained into a table first)
-- @treturn string the joined text
function M.join(words)
  -- Accept an iterator as well, so that M.join(M.tokenize(text)) works.
  if type(words) == "function" then
    local collected = {}
    for token in words do
      collected[#collected + 1] = token
    end
    words = collected
  end

  -- Each gsub also returns a match count; assigning to a single local keeps
  -- only the string, so exactly one value is returned to the caller.
  local sentence = table.concat(words, " ")
    :gsub("^%l", string.upper)
    :gsub(" (') ", "%1")
    :gsub(" ([,:;%-%.%?!])", "%1")
  return sentence
end
return M