-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathLexer.cpp
More file actions
95 lines (84 loc) · 2.27 KB
/
Lexer.cpp
File metadata and controls
95 lines (84 loc) · 2.27 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
#include "Lexer.h"
// Default constructor: a lexer starts with no registered patterns/DFAs.
Lexer::Lexer() {}
// Compiles `regex` through the classic pipeline
// (parse -> Thompson NFA -> subset-construction DFA) and registers the
// resulting DFA under the token name `name`. On ties of equal match
// length, tokenize() keeps the pattern registered first.
void Lexer::addPattern(string regex, string name)
{
    RegexParser parser;
    RegexNode *syntaxTree = parser.parse(regex);

    ThompsonConstructor nfaBuilder;
    NFA *machine = nfaBuilder.construct(syntaxTree);

    SubsetConstructor dfaBuilder;
    DFA *deterministic = dfaBuilder.constructDFA(machine);

    // NOTE(review): `syntaxTree` and `machine` are never freed here —
    // confirm whether the constructor classes take ownership; if not,
    // each addPattern() call leaks the AST and the NFA.
    dfas.push_back(deterministic);
    patterns.push_back(name);
}
// Runs `dfa` over `input` starting at `pos` using maximal munch:
// follow transitions as far as possible, remembering the end of the
// last accepting prefix seen.
//
// Returns the longest prefix of input[pos..] accepted by `dfa`, or ""
// when no prefix is accepted. (An accepting start state yields a
// zero-length match, which also returns "" — same as the original.)
//
// Fix vs. original: the transition map was COPIED on every character
// (`map<...> trans = curState->transitions;`) and then looked up twice
// (find + operator[]). We now bind a reference and look up once.
string Lexer::tryMatchDFA(DFA *dfa, string input, int pos)
{
    CombinedState *curState = dfa->getStartState();

    // One-past-the-end position of the last accepting prefix; -1 = none yet.
    int lastAcceptingPos = curState->isAccepting ? pos : -1;

    for (int i = pos; i < (int)input.length(); i++)
    {
        map<char, CombinedState *> &trans = curState->transitions; // no copy
        map<char, CombinedState *>::iterator it = trans.find(input[i]);
        if (it == trans.end())
        {
            break; // no transition on this character: stop scanning
        }
        curState = it->second;
        if (curState->isAccepting)
        {
            lastAcceptingPos = i + 1; // longest accepted prefix so far
        }
    }

    if (lastAcceptingPos >= pos)
    {
        return input.substr(pos, lastAcceptingPos - pos);
    }
    return ""; // empty string signals "no match" to tokenize()
}
// Scans `input` left to right, at each position emitting the longest
// match among all registered patterns (maximal munch). Equal-length
// matches go to the pattern registered first (strict '>' comparison).
// Whitespace that no pattern matches is silently skipped; any other
// unmatched character throws runtime_error with the offending index.
//
// Fix vs. original: `isspace(input[i])` passed a raw (possibly signed,
// possibly negative) char, which is undefined behavior for bytes above
// 0x7F — cast through unsigned char first. Also drops a per-iteration
// copy of the pattern name that was made even for non-winning patterns.
vector<LexerToken> Lexer::tokenize(string input)
{
    vector<LexerToken> result;
    int len = (int)input.length();
    for (int i = 0; i < len; i++)
    {
        int longestMatch = 0;
        string longestPatternName;
        string longestValue;
        for (int j = 0; j < (int)dfas.size(); j++)
        {
            string tokenString = tryMatchDFA(dfas[j], input, i);
            if ((int)tokenString.length() > longestMatch)
            {
                longestMatch = (int)tokenString.length();
                longestPatternName = patterns[j]; // copy only the winner
                longestValue = tokenString;
            }
        }
        if (longestMatch > 0)
        {
            result.push_back(LexerToken(longestValue, longestPatternName));
            i += longestMatch - 1; // resume right after the consumed lexeme
        }
        else if (!isspace((unsigned char)input[i])) // cast avoids UB on bytes > 0x7F
        {
            throw runtime_error("Unable to tokenize character at index " + to_string(i));
        }
    }
    return result;
}