-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathlexical_analysis.py
More file actions
117 lines (95 loc) · 3.65 KB
/
lexical_analysis.py
File metadata and controls
117 lines (95 loc) · 3.65 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import re
class LexicalAnalyzer:
    """A simple line-oriented lexical analyzer for a small C-like language.

    Each source line is split into raw lexemes by ``split_into_tokens`` and
    each lexeme is labelled by ``classify_tokens``.  ``run`` drives the whole
    process over ``code.txt`` and writes a tab-separated token table to
    ``output.txt``.
    """

    # Fixed vocabulary used both while splitting and while classifying.
    PATTERNS = {
        "keywords": ["const", "char", "int", "string", "float", "double",
                     "bool", "if", "else", "for", "while", "do", "void",
                     "main", "return"],
        "operators": ["+", "-", "*", "/", "=", "%", "<", ">", ">=", "<=",
                      "!=", "!"],
        "punctuation": [",", ";", "(", ")", "{", "}", "[", "]"],
    }

    # (token name, anchored regex) pairs, tried in order by classify_tokens.
    TOKEN_PATTERNS = [
        # BUGFIX: the original r"^[+-]*[0-9]*\.[0-9]*$" accepted a bare "."
        # (and repeated signs) as a float; require at least one digit.
        ("FLOAT_CONST", r"^[+-]?(?:[0-9]+\.[0-9]*|\.[0-9]+)$"),
        ("INT_CONST", r"^[0-9]+$"),
        ("CHAR_CONST", r"^'[^']'$"),
        ("STRING_CONST", r'^"[^"\n]*"$'),
        ("COMMENT", r"/\*.*?\*/"),
        # Generalized to accept a leading underscore, as C-like identifiers do.
        ("IDENTIFIER", r"^[a-zA-Z_][a-zA-Z0-9_]*$"),
    ]

    def split_into_tokens(self, line, patterns):
        """Split one source line into raw lexeme strings.

        Handles ``//`` line comments (rest of line ignored), inline
        ``/* ... */`` comments (skipped; an unterminated one discards the
        rest of the line), and quoted ``'...'`` / ``"..."`` literals, which
        are kept as single tokens even when they contain spaces or operator
        characters.  NOTE: block comments spanning multiple lines are not
        tracked across calls — each line is processed independently.

        Args:
            line: the source line (without trailing newline).
            patterns: dict providing "operators" and "punctuation" lists.

        Returns:
            list[str]: lexemes in order of appearance.
        """
        tokens = []
        current = ""  # identifier/number lexeme being accumulated
        start = 0
        while start < len(line):
            # Line comment: ignore the remainder of the line.
            if line[start:start + 2] == "//":
                break
            # Inline block comment: skip past the closing "*/".
            if line[start:start + 2] == "/*":
                end = line.find("*/", start + 2)
                if end == -1:
                    # Unterminated on this line: drop the rest.
                    return tokens
                start = end + 2
                continue
            char = line[start]
            if char.isspace():
                if current:
                    tokens.append(current)
                    current = ""
                start += 1
                continue
            # BUGFIX: recognize the two-character operators (">=", "<=",
            # "!=") listed in PATTERNS; the original only ever emitted
            # single characters, so "x >= y" split into ">", "=".
            if line[start:start + 2] in patterns["operators"]:
                if current:
                    tokens.append(current)
                    current = ""
                tokens.append(line[start:start + 2])
                start += 2
                continue
            if char in patterns["operators"] or char in patterns["punctuation"]:
                if current:
                    tokens.append(current)
                    current = ""
                tokens.append(char)
                start += 1
                continue
            # Quoted literal: scan to the matching closing quote so embedded
            # spaces/operators stay inside one token.  BUGFIX: the original
            # had a duplicated, unreachable `char == "'"` branch (dead code,
            # apparently a copy-paste slip) and never handled '"' at all,
            # shredding string literals that contained spaces.
            if char in ("'", '"'):
                if current:
                    tokens.append(current)
                    current = ""
                j = start + 1
                while j < len(line) and line[j] != char:
                    j += 1
                if j < len(line):
                    tokens.append(line[start:j + 1])
                    start = j + 1
                else:
                    # Unterminated literal: emit the remainder as one token;
                    # classify_tokens will flag it as ERROR.
                    tokens.append(line[start:])
                    start = len(line)
                continue
            current += char
            start += 1
        if current:
            tokens.append(current)
        return tokens

    def classify_tokens(self, token, line_no):
        """Label a single lexeme.

        Args:
            token: raw lexeme produced by split_into_tokens.
            line_no: 1-based source line number, echoed into the result.

        Returns:
            tuple[str, str, int]: (token type, lexeme, line number); the
            type is "ERROR" when the lexeme matches nothing known.
        """
        if token in self.PATTERNS["keywords"]:
            return ("KEYWORD", token, line_no)
        if token in self.PATTERNS["operators"]:
            return ("OPERATOR", token, line_no)
        if token in self.PATTERNS["punctuation"]:
            return ("PUNCTUATION", token, line_no)
        # Regex patterns are tried in declaration order; first match wins.
        for name, pattern in self.TOKEN_PATTERNS:
            if re.match(pattern, token):
                return (name, token, line_no)
        return ("ERROR", token, line_no)

    def run(self):
        """Tokenize ``code.txt`` and write a Token/Lexeme/Line table to
        ``output.txt``.

        Raises:
            OSError: if either file cannot be opened.
        """
        with open("code.txt", "r", encoding="utf-8") as f:
            lines = f.readlines()
        with open("output.txt", "w", encoding="utf-8") as out:
            out.write("Token\tLexeme\tLine No\n")
            for line_no, line in enumerate(lines, start=1):
                raw_tokens = self.split_into_tokens(line.strip(), self.PATTERNS)
                for t in raw_tokens:
                    token_type, lexeme, ln = self.classify_tokens(t, line_no)
                    out.write(f"{token_type}\t{lexeme}\t{ln}\n")
if __name__ == "__main__":
    # Script entry point: tokenize code.txt into output.txt.
    analyzer = LexicalAnalyzer()
    analyzer.run()