forked from sankalp1999/code_qa
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtreesitter.py
More file actions
200 lines (184 loc) · 6.79 KB
/
treesitter.py
File metadata and controls
200 lines (184 loc) · 6.79 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
from abc import ABC
from tree_sitter import Language, Parser
from tree_sitter_languages import get_language, get_parser
from enum import Enum
import logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
class LanguageEnum(Enum):
JAVA = "java"
PYTHON = "python"
RUST = "rust"
JAVASCRIPT = "javascript"
UNKNOWN = "unknown"
LANGUAGE_QUERIES = {
LanguageEnum.JAVA: {
'class_query': """
(class_declaration
name: (identifier) @class.name)
""",
'method_query': """
[
(method_declaration
name: (identifier) @method.name)
(constructor_declaration
name: (identifier) @method.name)
]
""",
'doc_query': """
((block_comment) @comment)
"""
},
LanguageEnum.PYTHON: {
'class_query': """
(class_definition
name: (identifier) @class.name)
""",
'method_query': """
(function_definition
name: (identifier) @function.name)
""",
'doc_query': """
(expression_statement
(string) @comment)
"""
},
LanguageEnum.RUST: {
'class_query': """
(struct_item
name: (type_identifier) @class.name)
""",
'method_query': """
(function_item
name: (identifier) @function.name)
""",
'doc_query': """
[
(line_comment) @comment
(block_comment) @comment
]
"""
},
LanguageEnum.JAVASCRIPT: {
'class_query': """
(class_declaration
name: (identifier) @class.name)
""",
'method_query': """
(method_definition
name: (property_identifier) @method.name)
""",
'doc_query': """
((comment) @comment)
"""
},
# Add other languages as needed
}
class TreesitterMethodNode:
def __init__(
self,
name: str,
doc_comment: str,
method_source_code: str,
node,
class_name: str = None
):
self.name = name
self.doc_comment = doc_comment
self.method_source_code = method_source_code
self.node = node
self.class_name = class_name
class TreesitterClassNode:
def __init__(
self,
name: str,
method_declarations: list,
node,
):
self.name = name
self.source_code = node.text.decode()
self.method_declarations = method_declarations
self.node = node
class Treesitter(ABC):
def __init__(self, language: LanguageEnum):
self.language_enum = language
self.parser = get_parser(language.value)
self.language_obj = get_language(language.value)
self.query_config = LANGUAGE_QUERIES.get(language)
if not self.query_config:
raise ValueError(f"Unsupported language: {language}")
# Corrected query instantiation
self.class_query = self.language_obj.query(self.query_config['class_query'])
self.method_query = self.language_obj.query(self.query_config['method_query'])
self.doc_query = self.language_obj.query(self.query_config['doc_query'])
@staticmethod
def create_treesitter(language: LanguageEnum) -> "Treesitter":
return Treesitter(language)
def parse(self, file_bytes: bytes) -> tuple[list[TreesitterClassNode], list[TreesitterMethodNode]]:
tree = self.parser.parse(file_bytes)
root_node = tree.root_node
class_results = []
method_results = []
class_name_by_node = {}
class_captures = self.class_query.captures(root_node)
class_nodes = []
for node, capture_name in class_captures:
if capture_name == 'class.name':
class_name = node.text.decode()
class_node = node.parent
logging.info(f"Found class: {class_name}")
class_name_by_node[class_node.id] = class_name
method_declarations = self._extract_methods_in_class(class_node)
class_results.append(TreesitterClassNode(class_name, method_declarations, class_node))
class_nodes.append(class_node)
method_captures = self.method_query.captures(root_node)
for node, capture_name in method_captures:
if capture_name in ['method.name', 'function.name']:
method_name = node.text.decode()
method_node = node.parent
method_source_code = method_node.text.decode()
doc_comment = self._extract_doc_comment(method_node)
parent_class_name = None
for class_node in class_nodes:
if self._is_descendant_of(method_node, class_node):
parent_class_name = class_name_by_node[class_node.id]
break
method_results.append(TreesitterMethodNode(
name=method_name,
doc_comment=doc_comment,
method_source_code=method_source_code,
node=method_node,
class_name=parent_class_name
))
return class_results, method_results
def _extract_methods_in_class(self, class_node):
method_declarations = []
# Apply method_query to the class_node
method_captures = self.method_query.captures(class_node)
for node, capture_name in method_captures:
if capture_name in ['method.name', 'function.name']:
method_declaration = node.parent.text.decode()
method_declarations.append(method_declaration)
return method_declarations
def _extract_doc_comment(self, node):
# Search for doc comments preceding the node
doc_comment = ''
current_node = node.prev_sibling
while current_node:
captures = self.doc_query.captures(current_node)
if captures:
for cap_node, cap_name in captures:
if cap_name == 'comment':
doc_comment = cap_node.text.decode() + '\n' + doc_comment
elif current_node.type not in ['comment', 'block_comment', 'line_comment', 'expression_statement']:
# Stop if we reach a node that's not a comment
break
current_node = current_node.prev_sibling
return doc_comment.strip()
def _is_descendant_of(self, node, ancestor):
# Check if 'node' is a descendant of 'ancestor'
current = node.parent
while current:
if current == ancestor:
return True
current = current.parent
return False