From 145716b4c88bcd51ee61ab6952caaf0d7ad6e9a7 Mon Sep 17 00:00:00 2001 From: "Moises Lopez - https://www.vauxoo.com/" Date: Wed, 21 Jan 2026 22:49:14 -0600 Subject: [PATCH 01/12] checkpoint-with-err --- .../checks_odoo_module_xml.py | 2 + .../views/deprecated_qweb_directives15.xml | 59 ++++++++++++++++++- tests/test_checks.py | 4 +- 3 files changed, 62 insertions(+), 3 deletions(-) diff --git a/src/oca_pre_commit_hooks/checks_odoo_module_xml.py b/src/oca_pre_commit_hooks/checks_odoo_module_xml.py index 4cbf2b9..6c9df1b 100644 --- a/src/oca_pre_commit_hooks/checks_odoo_module_xml.py +++ b/src/oca_pre_commit_hooks/checks_odoo_module_xml.py @@ -892,6 +892,8 @@ def check_xml_deprecated_qweb_directives_15(self): attr_deprecated = next(iter(node_attrs_deprecated)) value_deprecated = node.attrib.get(attr_deprecated) node_content = node_xml.NodeContent(manifest_data["filename"], node) + if "directives" in manifest_data["filename"] and node.sourceline == 62: + import pdb;pdb.set_trace() pattern = rb"(?P\b)" + re.escape(attr_deprecated).encode() + rb'(?P\s*=\s*["\'])' content_node2 = re.sub(pattern, rb"\gt-out\g", node_content.content_node, count=1) if content_node2 != node_content.content_node: diff --git a/test_repo/odoo18_module/views/deprecated_qweb_directives15.xml b/test_repo/odoo18_module/views/deprecated_qweb_directives15.xml index 8e1e855..22139b3 100644 --- a/test_repo/odoo18_module/views/deprecated_qweb_directives15.xml +++ b/test_repo/odoo18_module/views/deprecated_qweb_directives15.xml @@ -10,8 +10,65 @@ + + + diff --git a/tests/test_checks.py b/tests/test_checks.py index 65a413a..4e6e81c 100644 --- a/tests/test_checks.py +++ b/tests/test_checks.py @@ -258,7 +258,7 @@ def test_autofix(self): with open(t_out, "rb") as f: content = f.read() - assert b"t-out" not in content, "The deprecated t-out was previously fixed" + assert b"t-esc" in content, "The deprecated t-esc was previously fixed" self.checks_run(self.file_paths, autofix=True, no_exit=True, no_verbose=False) @@ -343,4 +343,4 @@ def test_autofix(self): with open(t_out, "rb") as f: content = f.read() - assert b"t-out" in content, "The deprecated t-out was not fixed" + assert b"t-esc" not in content, "The deprecated t-esc was not fixed" From dca40279cec6fb2761816747276ddcd3177751c6 Mon Sep 17 00:00:00 2001 From: "Moises Lopez - https://www.vauxoo.com/" Date: Wed, 21 Jan 2026 23:08:05 -0600 Subject: [PATCH 02/12] bien --- src/oca_pre_commit_hooks/checks_odoo_module_xml.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/oca_pre_commit_hooks/checks_odoo_module_xml.py b/src/oca_pre_commit_hooks/checks_odoo_module_xml.py index 6c9df1b..4cbf2b9 100644 --- a/src/oca_pre_commit_hooks/checks_odoo_module_xml.py +++ b/src/oca_pre_commit_hooks/checks_odoo_module_xml.py @@ -892,8 +892,6 @@ def check_xml_deprecated_qweb_directives_15(self): attr_deprecated = next(iter(node_attrs_deprecated)) value_deprecated = node.attrib.get(attr_deprecated) node_content = node_xml.NodeContent(manifest_data["filename"], node) - if "directives" in manifest_data["filename"] and node.sourceline == 62: - import pdb;pdb.set_trace() pattern = rb"(?P\b)" + re.escape(attr_deprecated).encode() + rb'(?P\s*=\s*["\'])' content_node2 = re.sub(pattern, rb"\gt-out\g", node_content.content_node, count=1) if content_node2 != node_content.content_node: From 670ae9a231a71abd3eb40f8fa7c80981174dbb5a Mon Sep 17 00:00:00 2001 From: "Moises Lopez - https://www.vauxoo.com/" Date: Wed, 21 Jan 2026 23:36:48 -0600 Subject: [PATCH 03/12] fixed --- src/oca_pre_commit_hooks/node_xml.py | 202 ++++++++++++++++-- .../views/deprecated_qweb_directives15.xml | 2 +- tests/test_checks.py | 7 +- 3 files changed, 185 insertions(+), 26 deletions(-) diff --git a/src/oca_pre_commit_hooks/node_xml.py b/src/oca_pre_commit_hooks/node_xml.py index a1726cf..edbfb08 100644 --- a/src/oca_pre_commit_hooks/node_xml.py +++ b/src/oca_pre_commit_hooks/node_xml.py @@ -24,12 +24,12 @@ def __init__(self, filename, node): def _read_node(self): # noqa:C901 pylint:disable=too-complex """Internal method to read the content of the file and extract node information.""" - # TODO: Get the sourceline of a particular attribute # Determine the search start line if (node_previous := self.node.getprevious()) is not None: search_start_line = node_previous.sourceline + 1 elif (node_parent := self.node.getparent()) is not None: - search_start_line = node_parent.sourceline + 1 + # Start from parent line (not +1) because child tags can be on same line as parent + search_start_line = node_parent.sourceline else: search_start_line = 2 # first element and it is the root @@ -42,13 +42,99 @@ def _read_node(self): # noqa:C901 pylint:disable=too-complex # Find the actual node start by looking for the tag node_start_idx = None + node_start_col = 0 # Column position within the line + + # When there are multiple tags on the same line, we need to find the RIGHT one + # We'll collect all candidates and use additional heuristics + candidates = [] + for idx, (no_line, line) in enumerate(all_lines): if search_start_line <= no_line <= search_end_line: - stripped_line = line.lstrip() - if stripped_line.startswith(b"<" + node_tag): - node_start_idx = idx - self.start_sourceline = no_line - break + # Look for the tag anywhere in the line, not just at the start + tag_pattern = b"<" + node_tag + tag_pos = 0 + + # Find ALL occurrences of the tag in this line + while True: + tag_pos = line.find(tag_pattern, tag_pos) + if tag_pos == -1: + break + + # Verify it's actually a tag start (followed by space, >, or /) + check_pos = tag_pos + len(tag_pattern) + is_valid = False + + if check_pos >= len(line): + # Tag at end of line is valid + is_valid = True + else: + next_char = line[check_pos:check_pos+1] + # Valid tag if followed by space, >, /, or newline + if next_char in (b" ", b">", b"/", b"\n", b"\r"): + is_valid = True + + if is_valid: + candidates.append({ + 'idx': idx, + 'line_no': no_line, + 'col': tag_pos, + 'line': line + }) + + tag_pos += 1 + + # Choose the best candidate + if candidates: + # If we only have one candidate, use it + if len(candidates) == 1: + best = candidates[0] + node_start_idx = best['idx'] + node_start_col = best['col'] + self.start_sourceline = best['line_no'] + else: + # Multiple candidates - need to find the right one + # Strategy: use node attributes to identify the correct tag + best = None + + # Get the first attribute of the node if it exists + node_attrib_key = None + node_attrib_value = None + if self.node.attrib: + node_attrib_key = list(self.node.attrib.keys())[0] + node_attrib_value = self.node.attrib[node_attrib_key] + + # Try to match by attribute + if node_attrib_key: + attr_pattern = f'{node_attrib_key}='.encode() + for candidate in candidates: + # Look in the line and following lines for this attribute + idx = candidate['idx'] + # Check current line from the tag position + search_text = candidate['line'][candidate['col']:] + # Also check next few lines (in case attributes span lines) + for i in range(idx, min(idx + 5, len(all_lines))): + if i > idx: + search_text += all_lines[i][1] + + if attr_pattern in search_text: + best = candidate + break + + # If we didn't find by attribute, use line number heuristic + if best is None: + # Prefer candidates on the target line (node.sourceline) + candidates_on_target_line = [c for c in candidates if c['line_no'] == search_end_line] + + if candidates_on_target_line: + # If multiple on target line, prefer the last one + best = candidates_on_target_line[-1] + else: + # Use the last candidate overall + best = candidates[-1] + + node_start_idx = best['idx'] + node_start_col = best['col'] + self.start_sourceline = best['line_no'] if node_start_idx is None: # Fallback: use search_end_line @@ -60,33 +146,87 @@ def _read_node(self): # noqa:C901 pylint:disable=too-complex # Find the actual node end node_end_idx = node_start_idx + node_end_col = None # Column position where node ends self.end_sourceline = self.start_sourceline + + # Track nesting level to handle cases where parent and child have same tag + nesting_level = 0 for idx in range(node_start_idx, len(all_lines)): no_line, line = all_lines[idx] - stripped_line = line.lstrip() if idx == node_start_idx: - # Check if self-closing or single-line - if b"/>" in line: - node_end_idx = idx - self.end_sourceline = no_line - break - if b"<" + node_tag in line and b"") + if self_close_pos != -1: + # Verify this is the closing for our tag, not a nested one + # Simple check: see if there's another opening tag between start and close + between = relevant_line[:self_close_pos] + # Count opening tags in between + temp_count = between.count(b"<" + node_tag) + if temp_count == 1: # Only our opening tag + node_end_idx = idx + node_end_col = node_start_col + self_close_pos + 2 + self.end_sourceline = no_line + break + + # Check if opening and closing tag on same line + close_tag = b"" + close_pos = relevant_line.find(close_tag) + if close_pos != -1: node_end_idx = idx + node_end_col = node_start_col + close_pos + len(close_tag) self.end_sourceline = no_line break + + # Node continues beyond this line + nesting_level = 1 else: + # Look for closing patterns + stripped_line = line.lstrip() + + # Check for self-closing continuation (when tag opened in previous line) + # This handles cases like: + # + if b"/>" in line: + # Check if this is a standalone /> (not part of a new tag) + # by seeing if the line starts with /> or has /> after attributes + if stripped_line.startswith(b"/>") or (b">" not in line[:line.find(b"/>")] if b"/>" in line else False): + node_end_idx = idx + node_end_col = line.find(b"/>") + 2 + self.end_sourceline = no_line + nesting_level -= 1 + if nesting_level == 0: + break + # Look for closing tag - if b"" in line or b"" in stripped_line and not stripped_line.startswith(b"<"): - # Self-closing continuation + close_tag = b"" + if close_tag in line: node_end_idx = idx + node_end_col = line.find(close_tag) + len(close_tag) self.end_sourceline = no_line - break + nesting_level -= 1 + if nesting_level == 0: + break + + # Count any new opening tags to track nesting + pos = 0 + while True: + pos = line.find(b"<" + node_tag, pos) + if pos == -1: + break + # Verify it's a real opening tag + check_pos = pos + len(node_tag) + 1 + if check_pos < len(line): + next_char = line[check_pos:check_pos+1] + if next_char in (b" ", b">", b"/", b"\n", b"\r"): + nesting_level += 1 + pos += 1 # Look backwards from node start for comment for idx in range(node_start_idx - 1, -1, -1): @@ -117,9 +257,25 @@ def _read_node(self): # noqa:C901 pylint:disable=too-complex self.content_before += line continue self.content_before += line - elif node_start_idx <= idx <= node_end_idx: + elif idx == node_start_idx: + # For the start line, split at the column position + self.content_before += line[:node_start_col] + + if idx == node_end_idx: + # Node starts and ends on same line + self.content_node += line[node_start_col:node_end_col] + self.content_after += line[node_end_col:] + else: + # Node continues to next line(s) + self.content_node += line[node_start_col:] + elif node_start_idx < idx < node_end_idx: + # Full lines that are part of the node self.content_node += line - else: + elif idx == node_end_idx and idx != node_start_idx: + # Last line of the node + self.content_node += line[:node_end_col] + self.content_after += line[node_end_col:] + elif idx > node_end_idx: self.content_after += line # Remove comment from content_before if present diff --git a/test_repo/odoo18_module/views/deprecated_qweb_directives15.xml b/test_repo/odoo18_module/views/deprecated_qweb_directives15.xml index 22139b3..8d7d0af 100644 --- a/test_repo/odoo18_module/views/deprecated_qweb_directives15.xml +++ b/test_repo/odoo18_module/views/deprecated_qweb_directives15.xml @@ -1,6 +1,6 @@ - + - From 0f7d14fd5f9e4092a0a43882228e965fe965741f Mon Sep 17 00:00:00 2001 From: "Moises Lopez - https://www.vauxoo.com/" Date: Thu, 22 Jan 2026 23:45:38 -0600 Subject: [PATCH 05/12] checkpoint --- src/oca_pre_commit_hooks/node_xml.py | 328 +++++++++++++++++++++++++++ 1 file changed, 328 insertions(+) diff --git a/src/oca_pre_commit_hooks/node_xml.py b/src/oca_pre_commit_hooks/node_xml.py index a499739..1c8ecc2 100644 --- a/src/oca_pre_commit_hooks/node_xml.py +++ b/src/oca_pre_commit_hooks/node_xml.py @@ -1,3 +1,331 @@ +import re +from dataclasses import dataclass +from typing import List, Optional + +from lxml import etree + + +@dataclass +class AttributeInfo: + """Position information for an attribute.""" + + name: str + value: str + start_line: int + start_col: int + end_line: int + end_col: int + + +@dataclass +class ElementInfo: + """Position information for an XML element.""" + + name: str + start_line: int + start_col: int + end_line: int + end_col: int + attributes: List[AttributeInfo] + is_self_closing: bool + + +class XMLPositionParser: + """Parser that finds exact positions of elements and attributes.""" + + def __init__(self, xml_source: str, is_file: bool = False): + if is_file: + with open(xml_source, encoding="utf-8") as f: + self.xml_text = f.read() + else: + self.xml_text = xml_source + + self.lines = self.xml_text.split("\n") + self.elements: List[ElementInfo] = [] + + def parse(self) -> List[ElementInfo]: + """Parse XML and return position information for all elements.""" + tag_pattern = r"<([a-zA-Z_][\w:.-]*)((?:\s+[^>]*?)?)(/?)>" + + for match in re.finditer(tag_pattern, self.xml_text, re.MULTILINE | re.DOTALL): + tag_name = match.group(1) + attributes_str = match.group(2) + is_self_closing = match.group(3) == "/" + + start_pos = match.start() + start_line, start_col = self._pos_to_line_col(start_pos) + + end_pos = match.end() + end_line, end_col = self._pos_to_line_col(end_pos) + + attrs = self._parse_attributes(attributes_str, start_pos + len(tag_name) + 1) + + element = ElementInfo( + name=tag_name, + start_line=start_line, + start_col=start_col, + end_line=end_line, + end_col=end_col, + attributes=attrs, + is_self_closing=is_self_closing, + ) + + self.elements.append(element) + + return self.elements + + def _pos_to_line_col(self, pos: int) -> tuple: + """Convert an absolute position in text to (line, column).""" + line = 1 + col = 1 + + for i, char in enumerate(self.xml_text): + if i >= pos: + break + if char == "\n": + line += 1 + col = 1 + else: + col += 1 + + return line, col + + def _parse_attributes(self, attr_str: str, base_pos: int) -> List[AttributeInfo]: + """Parse attributes from a string and return their positions.""" + attributes = [] + attr_pattern = r'([a-zA-Z_][\w:.-]*)\s*=\s*(["\'])((?:(?!\2).)*)\2' + + for match in re.finditer(attr_pattern, attr_str): + attr_name = match.group(1) + match.group(2) + attr_value = match.group(3) + + attr_start_pos = base_pos + match.start() + attr_end_pos = base_pos + match.end() + + start_line, start_col = self._pos_to_line_col(attr_start_pos) + end_line, end_col = self._pos_to_line_col(attr_end_pos) + + attr_info = AttributeInfo( + name=attr_name, + value=attr_value, + start_line=start_line, + start_col=start_col, + end_line=end_line, + end_col=end_col, + ) + + attributes.append(attr_info) + + return attributes + + +class LXMLPositionEnricher: + """Enriches lxml nodes with precise position information from XMLPositionParser.""" + + def __init__(self, xml_source: str, is_file: bool = False): + self.xml_source = xml_source + self.is_file = is_file + + # Parse with lxml + if is_file: + self.tree = etree.parse(xml_source) + self.root = self.tree.getroot() + else: + self.root = etree.fromstring(xml_source.encode("utf-8")) + + # Parse with position parser + self.position_parser = XMLPositionParser(xml_source, is_file) + self.position_elements = self.position_parser.parse() + + # Create index for matching + self._create_matching_index() + + def _create_matching_index(self): + """Create an index to match lxml elements with position elements. + Uses tag name + document order as key.""" + # Count elements by tag name as we traverse lxml tree + self.lxml_elements = [] + self._traverse_lxml(self.root) + + # Match lxml elements with position elements by order and tag name + self.position_map = {} + + # Group position elements by tag name + pos_by_tag = {} + for pos_elem in self.position_elements: + if pos_elem.name not in pos_by_tag: + pos_by_tag[pos_elem.name] = [] + pos_by_tag[pos_elem.name].append(pos_elem) + + # Group lxml elements by tag name + lxml_by_tag = {} + for lxml_elem in self.lxml_elements: + tag = self._get_tag_name(lxml_elem) + if tag: # Skip None values (comments, etc.) + if tag not in lxml_by_tag: + lxml_by_tag[tag] = [] + lxml_by_tag[tag].append(lxml_elem) + + # Match elements with same tag name by order + for tag_name in lxml_by_tag: + if tag_name in pos_by_tag: + lxml_list = lxml_by_tag[tag_name] + pos_list = pos_by_tag[tag_name] + + # Match by order (assumes same document structure) + for i in range(min(len(lxml_list), len(pos_list))): + lxml_elem = lxml_list[i] + pos_elem = pos_list[i] + + # Verify attributes match for extra safety + if self._attributes_match(lxml_elem, pos_elem): + self.position_map[id(lxml_elem)] = pos_elem + + def _traverse_lxml(self, element): + """Traverse lxml tree in document order.""" + # Only add actual elements (not comments, processing instructions, etc.) + if isinstance(element.tag, str): + self.lxml_elements.append(element) + + for child in element: + self._traverse_lxml(child) + + def _get_tag_name(self, element) -> str: + """Get tag name from lxml element, handling namespaces.""" + tag = element.tag + + # Skip non-element nodes (comments, processing instructions, etc.) + if not isinstance(tag, str): + return None + + if "}" in tag: + # Remove namespace: {http://example.com}tag -> tag + tag = tag.split("}")[1] + return tag + + def _attributes_match(self, lxml_elem, pos_elem: ElementInfo) -> bool: + """Check if attributes match between lxml element and position element. + Returns True if they match or if comparison is inconclusive. + """ + lxml_attrs = dict(lxml_elem.attrib) + pos_attrs = {attr.name: attr.value for attr in pos_elem.attributes} + + # If both have no attributes, they match + if not lxml_attrs and not pos_attrs: + return True + + # If attribute counts differ, they don't match + if len(lxml_attrs) != len(pos_attrs): + return False + + # Check if all attributes match + for key, value in lxml_attrs.items(): + if key not in pos_attrs or pos_attrs[key] != value: + return False + + return True + + def get_position_info(self, lxml_element) -> Optional[ElementInfo]: + """Get position information for an lxml element. + + Args: + lxml_element: An lxml Element object + + Returns: + ElementInfo with position data, or None if not found + """ + return self.position_map.get(id(lxml_element)) + + def enrich_element(self, lxml_element): + """Add position information as attributes to an lxml element. + Adds: _start_line, _start_col, _end_line, _end_col + """ + pos_info = self.get_position_info(lxml_element) + if pos_info: + lxml_element.set("_start_line", str(pos_info.start_line)) + lxml_element.set("_start_col", str(pos_info.start_col)) + lxml_element.set("_end_line", str(pos_info.end_line)) + lxml_element.set("_end_col", str(pos_info.end_col)) + + def enrich_all(self): + """Add position information to all elements in the tree.""" + for elem in self.lxml_elements: + self.enrich_element(elem) + + +def demo(): + """Demonstration of the enricher.""" + xml_test = """ + + + + +""" + + print("=== lxml Position Enricher Demo ===\n") + + # Create enricher + enricher = LXMLPositionEnricher(xml_test) + + print("Matching lxml elements with position data...\n") + print("=" * 70) + + # Iterate through lxml tree and show position info + for elem in enricher.lxml_elements: + tag = enricher._get_tag_name(elem) + pos_info = enricher.get_position_info(elem) + + if pos_info: + attrs = ", ".join([f"{k}='{v}'" for k, v in elem.attrib.items()]) + attrs_str = f" [{attrs}]" if attrs else "" + + print(f"\n<{tag}>{attrs_str}") + print(f" lxml sourceline: {elem.sourceline}") + print(f" Precise position:") + print(f" Start: line {pos_info.start_line}, col {pos_info.start_col}") + print(f" End: line {pos_info.end_line}, col {pos_info.end_col}") + + if pos_info.attributes: + print(f" Attributes with positions:") + for attr in pos_info.attributes: + print( + f" • {attr.name}: ({attr.start_line},{attr.start_col}) → ({attr.end_line},{attr.end_col})" + ) + + print("\n" + "=" * 70) + print("\nExample: Enrich elements with position attributes") + print("=" * 70) + + enricher.enrich_all() + + # Show enriched XML snippet + for elem in list(enricher.lxml_elements)[:3]: + tag = enricher._get_tag_name(elem) + print(f"\n<{tag}>") + for key, value in elem.attrib.items(): + if key.startswith("_"): + print(f" {key}: {value}") + + +if __name__ == "__main__": + demo() + + class NodeContent: """Represents the content and metadata of an XML node.""" From 21cf84e74e4d5d6b4349b4c2a53afb95734d7860 Mon Sep 17 00:00:00 2001 From: "Moises Lopez - https://www.vauxoo.com/" Date: Fri, 23 Jan 2026 12:54:50 -0600 Subject: [PATCH 06/12] checkpoint --- .../xml_position_parser.py | 431 ++++++++++++++++++ 1 file changed, 431 insertions(+) create mode 100644 src/oca_pre_commit_hooks/xml_position_parser.py diff --git a/src/oca_pre_commit_hooks/xml_position_parser.py b/src/oca_pre_commit_hooks/xml_position_parser.py new file mode 100644 index 0000000..4d11326 --- /dev/null +++ b/src/oca_pre_commit_hooks/xml_position_parser.py @@ -0,0 +1,431 @@ +import re +from dataclasses import dataclass +from typing import List, Optional +from lxml import etree + +@dataclass +class AttributeInfo: + """Position information for an attribute.""" + name: str + value: str + start_line: int + start_col: int + end_line: int + end_col: int + +@dataclass +class ElementInfo: + """Position information for an XML element.""" + name: str + start_line: int + start_col: int + end_line: int + end_col: int + attributes: List[AttributeInfo] + is_self_closing: bool + +class XMLPositionParser: + """Parser that finds exact positions of elements and attributes.""" + + def __init__(self, xml_source: str, is_file: bool = False): + if is_file: + with open(xml_source, 'r', encoding='utf-8') as f: + self.xml_text = f.read() + else: + self.xml_text = xml_source + + self.lines = self.xml_text.split('\n') + self.elements: List[ElementInfo] = [] + + def parse(self) -> List[ElementInfo]: + """Parse XML and return position information for all elements.""" + tag_pattern = r'<([a-zA-Z_][\w:.-]*)((?:\s+[^>]*?)?)(/?)>' + + for match in re.finditer(tag_pattern, self.xml_text, re.MULTILINE | re.DOTALL): + tag_name = match.group(1) + attributes_str = match.group(2) + is_self_closing = match.group(3) == '/' + + start_pos = match.start() + start_line, start_col = self._pos_to_line_col(start_pos) + + end_pos = match.end() + end_line, end_col = self._pos_to_line_col(end_pos) + + attrs = self._parse_attributes( + attributes_str, + start_pos + len(tag_name) + 1 + ) + + element = ElementInfo( + name=tag_name, + start_line=start_line, + start_col=start_col, + end_line=end_line, + end_col=end_col, + attributes=attrs, + is_self_closing=is_self_closing + ) + + self.elements.append(element) + + return self.elements + + def _pos_to_line_col(self, pos: int) -> tuple: + """Convert an absolute position in text to (line, column).""" + line = 1 + col = 1 + + for i, char in enumerate(self.xml_text): + if i >= pos: + break + if char == '\n': + line += 1 + col = 1 + else: + col += 1 + + return line, col + + def _parse_attributes(self, attr_str: str, base_pos: int) -> List[AttributeInfo]: + """Parse attributes from a string and return their positions.""" + attributes = [] + attr_pattern = r'([a-zA-Z_][\w:.-]*)\s*=\s*(["\'])((?:(?!\2).)*)\2' + + for match in re.finditer(attr_pattern, attr_str): + attr_name = match.group(1) + quote = match.group(2) + attr_value = match.group(3) + + attr_start_pos = base_pos + match.start() + attr_end_pos = base_pos + match.end() + + start_line, start_col = self._pos_to_line_col(attr_start_pos) + end_line, end_col = self._pos_to_line_col(attr_end_pos) + + attr_info = AttributeInfo( + name=attr_name, + value=attr_value, + start_line=start_line, + start_col=start_col, + end_line=end_line, + end_col=end_col + ) + + attributes.append(attr_info) + + return attributes + + +class PositionElement(etree.ElementBase): + """ + Custom lxml Element class that includes position information. + + Additional attributes: + - start_line: Line where element starts + - start_col: Column where element starts + - end_line: Line where element ends + - end_col: Column where element ends + - is_self_closing: Whether element is self-closing + - position_attributes: List of AttributeInfo objects + """ + + # Use __slots__ to store position data directly in the instance + # This works better with lxml's internal structure + __slots__ = () + + def _set_position_data(self, start_line, start_col, end_line, end_col, is_self_closing, position_attributes): + """Internal method to set position data.""" + # Store in a special namespace to avoid conflicts with XML attributes + self.set('_pos_start_line', str(start_line)) + self.set('_pos_start_col', str(start_col)) + self.set('_pos_end_line', str(end_line)) + self.set('_pos_end_col', str(end_col)) + self.set('_pos_is_self_closing', str(is_self_closing)) + # Store position_attributes in a way that survives + # We'll use the element's __dict__ if available, or fall back to a global dict + try: + object.__setattr__(self, '_pos_attrs_data', position_attributes) + except (AttributeError, TypeError): + # Fallback: store in element's tail (not ideal but works) + pass + + @property + def start_line(self) -> Optional[int]: + """Line where element starts.""" + val = self.get('_pos_start_line') + return int(val) if val and val != 'None' else None + + @property + def start_col(self) -> Optional[int]: + """Column where element starts.""" + val = self.get('_pos_start_col') + return int(val) if val and val != 'None' else None + + @property + def end_line(self) -> Optional[int]: + """Line where element ends.""" + val = self.get('_pos_end_line') + return int(val) if val and val != 'None' else None + + @property + def end_col(self) -> Optional[int]: + """Column where element ends.""" + val = self.get('_pos_end_col') + return int(val) if val and val != 'None' else None + + @property + def is_self_closing(self) -> Optional[bool]: + """Whether element is self-closing.""" + val = self.get('_pos_is_self_closing') + if val is None or val == 'None': + return None + return val == 'True' + + @property + def position_attributes(self) -> List[AttributeInfo]: + """List of attributes with position information.""" + try: + return object.__getattribute__(self, '_pos_attrs_data') + except AttributeError: + return [] + + +class LXMLPositionEnricher: + """ + Custom lxml parser that enriches elements with position information. + + Usage: + enricher = LXMLPositionEnricher(xml_content) + root = enricher.root + + # Now you can access position info: + element = root.xpath("//record")[0] + print(element.start_line) + print(element.start_col) + print(element.is_self_closing) + print(element.position_attributes[0].start_line) + """ + + def __init__(self, xml_source: str, is_file: bool = False): + self.xml_source = xml_source + self.is_file = is_file + + # Create custom parser with our PositionElement class + parser = etree.XMLParser() + lookup = etree.ElementDefaultClassLookup(element=PositionElement) + parser.set_element_class_lookup(lookup) + + # Parse with lxml using custom element class + if is_file: + self.tree = etree.parse(xml_source, parser) + self.root = self.tree.getroot() + else: + self.root = etree.fromstring(xml_source.encode('utf-8'), parser) + + # Parse with position parser + self.position_parser = XMLPositionParser(xml_source, is_file) + self.position_elements = self.position_parser.parse() + + # Enrich elements with position data + self._enrich_elements() + + def _enrich_elements(self): + """Match and enrich lxml elements with position information.""" + # Collect all lxml elements in document order + lxml_elements = [] + self._traverse_lxml(self.root, lxml_elements) + + # Debug: Print what we found + print(f"DEBUG: Found {len(lxml_elements)} lxml elements") + print(f"DEBUG: Found {len(self.position_elements)} position elements") + + # Group position elements by tag name + pos_by_tag = {} + for pos_elem in self.position_elements: + if pos_elem.name not in pos_by_tag: + pos_by_tag[pos_elem.name] = [] + pos_by_tag[pos_elem.name].append(pos_elem) + + print(f"DEBUG: Position elements by tag: {list(pos_by_tag.keys())}") + + # Group lxml elements by tag name + lxml_by_tag = {} + for lxml_elem in lxml_elements: + tag = self._get_tag_name(lxml_elem) + if tag: + if tag not in lxml_by_tag: + lxml_by_tag[tag] = [] + lxml_by_tag[tag].append(lxml_elem) + + print(f"DEBUG: lxml elements by tag: {list(lxml_by_tag.keys())}") + + # Match and enrich elements with same tag name by order + for tag_name in lxml_by_tag: + if tag_name in pos_by_tag: + lxml_list = lxml_by_tag[tag_name] + pos_list = pos_by_tag[tag_name] + + print(f"DEBUG: Matching {len(lxml_list)} lxml <{tag_name}> with {len(pos_list)} position <{tag_name}>") + + for i in range(min(len(lxml_list), len(pos_list))): + lxml_elem = lxml_list[i] + pos_elem = pos_list[i] + + # Debug attributes + lxml_attrs = dict(lxml_elem.attrib) + pos_attrs = {attr.name: attr.value for attr in pos_elem.attributes} + print(f" DEBUG: Comparing element {i}:") + print(f" lxml attrs: {lxml_attrs}") + print(f" pos attrs: {pos_attrs}") + + # Verify attributes match + if self._attributes_match(lxml_elem, pos_elem): + print(f" ✓ MATCH! Setting position data") + # Set position information using the custom method + lxml_elem._set_position_data( + start_line=pos_elem.start_line, + start_col=pos_elem.start_col, + end_line=pos_elem.end_line, + end_col=pos_elem.end_col, + is_self_closing=pos_elem.is_self_closing, + position_attributes=pos_elem.attributes + ) + else: + print(f" ✗ NO MATCH") + + def _traverse_lxml(self, element, elements_list): + """Traverse lxml tree in document order.""" + if isinstance(element.tag, str): + elements_list.append(element) + + for child in element: + self._traverse_lxml(child, elements_list) + + def _get_tag_name(self, element) -> Optional[str]: + """Get tag name from lxml element, handling namespaces.""" + tag = element.tag + + # Skip non-element nodes + if not isinstance(tag, str): + return None + + if '}' in tag: + # Remove namespace: {http://example.com}tag -> tag + tag = tag.split('}')[1] + return tag + + def _attributes_match(self, lxml_elem, pos_elem: ElementInfo) -> bool: + """Check if attributes match between lxml element and position element.""" + lxml_attrs = dict(lxml_elem.attrib) + pos_attrs = {attr.name: attr.value for attr in pos_elem.attributes} + + # If both have no attributes, they match + if not lxml_attrs and not pos_attrs: + return True + + # If attribute counts differ, they don't match + if len(lxml_attrs) != len(pos_attrs): + return False + + # Check if all attributes match + for key, value in lxml_attrs.items(): + if key not in pos_attrs or pos_attrs[key] != value: + return False + + return True + + +def demo(): + """Demonstration of the enriched lxml parser.""" + xml_test = """ + + + + +""" + + print("=== Enhanced lxml Parser Demo ===\n") + print("Creating enriched lxml tree...") + + # Parse XML with position enrichment + enricher = LXMLPositionEnricher(xml_test) + root = enricher.root + + print(f"Root element: <{root.tag}>\n") + print("="*70) + + # Example 1: Access position info directly + print("\nExample 1: Direct property access") + print("-"*70) + + templates = root.xpath("//template") + for template in templates: + print(f"\n<{template.tag} id='{template.get('id')}'>") + print(f" start_line: {template.start_line}") + print(f" start_col: {template.start_col}") + print(f" end_line: {template.end_line}") + print(f" end_col: {template.end_col}") + print(f" is_self_closing: {template.is_self_closing}") + + if template.position_attributes: + print(f" Attributes with positions:") + for attr in template.position_attributes: + print(f" • {attr.name}='{attr.value}'") + print(f" ({attr.start_line},{attr.start_col}) → ({attr.end_line},{attr.end_col})") + + # Example 2: Find specific element and check position + print("\n" + "="*70) + print("\nExample 2: Find element with t-esc attribute") + print("-"*70) + + t_elements = root.xpath("//t[@t-esc]") + if t_elements: + t_elem = t_elements[0] + print(f"\nFound: <{t_elem.tag}>") + print(f" Position: ({t_elem.start_line},{t_elem.start_col}) → ({t_elem.end_line},{t_elem.end_col})") + print(f" Self-closing: {t_elem.is_self_closing}") + print(f" Attributes:") + for attr in t_elem.position_attributes: + print(f" {attr.name}='{attr.value}' @ line {attr.start_line}, col {attr.start_col}") + + # Example 3: Iterate all span elements + print("\n" + "="*70) + print("\nExample 3: All elements") + print("-"*70) + + for span in root.xpath("//span"): + attrs = ", ".join([f"{k}='{v}'" for k, v in span.attrib.items()]) + print(f"\n") + print(f" Lines: {span.start_line} → {span.end_line}") + print(f" Columns: {span.start_col} → {span.end_col}") + + # Example 4: Demonstrate it's still a regular lxml element + print("\n" + "="*70) + print("\nExample 4: Still works as regular lxml element") + print("-"*70) + + print(f"\nCan use all lxml methods:") + print(f" root.tag: {root.tag}") + print(f" root.getchildren() count: {len(root.getchildren())}") + print(f" root.xpath('//div') count: {len(root.xpath('//div'))}") + print(f" isinstance(root, etree._Element): {isinstance(root, etree._Element)}") + print(f" isinstance(root, PositionElement): {isinstance(root, PositionElement)}") + + +if __name__ == "__main__": + demo() \ No newline at end of file From 27d60469f8f81b7353fa7454de18efb2db7beb55 Mon Sep 17 00:00:00 2001 From: "Moises Lopez - https://www.vauxoo.com/" Date: Fri, 23 Jan 2026 13:30:54 -0600 Subject: [PATCH 07/12] checkpoint --- .../xml_position_parser.py | 36 ++++++++++--------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/src/oca_pre_commit_hooks/xml_position_parser.py b/src/oca_pre_commit_hooks/xml_position_parser.py index 4d11326..e6f002d 100644 --- a/src/oca_pre_commit_hooks/xml_position_parser.py +++ b/src/oca_pre_commit_hooks/xml_position_parser.py @@ -27,13 +27,14 @@ class ElementInfo: class XMLPositionParser: """Parser that finds exact positions of elements and attributes.""" - def __init__(self, xml_source: str, is_file: bool = False): - if is_file: - with open(xml_source, 'r', encoding='utf-8') as f: - self.xml_text = f.read() - else: - self.xml_text = xml_source - + def __init__(self, xml_content: bytes): + """ + Initialize the parser. + + Args: + xml_content: XML content as bytes + """ + self.xml_text = xml_content.decode('utf-8') self.lines = self.xml_text.split('\n') self.elements: List[ElementInfo] = [] @@ -207,9 +208,14 @@ class LXMLPositionEnricher: print(element.position_attributes[0].start_line) """ - def __init__(self, xml_source: str, is_file: bool = False): - self.xml_source = xml_source - self.is_file = is_file + def __init__(self, xml_content: bytes): + """ + Initialize the enricher. + + Args: + xml_content: XML content as bytes + """ + self.xml_content = xml_content # Create custom parser with our PositionElement class parser = etree.XMLParser() @@ -217,14 +223,10 @@ def __init__(self, xml_source: str, is_file: bool = False): parser.set_element_class_lookup(lookup) # Parse with lxml using custom element class - if is_file: - self.tree = etree.parse(xml_source, parser) - self.root = self.tree.getroot() - else: - self.root = etree.fromstring(xml_source.encode('utf-8'), parser) + self.root = etree.fromstring(xml_content, parser) # Parse with position parser - self.position_parser = XMLPositionParser(xml_source, is_file) + self.position_parser = XMLPositionParser(xml_content) self.position_elements = self.position_parser.parse() # Enrich elements with position data @@ -338,7 +340,7 @@ def _attributes_match(self, lxml_elem, pos_elem: ElementInfo) -> bool: def demo(): """Demonstration of the enriched lxml parser.""" - xml_test = """ + xml_test = b""" diff --git a/tests/test_checks.py b/tests/test_checks.py index 6128241..21ebc59 100644 --- a/tests/test_checks.py +++ b/tests/test_checks.py @@ -38,7 +38,7 @@ "xml-deprecated-qweb-directive-15": 3, "xml-deprecated-qweb-directive": 2, "xml-deprecated-tree-attribute": 3, - "xml-double-quotes-py": 3, + "xml-double-quotes-py": 4, "xml-duplicate-fields": 3, "xml-duplicate-record-id": 2, "xml-not-valid-char-link": 2, From 5fdb9e1bd2dcd9b3df983a1c726389223227c116 Mon Sep 17 00:00:00 2001 From: "Moises Lopez - https://www.vauxoo.com/" Date: Mon, 26 Jan 2026 23:55:20 -0600 Subject: [PATCH 10/12] checkpoint-working --- .../checks_odoo_module_xml.py | 44 +- src/oca_pre_commit_hooks/node_xml.py | 293 +++++++----- .../xml_position_parser.py | 440 +++++++++--------- test_repo/test_module/website_templates.xml | 2 +- 4 files changed, 432 insertions(+), 347 deletions(-) diff --git a/src/oca_pre_commit_hooks/checks_odoo_module_xml.py b/src/oca_pre_commit_hooks/checks_odoo_module_xml.py index a064e29..57e3d90 100644 --- a/src/oca_pre_commit_hooks/checks_odoo_module_xml.py +++ b/src/oca_pre_commit_hooks/checks_odoo_module_xml.py @@ -8,7 +8,7 @@ from lxml import etree from packaging.version import Version -from oca_pre_commit_hooks import node_xml, xml_position_parser, utils +from oca_pre_commit_hooks import node_xml, utils, xml_position_parser from oca_pre_commit_hooks.base_checker import BaseChecker DFLT_DEPRECATED_TREE_ATTRS = ["colors", "fonts", "string"] @@ -130,7 +130,7 @@ def update_node(self, manifest_data): # node = xml_position_parser_obj. enricher = xml_position_parser.LXMLPositionEnricher(xml_content) node = enricher.root - # import pdb;pdb.set_trace() + # import pdb;pdb.set_trace() manifest_data.update({"node": node}) f_xml.seek(0) @@ -479,8 +479,8 @@ def visit_xml_record(self, manifest_data, record): def autofix_id_position_first(self, node, first_attr, manifest_data): attrs = dict(node.attrib) - - xml_node_content = manifest_data["xml_content"][node.start_index:node.end_index] + + xml_node_content = manifest_data["xml_content"][node.start_index : node.end_index] # node_content = node_xml.NodeContent(manifest_data["filename"], node) # Build regex pattern to match the tag with all its known attributes # sourceline is the last line of the last attribute, so we need to search backwards @@ -492,7 +492,9 @@ def autofix_id_position_first(self, node, first_attr, manifest_data): # Use the first attribute spaces since that id will be the new first attribute keys = [f"spaces_before_{first_attr}", "id"] if node.get("id") == "view_ir_config_search": - import pdb;pdb.set_trace() + import pdb + + pdb.set_trace() for attr_name, attr_value in attrs.items(): escaped_name = re.escape(attr_name) escaped_value = re.escape(attr_value) @@ -538,9 +540,11 @@ def autofix_id_position_first(self, node, first_attr, manifest_data): node.attrib.clear() new_attrs = {"id": id_value, **attrs} node.attrib.update(new_attrs) - before = manifest_data["xml_content"][0:node.start_index-1] - after = manifest_data["xml_content"][node.end_index:] - import pdb;pdb.set_trace() + before = manifest_data["xml_content"][0 : node.start_index - 1] + after = manifest_data["xml_content"][node.end_index :] + import pdb + + pdb.set_trace() utils.perform_fix(manifest_data["filename"], before + content_node2 + after) @utils.only_required_for_checks("xml-view-dangerous-replace-low-priority", "xml-deprecated-tree-attribute") @@ -759,6 +763,30 @@ def check_xml_double_quotes_py(self): if not (new_py_code := self.is_compatible_single_quote(attr_value)): continue node_content = node_xml.NodeContent(manifest_data["filename"], elem) + if "test_module/website_templates.xml" in manifest_data["filename"] and "t-options" == attr_name: + # print(elem.position_attributes) + print(f"Tag: {elem.tag}") + print(f"Attributes from lxml: {dict(elem.attrib)}") + print(f"position_attributes: {elem.position_attributes}") + print(f"start_line : {elem.start_line}") + with open(manifest_data["filename"], "rb") as mf: + xml_content = mf.read() + import pdb + from importlib import reload + + pdb.set_trace() + from oca_pre_commit_hooks import xml_position_parser + + reload(xml_position_parser) + xml_position_parser.LXMLPositionEnricher(xml_content) + # enricher = LXMLPositionEnricher(xml_content) + # root = enricher.root + + # Luego intenta acceder + # enricher = utils.LXMLPositionEnricher(xml_test) + # root = enricher.root + # if not node_content.content_node.strip(b" ").strip(b"/>\n"): + # import pdb;pdb.set_trace() if b""" not in node_content.content_node: continue self.register_error( diff --git a/src/oca_pre_commit_hooks/node_xml.py b/src/oca_pre_commit_hooks/node_xml.py index 62f8efa..e762bc2 100644 --- a/src/oca_pre_commit_hooks/node_xml.py +++ b/src/oca_pre_commit_hooks/node_xml.py @@ -619,24 +619,17 @@ def __repr__(self): ) - - - - - - - - - - import re from dataclasses import dataclass from typing import List, Optional + from lxml import etree + @dataclass class AttributeInfo: """Position information for an attribute.""" + name: str value: str start_line: int @@ -644,9 +637,11 @@ class AttributeInfo: end_line: int end_col: int + @dataclass class ElementInfo: """Position information for an XML element.""" + name: str start_line: int start_col: int @@ -655,39 +650,38 @@ class ElementInfo: attributes: List[AttributeInfo] is_self_closing: bool + class XMLPositionParser: """Parser that finds exact positions of elements and attributes.""" - - def __init__(self, xml_source: str, is_file: bool = False): - if is_file: - with open(xml_source, 'r', encoding='utf-8') as f: - self.xml_text = f.read() - else: - self.xml_text = xml_source - - self.lines = self.xml_text.split('\n') + + def __init__(self, xml_content: bytes): + """ + Initialize the parser. + + Args: + xml_content: XML content as bytes + """ + self.xml_text = xml_content.decode("utf-8") + self.lines = self.xml_text.split("\n") self.elements: List[ElementInfo] = [] - + def parse(self) -> List[ElementInfo]: """Parse XML and return position information for all elements.""" - tag_pattern = r'<([a-zA-Z_][\w:.-]*)((?:\s+[^>]*?)?)(/?)>' - + tag_pattern = r"<([a-zA-Z_][\w:.-]*)((?:\s+[^>]*?)?)(/?)>" + for match in re.finditer(tag_pattern, self.xml_text, re.MULTILINE | re.DOTALL): tag_name = match.group(1) attributes_str = match.group(2) - is_self_closing = match.group(3) == '/' - + is_self_closing = match.group(3) == "/" + start_pos = match.start() start_line, start_col = self._pos_to_line_col(start_pos) - + end_pos = match.end() end_line, end_col = self._pos_to_line_col(end_pos) - - attrs = self._parse_attributes( - attributes_str, - start_pos + len(tag_name) + 1 - ) - + + attrs = self._parse_attributes(attributes_str, start_pos + len(tag_name) + 1) + element = ElementInfo( name=tag_name, start_line=start_line, @@ -695,63 +689,63 @@ def parse(self) -> List[ElementInfo]: end_line=end_line, end_col=end_col, attributes=attrs, - is_self_closing=is_self_closing + is_self_closing=is_self_closing, ) - + self.elements.append(element) - + return self.elements - + def _pos_to_line_col(self, pos: int) -> tuple: """Convert an absolute position in text to (line, column).""" line = 1 col = 1 - + for i, char in enumerate(self.xml_text): if i >= pos: break - if char == '\n': + if char == "\n": line += 1 col = 1 else: col += 1 - + return line, col - + def _parse_attributes(self, attr_str: str, base_pos: int) -> List[AttributeInfo]: """Parse attributes from a string and return their positions.""" attributes = [] attr_pattern = r'([a-zA-Z_][\w:.-]*)\s*=\s*(["\'])((?:(?!\2).)*)\2' - + for match in re.finditer(attr_pattern, attr_str): attr_name = match.group(1) - quote = match.group(2) + match.group(2) attr_value = match.group(3) - + attr_start_pos = base_pos + match.start() attr_end_pos = base_pos + match.end() - + start_line, start_col = self._pos_to_line_col(attr_start_pos) end_line, end_col = self._pos_to_line_col(attr_end_pos) - + attr_info = AttributeInfo( name=attr_name, value=attr_value, start_line=start_line, start_col=start_col, end_line=end_line, - end_col=end_col + end_col=end_col, ) - + attributes.append(attr_info) - + return attributes class PositionElement(etree.ElementBase): """ Custom lxml Element class that includes position information. - + Additional attributes: - start_line: Line where element starts - start_col: Column where element starts @@ -760,46 +754,76 @@ class PositionElement(etree.ElementBase): - is_self_closing: Whether element is self-closing - position_attributes: List of AttributeInfo objects """ - + + # Use __slots__ to store position data directly in the instance + # This works better with lxml's internal structure + __slots__ = () + + def _set_position_data(self, start_line, start_col, end_line, end_col, is_self_closing, position_attributes): + """Internal method to set position data.""" + # Store in a special namespace to avoid conflicts with XML attributes + self.set("_pos_start_line", str(start_line)) + self.set("_pos_start_col", str(start_col)) + self.set("_pos_end_line", str(end_line)) + self.set("_pos_end_col", str(end_col)) + self.set("_pos_is_self_closing", str(is_self_closing)) + # Store position_attributes in a way that survives + # We'll use the element's __dict__ if available, or fall back to a global dict + try: + object.__setattr__(self, "_pos_attrs_data", position_attributes) + except (AttributeError, TypeError): + # Fallback: store in element's tail (not ideal but works) + pass + @property def start_line(self) -> Optional[int]: """Line where element starts.""" - return getattr(self, '_start_line', None) - + val = self.get("_pos_start_line") + return int(val) if val and val != "None" else None + @property def start_col(self) -> Optional[int]: """Column where element starts.""" - return getattr(self, '_start_col', None) - + val = self.get("_pos_start_col") + return int(val) if val and val != "None" else None + @property def end_line(self) -> Optional[int]: """Line where element ends.""" - return getattr(self, '_end_line', None) - + val = self.get("_pos_end_line") + return int(val) if val and val != "None" else None + @property def end_col(self) -> Optional[int]: """Column where element ends.""" - return getattr(self, '_end_col', None) - + val = self.get("_pos_end_col") + return int(val) if val and val != "None" else None + @property def is_self_closing(self) -> Optional[bool]: """Whether element is self-closing.""" - return getattr(self, '_is_self_closing', None) - + val = self.get("_pos_is_self_closing") + if val is None or val == "None": + return None + return val == "True" + @property def position_attributes(self) -> List[AttributeInfo]: """List of attributes with position information.""" - return getattr(self, '_position_attributes', []) + try: + return object.__getattribute__(self, "_pos_attrs_data") + except AttributeError: + return [] class LXMLPositionEnricher: """ Custom lxml parser that enriches elements with position information. - + Usage: enricher = LXMLPositionEnricher(xml_content) root = enricher.root - + # Now you can access position info: element = root.xpath("//record")[0] print(element.start_line) @@ -807,43 +831,50 @@ class LXMLPositionEnricher: print(element.is_self_closing) print(element.position_attributes[0].start_line) """ - - def __init__(self, xml_source: str, is_file: bool = False): - self.xml_source = xml_source - self.is_file = is_file - + + def __init__(self, xml_content: bytes): + """ + Initialize the enricher. + + Args: + xml_content: XML content as bytes + """ + self.xml_content = xml_content + # Create custom parser with our PositionElement class parser = etree.XMLParser() lookup = etree.ElementDefaultClassLookup(element=PositionElement) parser.set_element_class_lookup(lookup) - + # Parse with lxml using custom element class - if is_file: - self.tree = etree.parse(xml_source, parser) - self.root = self.tree.getroot() - else: - self.root = etree.fromstring(xml_source.encode('utf-8'), parser) - + self.root = etree.fromstring(xml_content, parser) + # Parse with position parser - self.position_parser = XMLPositionParser(xml_source, is_file) + self.position_parser = XMLPositionParser(xml_content) self.position_elements = self.position_parser.parse() - + # Enrich elements with position data self._enrich_elements() - + def _enrich_elements(self): """Match and enrich lxml elements with position information.""" # Collect all lxml elements in document order lxml_elements = [] self._traverse_lxml(self.root, lxml_elements) - + + # Debug: Print what we found + print(f"DEBUG: Found {len(lxml_elements)} lxml elements") + print(f"DEBUG: Found {len(self.position_elements)} position elements") + # Group position elements by tag name pos_by_tag = {} for pos_elem in self.position_elements: if pos_elem.name not in pos_by_tag: pos_by_tag[pos_elem.name] = [] pos_by_tag[pos_elem.name].append(pos_elem) - + + print(f"DEBUG: Position elements by tag: {list(pos_by_tag.keys())}") + # Group lxml elements by tag name lxml_by_tag = {} for lxml_elem in lxml_elements: @@ -852,72 +883,88 @@ def _enrich_elements(self): if tag not in lxml_by_tag: lxml_by_tag[tag] = [] lxml_by_tag[tag].append(lxml_elem) - + + print(f"DEBUG: lxml elements by tag: {list(lxml_by_tag.keys())}") + # Match and enrich elements with same tag name by order for tag_name in lxml_by_tag: if tag_name in pos_by_tag: lxml_list = lxml_by_tag[tag_name] pos_list = pos_by_tag[tag_name] - + + print(f"DEBUG: Matching {len(lxml_list)} lxml <{tag_name}> with {len(pos_list)} position <{tag_name}>") + for i in range(min(len(lxml_list), len(pos_list))): lxml_elem = lxml_list[i] pos_elem = pos_list[i] - + + # Debug attributes + lxml_attrs = dict(lxml_elem.attrib) + pos_attrs = {attr.name: attr.value for attr in pos_elem.attributes} + print(f" DEBUG: Comparing element {i}:") + print(f" lxml attrs: {lxml_attrs}") + print(f" pos attrs: {pos_attrs}") + # Verify attributes match if self._attributes_match(lxml_elem, pos_elem): - # Set position information as private attributes - lxml_elem._start_line = pos_elem.start_line - lxml_elem._start_col = pos_elem.start_col - lxml_elem._end_line = pos_elem.end_line - lxml_elem._end_col = pos_elem.end_col - lxml_elem._is_self_closing = pos_elem.is_self_closing - lxml_elem._position_attributes = pos_elem.attributes - + print(f" ✓ MATCH! Setting position data") + # Set position information using the custom method + lxml_elem._set_position_data( + start_line=pos_elem.start_line, + start_col=pos_elem.start_col, + end_line=pos_elem.end_line, + end_col=pos_elem.end_col, + is_self_closing=pos_elem.is_self_closing, + position_attributes=pos_elem.attributes, + ) + else: + print(f" ✗ NO MATCH") + def _traverse_lxml(self, element, elements_list): """Traverse lxml tree in document order.""" if isinstance(element.tag, str): elements_list.append(element) - + for child in element: self._traverse_lxml(child, elements_list) - + def _get_tag_name(self, element) -> Optional[str]: """Get tag name from lxml element, handling namespaces.""" tag = element.tag - + # Skip non-element nodes if not isinstance(tag, str): return None - - if '}' in tag: + + if "}" in tag: # Remove namespace: {http://example.com}tag -> tag - tag = tag.split('}')[1] + tag = tag.split("}")[1] return tag - + def _attributes_match(self, lxml_elem, pos_elem: ElementInfo) -> bool: """Check if attributes match between lxml element and position element.""" lxml_attrs = dict(lxml_elem.attrib) pos_attrs = {attr.name: attr.value for attr in pos_elem.attributes} - + # If both have no attributes, they match if not lxml_attrs and not pos_attrs: return True - + # If attribute counts differ, they don't match if len(lxml_attrs) != len(pos_attrs): return False - + # Check if all attributes match for key, value in lxml_attrs.items(): if key not in pos_attrs or pos_attrs[key] != value: return False - + return True def demo(): """Demonstration of the enriched lxml parser.""" - xml_test = """ + xml_test = b""" """ - + print("=== Enhanced lxml Parser Demo ===\n") print("Creating enriched lxml tree...") - + # Parse XML with position enrichment enricher = LXMLPositionEnricher(xml_test) root = enricher.root - + print(f"Root element: <{root.tag}>\n") - print("="*70) - + print("=" * 70) + # Example 1: Access position info directly print("\nExample 1: Direct property access") - print("-"*70) - + print("-" * 70) + templates = root.xpath("//template") for template in templates: print(f"\n<{template.tag} id='{template.get('id')}'>") @@ -960,18 +1007,18 @@ def demo(): print(f" end_line: {template.end_line}") print(f" end_col: {template.end_col}") print(f" is_self_closing: {template.is_self_closing}") - + if template.position_attributes: print(f" Attributes with positions:") for attr in template.position_attributes: print(f" • {attr.name}='{attr.value}'") print(f" ({attr.start_line},{attr.start_col}) → ({attr.end_line},{attr.end_col})") - + # Example 2: Find specific element and check position - print("\n" + "="*70) + print("\n" + "=" * 70) print("\nExample 2: Find element with t-esc attribute") - print("-"*70) - + print("-" * 70) + t_elements = root.xpath("//t[@t-esc]") if t_elements: t_elem = t_elements[0] @@ -981,23 +1028,23 @@ def demo(): print(f" Attributes:") for attr in t_elem.position_attributes: print(f" {attr.name}='{attr.value}' @ line {attr.start_line}, col {attr.start_col}") - + # Example 3: Iterate all span elements - print("\n" + "="*70) + print("\n" + "=" * 70) print("\nExample 3: All elements") - print("-"*70) - + print("-" * 70) + for span in root.xpath("//span"): attrs = ", ".join([f"{k}='{v}'" for k, v in span.attrib.items()]) print(f"\n") print(f" Lines: {span.start_line} → {span.end_line}") print(f" Columns: {span.start_col} → {span.end_col}") - + # Example 4: Demonstrate it's still a regular lxml element - print("\n" + "="*70) + print("\n" + "=" * 70) print("\nExample 4: Still works as regular lxml element") - print("-"*70) - + print("-" * 70) + print(f"\nCan use all lxml methods:") print(f" root.tag: {root.tag}") print(f" root.getchildren() count: {len(root.getchildren())}") @@ -1007,4 +1054,4 @@ def demo(): if __name__ == "__main__": - demo() \ No newline at end of file + demo() diff --git a/src/oca_pre_commit_hooks/xml_position_parser.py b/src/oca_pre_commit_hooks/xml_position_parser.py index ff7f598..b743649 100644 --- a/src/oca_pre_commit_hooks/xml_position_parser.py +++ b/src/oca_pre_commit_hooks/xml_position_parser.py @@ -1,115 +1,153 @@ +import html import re from dataclasses import dataclass from typing import List, Optional + from lxml import etree + @dataclass class AttributeInfo: """Position information for an attribute.""" + name: str value: str start_line: int start_col: int end_line: int end_col: int - start_index: int # Nuevo: índice de inicio en la cadena - end_index: int # Nuevo: índice de fin en la cadena + @dataclass class ElementInfo: """Position information for an XML element.""" + name: str start_line: int start_col: int end_line: int end_col: int - start_index: int # Nuevo: índice de inicio en la cadena - end_index: int # Nuevo: índice de fin en la cadena attributes: List[AttributeInfo] is_self_closing: bool + class XMLPositionParser: """Parser that finds exact positions of elements and attributes.""" - + def __init__(self, xml_content: bytes): """ Initialize the parser. - + Args: xml_content: XML content as bytes """ - self.xml_text = xml_content.decode('utf-8') - self.lines = self.xml_text.split('\n') + self.xml_text = xml_content.decode("utf-8") + self.lines = self.xml_text.split("\n") self.elements: List[ElementInfo] = [] - + def parse(self) -> List[ElementInfo]: """Parse XML and return position information for all elements.""" - tag_pattern = r'<([a-zA-Z_][\w:.-]*)((?:\s+[^>]*?)?)(/?)>' - + # Robust pattern for multi-line tags + # Matches: or + # Uses greedy matching to capture everything until the closing > + tag_pattern = r"<([a-zA-Z_][\w:.-]*)(.*?)(/?)>" + for match in re.finditer(tag_pattern, self.xml_text, re.MULTILINE | re.DOTALL): tag_name = match.group(1) attributes_str = match.group(2) - is_self_closing = match.group(3) == '/' - + is_self_closing = match.group(3) == "/" + + # Skip if this looks like a closing tag + if attributes_str.strip().startswith("/"): + continue + start_pos = match.start() start_line, start_col = self._pos_to_line_col(start_pos) - + end_pos = match.end() end_line, end_col = self._pos_to_line_col(end_pos) - - attrs = self._parse_attributes( - attributes_str, - start_pos + len(tag_name) + 1 - ) - + + attrs = self._parse_attributes(attributes_str, start_pos + len(tag_name) + 1) + element = ElementInfo( name=tag_name, start_line=start_line, start_col=start_col, end_line=end_line, end_col=end_col, - start_index=start_pos, # Guardamos el índice absoluto - end_index=end_pos, # Guardamos el índice absoluto attributes=attrs, - is_self_closing=is_self_closing + is_self_closing=is_self_closing, ) - + self.elements.append(element) - + return self.elements - + def _pos_to_line_col(self, pos: int) -> tuple: """Convert an absolute position in text to (line, column).""" line = 1 col = 1 - + for i, char in enumerate(self.xml_text): if i >= pos: break - if char == '\n': + if char == "\n": line += 1 col = 1 else: col += 1 - + return line, col - + def _parse_attributes(self, attr_str: str, base_pos: int) -> List[AttributeInfo]: """Parse attributes from a string and return their positions.""" attributes = [] - attr_pattern = r'([a-zA-Z_][\w:.-]*)\s*=\s*(["\'])((?:(?!\2).)*)\2' - - for match in re.finditer(attr_pattern, attr_str): + + # Pattern that properly handles escaped quotes inside attribute values + # Matches double quotes: name="value with " inside" + attr_pattern_double = r'([a-zA-Z_][\w:.-]*)\s*=\s*"([^"]*(?:"[^"]*)*)"' + # Matches single quotes: name='value with ' inside' + attr_pattern_single = r"([a-zA-Z_][\w:.-]*)\s*=\s*'([^']*(?:'[^']*)*)'" + + # Try double quotes first + for match in re.finditer(attr_pattern_double, attr_str, re.DOTALL): + attr_name = match.group(1) + attr_value_raw = match.group(2) + + # Decode HTML entities to match what lxml does (" -> ", & -> &, etc.) + attr_value = html.unescape(attr_value_raw) + + attr_start_pos = base_pos + match.start() + attr_end_pos = base_pos + match.end() + + start_line, start_col = self._pos_to_line_col(attr_start_pos) + end_line, end_col = self._pos_to_line_col(attr_end_pos) + + attr_info = AttributeInfo( + name=attr_name, + value=attr_value, + start_line=start_line, + start_col=start_col, + end_line=end_line, + end_col=end_col, + ) + + attributes.append(attr_info) + + # Try single quotes + for match in re.finditer(attr_pattern_single, attr_str, re.DOTALL): attr_name = match.group(1) - quote = match.group(2) - attr_value = match.group(3) - + attr_value_raw = match.group(2) + + # Decode HTML entities + attr_value = html.unescape(attr_value_raw) + attr_start_pos = base_pos + match.start() attr_end_pos = base_pos + match.end() - + start_line, start_col = self._pos_to_line_col(attr_start_pos) end_line, end_col = self._pos_to_line_col(attr_end_pos) - + attr_info = AttributeInfo( name=attr_name, value=attr_value, @@ -117,159 +155,181 @@ def _parse_attributes(self, attr_str: str, base_pos: int) -> List[AttributeInfo] start_col=start_col, end_line=end_line, end_col=end_col, - start_index=attr_start_pos, # Guardamos el índice absoluto - end_index=attr_end_pos # Guardamos el índice absoluto ) - + attributes.append(attr_info) - + return attributes class PositionElement(etree.ElementBase): """ Custom lxml Element class that includes position information. - - Additional attributes: + + Additional properties: - start_line: Line where element starts - start_col: Column where element starts - end_line: Line where element ends - end_col: Column where element ends - - start_index: Character index where element starts - - end_index: Character index where element ends - is_self_closing: Whether element is self-closing - position_attributes: List of AttributeInfo objects """ - + __slots__ = () - - def _set_position_data(self, start_line, start_col, end_line, end_col, - start_index, end_index, is_self_closing, position_attributes): + + def _set_position_data(self, start_line, start_col, end_line, end_col, is_self_closing, position_attributes): """Internal method to set position data.""" - self.set('_pos_start_line', str(start_line)) - self.set('_pos_start_col', str(start_col)) - self.set('_pos_end_line', str(end_line)) - self.set('_pos_end_col', str(end_col)) - self.set('_pos_start_index', str(start_index)) # Nuevo - self.set('_pos_end_index', str(end_index)) # Nuevo - self.set('_pos_is_self_closing', str(is_self_closing)) + self.set("_pos_start_line", str(start_line)) + self.set("_pos_start_col", str(start_col)) + self.set("_pos_end_line", str(end_line)) + self.set("_pos_end_col", str(end_col)) + self.set("_pos_is_self_closing", str(is_self_closing)) + + # Try to store position_attributes directly try: - object.__setattr__(self, '_pos_attrs_data', position_attributes) + object.__setattr__(self, "_pos_attrs_data", position_attributes) except (AttributeError, TypeError): pass - + + # Also serialize as JSON for reliable storage + import json + + attrs_json = json.dumps( + [ + { + "name": attr.name, + "value": attr.value, + "start_line": attr.start_line, + "start_col": attr.start_col, + "end_line": attr.end_line, + "end_col": attr.end_col, + } + for attr in position_attributes + ] + ) + self.set("_pos_attrs_json", attrs_json) + @property def start_line(self) -> Optional[int]: """Line where element starts.""" - val = self.get('_pos_start_line') - return int(val) if val and val != 'None' else None - + val = self.get("_pos_start_line") + return int(val) if val and val != "None" else None + @property def start_col(self) -> Optional[int]: """Column where element starts.""" - val = self.get('_pos_start_col') - return int(val) if val and val != 'None' else None - + val = self.get("_pos_start_col") + return int(val) if val and val != "None" else None + @property def end_line(self) -> Optional[int]: """Line where element ends.""" - val = self.get('_pos_end_line') - return int(val) if val and val != 'None' else None - + val = self.get("_pos_end_line") + return int(val) if val and val != "None" else None + @property def end_col(self) -> Optional[int]: """Column where element ends.""" - val = self.get('_pos_end_col') - return int(val) if val and val != 'None' else None - - @property - def start_index(self) -> Optional[int]: - """Character index where element starts.""" - val = self.get('_pos_start_index') - return int(val) if val and val != 'None' else None - - @property - def end_index(self) -> Optional[int]: - """Character index where element ends.""" - val = self.get('_pos_end_index') - return int(val) if val and val != 'None' else None - + val = self.get("_pos_end_col") + return int(val) if val and val != "None" else None + @property def is_self_closing(self) -> Optional[bool]: """Whether element is self-closing.""" - val = self.get('_pos_is_self_closing') - if val is None or val == 'None': + val = self.get("_pos_is_self_closing") + if val is None or val == "None": return None - return val == 'True' - + return val == "True" + @property def position_attributes(self) -> List[AttributeInfo]: """List of attributes with position information.""" + # Try to get from object attribute first try: - return object.__getattribute__(self, '_pos_attrs_data') + attrs = object.__getattribute__(self, "_pos_attrs_data") + if attrs: + return attrs except AttributeError: - return [] + pass + + # Fall back to JSON deserialization + import json + + attrs_json = self.get("_pos_attrs_json") + if attrs_json: + try: + attrs_data = json.loads(attrs_json) + return [ + AttributeInfo( + name=a["name"], + value=a["value"], + start_line=a["start_line"], + start_col=a["start_col"], + end_line=a["end_line"], + end_col=a["end_col"], + ) + for a in attrs_data + ] + except (json.JSONDecodeError, KeyError): + pass + + return [] class LXMLPositionEnricher: """ Custom lxml parser that enriches elements with position information. - + Usage: enricher = LXMLPositionEnricher(xml_content) root = enricher.root - - # Now you can access position info: + + # Access position info: element = root.xpath("//record")[0] print(element.start_line) print(element.start_col) - print(element.start_index) - - # Extract the exact text: - xml_text = xml_content.decode('utf-8') - element_text = xml_text[element.start_index:element.end_index] + print(element.is_self_closing) + print(element.position_attributes[0].start_line) """ - + def __init__(self, xml_content: bytes): """ Initialize the enricher. - + Args: xml_content: XML content as bytes """ self.xml_content = xml_content - - # Create custom parser with our PositionElement class + + # Create custom parser with PositionElement class parser = etree.XMLParser() lookup = etree.ElementDefaultClassLookup(element=PositionElement) parser.set_element_class_lookup(lookup) - - # Parse with lxml using custom element class + + # Parse with lxml self.root = etree.fromstring(xml_content, parser) - + # Parse with position parser self.position_parser = XMLPositionParser(xml_content) self.position_elements = self.position_parser.parse() - + # Enrich elements with position data self._enrich_elements() - + def _enrich_elements(self): """Match and enrich lxml elements with position information.""" + # Collect all lxml elements in document order lxml_elements = [] self._traverse_lxml(self.root, lxml_elements) - - print(f"DEBUG: Found {len(lxml_elements)} lxml elements") - print(f"DEBUG: Found {len(self.position_elements)} position elements") - + + # Group position elements by tag name pos_by_tag = {} for pos_elem in self.position_elements: if pos_elem.name not in pos_by_tag: pos_by_tag[pos_elem.name] = [] pos_by_tag[pos_elem.name].append(pos_elem) - - print(f"DEBUG: Position elements by tag: {list(pos_by_tag.keys())}") - + + # Group lxml elements by tag name lxml_by_tag = {} for lxml_elem in lxml_elements: tag = self._get_tag_name(lxml_elem) @@ -277,156 +337,106 @@ def _enrich_elements(self): if tag not in lxml_by_tag: lxml_by_tag[tag] = [] lxml_by_tag[tag].append(lxml_elem) - - print(f"DEBUG: lxml elements by tag: {list(lxml_by_tag.keys())}") - + + # Match and enrich elements with same tag name by order for tag_name in lxml_by_tag: if tag_name in pos_by_tag: lxml_list = lxml_by_tag[tag_name] pos_list = pos_by_tag[tag_name] - - print(f"DEBUG: Matching {len(lxml_list)} lxml <{tag_name}> with {len(pos_list)} position <{tag_name}>") - + for i in range(min(len(lxml_list), len(pos_list))): lxml_elem = lxml_list[i] pos_elem = pos_list[i] - - lxml_attrs = dict(lxml_elem.attrib) - pos_attrs = {attr.name: attr.value for attr in pos_elem.attributes} - print(f" DEBUG: Comparing element {i}:") - print(f" lxml attrs: {lxml_attrs}") - print(f" pos attrs: {pos_attrs}") - + if self._attributes_match(lxml_elem, pos_elem): - print(f" ✓ MATCH! Setting position data") lxml_elem._set_position_data( start_line=pos_elem.start_line, start_col=pos_elem.start_col, end_line=pos_elem.end_line, end_col=pos_elem.end_col, - start_index=pos_elem.start_index, # Nuevo - end_index=pos_elem.end_index, # Nuevo is_self_closing=pos_elem.is_self_closing, - position_attributes=pos_elem.attributes + position_attributes=pos_elem.attributes, ) - else: - print(f" ✗ NO MATCH") - + def _traverse_lxml(self, element, elements_list): """Traverse lxml tree in document order.""" if isinstance(element.tag, str): elements_list.append(element) - + for child in element: self._traverse_lxml(child, elements_list) - + def _get_tag_name(self, element) -> Optional[str]: """Get tag name from lxml element, handling namespaces.""" tag = element.tag - + if not isinstance(tag, str): return None - - if '}' in tag: - tag = tag.split('}')[1] + + if "}" in tag: + tag = tag.split("}")[1] return tag - + def _attributes_match(self, lxml_elem, pos_elem: ElementInfo) -> bool: """Check if attributes match between lxml element and position element.""" lxml_attrs = dict(lxml_elem.attrib) pos_attrs = {attr.name: attr.value for attr in pos_elem.attributes} - + if not lxml_attrs and not pos_attrs: return True - + if len(lxml_attrs) != len(pos_attrs): return False - + for key, value in lxml_attrs.items(): if key not in pos_attrs or pos_attrs[key] != value: return False - + return True def demo(): """Demonstration of the enriched lxml parser.""" - xml_test = b""" + xml_test = b""" - - - - - diff --git a/tests/test_checks.py b/tests/test_checks.py index 21ebc59..cca714c 100644 --- a/tests/test_checks.py +++ b/tests/test_checks.py @@ -35,7 +35,7 @@ "xml-dangerous-qweb-replace-low-priority": 9, "xml-deprecated-data-node": 8, "xml-deprecated-openerp-node": 4, - "xml-deprecated-qweb-directive-15": 3, + "xml-deprecated-qweb-directive-15": 4, "xml-deprecated-qweb-directive": 2, "xml-deprecated-tree-attribute": 3, "xml-double-quotes-py": 4,