diff --git a/README.md b/README.md index 7f3bb8d..ffbe9dd 100644 --- a/README.md +++ b/README.md @@ -462,7 +462,7 @@ options: - https://github.com/OCA/odoo-pre-commit-hooks/blob/v0.2.20/test_repo/odoo18_module/views/deprecated_qweb_directives15.xml#L6 Deprecated QWeb directive `t-esc`. Use `t-out` instead - https://github.com/OCA/odoo-pre-commit-hooks/blob/v0.2.20/test_repo/odoo18_module/views/deprecated_qweb_directives15.xml#L7 Deprecated QWeb directive `t-raw`. Use `t-out` instead - - https://github.com/OCA/odoo-pre-commit-hooks/blob/v0.2.20/test_repo/odoo18_module/views/deprecated_qweb_directives15.xml#L13 Deprecated QWeb directive `t-esc`. Use `t-out` instead + - https://github.com/OCA/odoo-pre-commit-hooks/blob/v0.2.20/test_repo/odoo18_module/views/deprecated_qweb_directives15.xml#L15 Deprecated QWeb directive `t-esc`. Use `t-out` instead * xml-deprecated-tree-attribute diff --git a/src/oca_pre_commit_hooks/checks_odoo_module_xml.py b/src/oca_pre_commit_hooks/checks_odoo_module_xml.py index 4cbf2b9..c0229d1 100644 --- a/src/oca_pre_commit_hooks/checks_odoo_module_xml.py +++ b/src/oca_pre_commit_hooks/checks_odoo_module_xml.py @@ -8,7 +8,7 @@ from lxml import etree from packaging.version import Version -from oca_pre_commit_hooks import node_xml, utils +from oca_pre_commit_hooks import node_xml, utils, xml_position_parser from oca_pre_commit_hooks.base_checker import BaseChecker DFLT_DEPRECATED_TREE_ATTRS = ["colors", "fonts", "string"] @@ -124,7 +124,14 @@ def update_node(self, manifest_data): Useful when the file is modified by autofix and a new line is inserted/removed. It will update the sourceline of the nodes""" with open(manifest_data["filename"], "rb") as f_xml: - node = etree.parse(f_xml) + xml_content = f_xml.read() + # node = etree.parse(f_xml) + # xml_position_parser_obj = xml_position_parser.XMLPositionParser(content) + # node = xml_position_parser_obj. 
+ enricher = xml_position_parser.LXMLPositionEnricher(xml_content) + node = enricher.root + # import pdb;pdb.set_trace() + manifest_data.update({"node": node}) f_xml.seek(0) first_tag, first_tag_lineno = self._get_first_tag(f_xml) @@ -133,6 +140,7 @@ def update_node(self, manifest_data): "node": node, "first_tag": first_tag, "first_tag_lineno": first_tag_lineno, + "xml_content": xml_content, } ) return node @@ -471,7 +479,9 @@ def visit_xml_record(self, manifest_data, record): def autofix_id_position_first(self, node, first_attr, manifest_data): attrs = dict(node.attrib) - node_content = node_xml.NodeContent(manifest_data["filename"], node) + + xml_node_content = manifest_data["xml_content"][node.start_index : node.end_index] + # node_content = node_xml.NodeContent(manifest_data["filename"], node) # Build regex pattern to match the tag with all its known attributes # sourceline is the last line of the last attribute, so we need to search backwards tag_name = re.escape(node.tag) @@ -481,6 +491,10 @@ def autofix_id_position_first(self, node, first_attr, manifest_data): attr_patterns = [] # Use the first attribute spaces since that id will be the new first attribute keys = [f"spaces_before_{first_attr}", "id"] + # if node.get("id") == "view_ir_config_search": + # import pdb + + # pdb.set_trace() for attr_name, attr_value in attrs.items(): escaped_name = re.escape(attr_name) escaped_value = re.escape(attr_value) @@ -513,21 +527,25 @@ def autofix_id_position_first(self, node, first_attr, manifest_data): ) # Search with multiline and dotall flags - match = re.search(pattern, node_content.content_node.decode(), re.DOTALL | re.MULTILINE) + match = re.search(pattern, xml_node_content.decode(), re.DOTALL | re.MULTILINE) if match: keys = [f"open_{node.tag}"] + keys + [f"close_{node.tag}"] match_dict = match.groupdict() recreate = "".join(match_dict[k] for k in keys) original = match.group() - content_node2 = node_content.content_node.replace(original.encode(), 
recreate.encode(), 1) - if content_node2 != node_content.content_node: + content_node2 = xml_node_content.replace(original.encode(), recreate.encode(), 1) + if content_node2 != xml_node_content: # Modify the record attrib to propagate the change to other checks id_value = attrs.pop("id") node.attrib.clear() new_attrs = {"id": id_value, **attrs} node.attrib.update(new_attrs) - node_content.content_node = content_node2 - utils.perform_fix(manifest_data["filename"], bytes(node_content)) + before = manifest_data["xml_content"][0 : node.start_index - 1] + after = manifest_data["xml_content"][node.end_index :] + import pdb + + pdb.set_trace() + utils.perform_fix(manifest_data["filename"], before + content_node2 + after) @utils.only_required_for_checks("xml-view-dangerous-replace-low-priority", "xml-deprecated-tree-attribute") def visit_xml_record_view(self, manifest_data, record): @@ -745,6 +763,30 @@ def check_xml_double_quotes_py(self): if not (new_py_code := self.is_compatible_single_quote(attr_value)): continue node_content = node_xml.NodeContent(manifest_data["filename"], elem) + if "test_module/website_templates.xml" in manifest_data["filename"] and "t-options" == attr_name: + # print(elem.position_attributes) + print(f"Tag: {elem.tag}") + print(f"Attributes from lxml: {dict(elem.attrib)}") + print(f"position_attributes: {elem.position_attributes}") + print(f"start_line : {elem.start_line}") + with open(manifest_data["filename"], "rb") as mf: + xml_content = mf.read() + import pdb + from importlib import reload + + pdb.set_trace() + from oca_pre_commit_hooks import xml_position_parser + + reload(xml_position_parser) + xml_position_parser.LXMLPositionEnricher(xml_content) + # enricher = LXMLPositionEnricher(xml_content) + # root = enricher.root + + # Luego intenta acceder + # enricher = utils.LXMLPositionEnricher(xml_test) + # root = enricher.root + # if not node_content.content_node.strip(b" ").strip(b"/>\n"): + # import pdb;pdb.set_trace() if b""" not in 
node_content.content_node: continue self.register_error( diff --git a/src/oca_pre_commit_hooks/node_xml.py b/src/oca_pre_commit_hooks/node_xml.py index a1726cf..b6ca6c2 100644 --- a/src/oca_pre_commit_hooks/node_xml.py +++ b/src/oca_pre_commit_hooks/node_xml.py @@ -1,3 +1,322 @@ +import re +from dataclasses import dataclass +from typing import List, Optional + +from lxml import etree + + +@dataclass +class AttributeInfo: + """Position information for an attribute.""" + + name: str + value: str + start_line: int + start_col: int + end_line: int + end_col: int + + +@dataclass +class ElementInfo: + """Position information for an XML element.""" + + name: str + start_line: int + start_col: int + end_line: int + end_col: int + attributes: List[AttributeInfo] + is_self_closing: bool + + +class XMLPositionParser: + """Parser that finds exact positions of elements and attributes.""" + + def __init__(self, xml_source: str): + self.xml_text = xml_source + + self.lines = self.xml_text.split("\n") + self.elements: List[ElementInfo] = [] + + def parse(self) -> List[ElementInfo]: + """Parse XML and return position information for all elements.""" + tag_pattern = r"<([a-zA-Z_][\w:.-]*)((?:\s+[^>]*?)?)(/?)>" + + for match in re.finditer(tag_pattern, self.xml_text, re.MULTILINE | re.DOTALL): + tag_name = match.group(1) + attributes_str = match.group(2) + is_self_closing = match.group(3) == "/" + + start_pos = match.start() + start_line, start_col = self._pos_to_line_col(start_pos) + + end_pos = match.end() + end_line, end_col = self._pos_to_line_col(end_pos) + + attrs = self._parse_attributes(attributes_str, start_pos + len(tag_name) + 1) + + element = ElementInfo( + name=tag_name, + start_line=start_line, + start_col=start_col, + end_line=end_line, + end_col=end_col, + attributes=attrs, + is_self_closing=is_self_closing, + ) + + self.elements.append(element) + + return self.elements + + def _pos_to_line_col(self, pos: int) -> tuple: + """Convert an absolute position in text to 
(line, column).""" + line = 1 + col = 1 + + for i, char in enumerate(self.xml_text): + if i >= pos: + break + if char == "\n": + line += 1 + col = 1 + else: + col += 1 + + return line, col + + def _parse_attributes(self, attr_str: str, base_pos: int) -> List[AttributeInfo]: + """Parse attributes from a string and return their positions.""" + attributes = [] + attr_pattern = r'([a-zA-Z_][\w:.-]*)\s*=\s*(["\'])((?:(?!\2).)*)\2' + + for match in re.finditer(attr_pattern, attr_str): + attr_name = match.group(1) + match.group(2) + attr_value = match.group(3) + + attr_start_pos = base_pos + match.start() + attr_end_pos = base_pos + match.end() + + start_line, start_col = self._pos_to_line_col(attr_start_pos) + end_line, end_col = self._pos_to_line_col(attr_end_pos) + + attr_info = AttributeInfo( + name=attr_name, + value=attr_value, + start_line=start_line, + start_col=start_col, + end_line=end_line, + end_col=end_col, + ) + + attributes.append(attr_info) + + return attributes + + +class LXMLPositionEnricher: + """Enriches lxml nodes with precise position information from XMLPositionParser.""" + + def __init__(self, xml_source: str): + self.xml_source = xml_source + + # Parse with lxml + self.root = etree.fromstring(xml_source.encode("utf-8")) + + # Parse with position parser + self.position_parser = XMLPositionParser(xml_source) + self.position_elements = self.position_parser.parse() + + # Create index for matching + self._create_matching_index() + + def _create_matching_index(self): + """Create an index to match lxml elements with position elements. 
+ Uses tag name + document order as key.""" + # Count elements by tag name as we traverse lxml tree + self.lxml_elements = [] + self._traverse_lxml(self.root) + + # Match lxml elements with position elements by order and tag name + self.position_map = {} + + # Group position elements by tag name + pos_by_tag = {} + for pos_elem in self.position_elements: + if pos_elem.name not in pos_by_tag: + pos_by_tag[pos_elem.name] = [] + pos_by_tag[pos_elem.name].append(pos_elem) + + # Group lxml elements by tag name + lxml_by_tag = {} + for lxml_elem in self.lxml_elements: + tag = self._get_tag_name(lxml_elem) + if tag: # Skip None values (comments, etc.) + if tag not in lxml_by_tag: + lxml_by_tag[tag] = [] + lxml_by_tag[tag].append(lxml_elem) + + # Match elements with same tag name by order + for tag_name in lxml_by_tag: + if tag_name in pos_by_tag: + lxml_list = lxml_by_tag[tag_name] + pos_list = pos_by_tag[tag_name] + + # Match by order (assumes same document structure) + for i in range(min(len(lxml_list), len(pos_list))): + lxml_elem = lxml_list[i] + pos_elem = pos_list[i] + + # Verify attributes match for extra safety + if self._attributes_match(lxml_elem, pos_elem): + self.position_map[id(lxml_elem)] = pos_elem + + def _traverse_lxml(self, element): + """Traverse lxml tree in document order.""" + # Only add actual elements (not comments, processing instructions, etc.) + if isinstance(element.tag, str): + self.lxml_elements.append(element) + + for child in element: + self._traverse_lxml(child) + + def _get_tag_name(self, element) -> str: + """Get tag name from lxml element, handling namespaces.""" + tag = element.tag + + # Skip non-element nodes (comments, processing instructions, etc.) 
+ if not isinstance(tag, str): + return None + + if "}" in tag: + # Remove namespace: {http://example.com}tag -> tag + tag = tag.split("}")[1] + return tag + + def _attributes_match(self, lxml_elem, pos_elem: ElementInfo) -> bool: + """Check if attributes match between lxml element and position element. + Returns True if they match or if comparison is inconclusive. + """ + lxml_attrs = dict(lxml_elem.attrib) + pos_attrs = {attr.name: attr.value for attr in pos_elem.attributes} + + # If both have no attributes, they match + if not lxml_attrs and not pos_attrs: + return True + + # If attribute counts differ, they don't match + if len(lxml_attrs) != len(pos_attrs): + return False + + # Check if all attributes match + for key, value in lxml_attrs.items(): + if key not in pos_attrs or pos_attrs[key] != value: + return False + + return True + + def get_position_info(self, lxml_element) -> Optional[ElementInfo]: + """Get position information for an lxml element. + + Args: + lxml_element: An lxml Element object + + Returns: + ElementInfo with position data, or None if not found + """ + return self.position_map.get(id(lxml_element)) + + def enrich_element(self, lxml_element): + """Add position information as attributes to an lxml element. 
+ Adds: _start_line, _start_col, _end_line, _end_col + """ + pos_info = self.get_position_info(lxml_element) + if pos_info: + lxml_element.set("_start_line", str(pos_info.start_line)) + lxml_element.set("_start_col", str(pos_info.start_col)) + lxml_element.set("_end_line", str(pos_info.end_line)) + lxml_element.set("_end_col", str(pos_info.end_col)) + + def enrich_all(self): + """Add position information to all elements in the tree.""" + for elem in self.lxml_elements: + self.enrich_element(elem) + + +def demo(): + """Demonstration of the enricher.""" + xml_test = """ + + + + +""" + + print("=== lxml Position Enricher Demo ===\n") + + # Create enricher + enricher = LXMLPositionEnricher(xml_test) + + print("Matching lxml elements with position data...\n") + print("=" * 70) + + # Iterate through lxml tree and show position info + for elem in enricher.lxml_elements: + tag = enricher._get_tag_name(elem) + pos_info = enricher.get_position_info(elem) + + if pos_info: + attrs = ", ".join([f"{k}='{v}'" for k, v in elem.attrib.items()]) + attrs_str = f" [{attrs}]" if attrs else "" + + print(f"\n<{tag}>{attrs_str}") + print(f" lxml sourceline: {elem.sourceline}") + print(f" Precise position:") + print(f" Start: line {pos_info.start_line}, col {pos_info.start_col}") + print(f" End: line {pos_info.end_line}, col {pos_info.end_col}") + + if pos_info.attributes: + print(f" Attributes with positions:") + for attr in pos_info.attributes: + print( + f" • {attr.name}: ({attr.start_line},{attr.start_col}) → ({attr.end_line},{attr.end_col})" + ) + + print("\n" + "=" * 70) + print("\nExample: Enrich elements with position attributes") + print("=" * 70) + + enricher.enrich_all() + + # Show enriched XML snippet + for elem in list(enricher.lxml_elements)[:3]: + tag = enricher._get_tag_name(elem) + print(f"\n<{tag}>") + for key, value in elem.attrib.items(): + if key.startswith("_"): + print(f" {key}: {value}") + + +if __name__ == "__main__": + demo() + + class NodeContent: """Represents 
the content and metadata of an XML node.""" @@ -24,12 +343,12 @@ def __init__(self, filename, node): def _read_node(self): # noqa:C901 pylint:disable=too-complex """Internal method to read the content of the file and extract node information.""" - # TODO: Get the sourceline of a particular attribute # Determine the search start line if (node_previous := self.node.getprevious()) is not None: search_start_line = node_previous.sourceline + 1 elif (node_parent := self.node.getparent()) is not None: - search_start_line = node_parent.sourceline + 1 + # Start from parent line (not +1) because child tags can be on same line as parent + search_start_line = node_parent.sourceline else: search_start_line = 2 # first element and it is the root @@ -42,13 +361,92 @@ def _read_node(self): # noqa:C901 pylint:disable=too-complex # Find the actual node start by looking for the tag node_start_idx = None + node_start_col = 0 # Column position within the line + + # When there are multiple tags on the same line, we need to find the RIGHT one + # We'll collect all candidates and use additional heuristics + candidates = [] + for idx, (no_line, line) in enumerate(all_lines): if search_start_line <= no_line <= search_end_line: - stripped_line = line.lstrip() - if stripped_line.startswith(b"<" + node_tag): - node_start_idx = idx - self.start_sourceline = no_line - break + # Look for the tag anywhere in the line, not just at the start + tag_pattern = b"<" + node_tag + tag_pos = 0 + + # Find ALL occurrences of the tag in this line + while True: + tag_pos = line.find(tag_pattern, tag_pos) + if tag_pos == -1: + break + + # Verify it's actually a tag start (followed by space, >, or /) + check_pos = tag_pos + len(tag_pattern) + is_valid = False + + if check_pos >= len(line): + # Tag at end of line is valid + is_valid = True + else: + next_char = line[check_pos : check_pos + 1] + # Valid tag if followed by space, >, /, or newline + if next_char in (b" ", b">", b"/", b"\n", b"\r"): + is_valid = True 
+ + if is_valid: + candidates.append({"idx": idx, "line_no": no_line, "col": tag_pos, "line": line}) + + tag_pos += 1 + + # Choose the best candidate + if candidates: + # If we only have one candidate, use it + if len(candidates) == 1: + best = candidates[0] + node_start_idx = best["idx"] + node_start_col = best["col"] + self.start_sourceline = best["line_no"] + else: + # Multiple candidates - need to find the right one + # Strategy: use node attributes to identify the correct tag + best = None + + # Get the first attribute of the node if it exists + node_attrib_key = None + if self.node.attrib: + node_attrib_key = list(self.node.attrib.keys())[0] + + # Try to match by attribute + if node_attrib_key: + attr_pattern = f"{node_attrib_key}=".encode() + for candidate in candidates: + # Look in the line and following lines for this attribute + idx = candidate["idx"] + # Check current line from the tag position + search_text = candidate["line"][candidate["col"] :] + # Also check next few lines (in case attributes span lines) + for i in range(idx, min(idx + 5, len(all_lines))): + if i > idx: + search_text += all_lines[i][1] + + if attr_pattern in search_text: + best = candidate + break + + # If we didn't find by attribute, use line number heuristic + if best is None: + # Prefer candidates on the target line (node.sourceline) + candidates_on_target_line = [c for c in candidates if c["line_no"] == search_end_line] + + if candidates_on_target_line: + # If multiple on target line, prefer the last one + best = candidates_on_target_line[-1] + else: + # Use the last candidate overall + best = candidates[-1] + + node_start_idx = best["idx"] + node_start_col = best["col"] + self.start_sourceline = best["line_no"] if node_start_idx is None: # Fallback: use search_end_line @@ -60,33 +458,89 @@ def _read_node(self): # noqa:C901 pylint:disable=too-complex # Find the actual node end node_end_idx = node_start_idx + node_end_col = None # Column position where node ends 
self.end_sourceline = self.start_sourceline + # Track nesting level to handle cases where parent and child have same tag + nesting_level = 0 + for idx in range(node_start_idx, len(all_lines)): no_line, line = all_lines[idx] - stripped_line = line.lstrip() if idx == node_start_idx: - # Check if self-closing or single-line - if b"/>" in line: - node_end_idx = idx - self.end_sourceline = no_line - break - if b"<" + node_tag in line and b"") + if self_close_pos != -1: + # Verify this is the closing for our tag, not a nested one + # Simple check: see if there's another opening tag between start and close + between = relevant_line[:self_close_pos] + # Count opening tags in between + temp_count = between.count(b"<" + node_tag) + if temp_count == 1: # Only our opening tag + node_end_idx = idx + node_end_col = node_start_col + self_close_pos + 2 + self.end_sourceline = no_line + break + + # Check if opening and closing tag on same line + close_tag = b"" + close_pos = relevant_line.find(close_tag) + if close_pos != -1: node_end_idx = idx + node_end_col = node_start_col + close_pos + len(close_tag) self.end_sourceline = no_line break + + # Node continues beyond this line + nesting_level = 1 else: + # Look for closing patterns + stripped_line = line.lstrip() + + # Check for self-closing continuation (when tag opened in previous line) + # This handles cases like: + # + if b"/>" in line: + # Check if this is a standalone /> (not part of a new tag) + # by seeing if the line starts with /> or has /> after attributes + if stripped_line.startswith(b"/>") or ( + b">" not in line[: line.find(b"/>")] if b"/>" in line else False + ): + node_end_idx = idx + node_end_col = line.find(b"/>") + 2 + self.end_sourceline = no_line + nesting_level -= 1 + if not nesting_level: + break + # Look for closing tag - if b"" in line or b"" + if close_tag in line: node_end_idx = idx + node_end_col = line.find(close_tag) + len(close_tag) self.end_sourceline = no_line - break - if b"/>" in stripped_line 
and not stripped_line.startswith(b"<"): - # Self-closing continuation - node_end_idx = idx - self.end_sourceline = no_line - break + nesting_level -= 1 + if not nesting_level: + break + + # Count any new opening tags to track nesting + pos = 0 + while True: + pos = line.find(b"<" + node_tag, pos) + if pos == -1: + break + # Verify it's a real opening tag + check_pos = pos + len(node_tag) + 1 + if check_pos < len(line): + next_char = line[check_pos : check_pos + 1] + if next_char in (b" ", b">", b"/", b"\n", b"\r"): + nesting_level += 1 + pos += 1 # Look backwards from node start for comment for idx in range(node_start_idx - 1, -1, -1): @@ -117,9 +571,25 @@ def _read_node(self): # noqa:C901 pylint:disable=too-complex self.content_before += line continue self.content_before += line - elif node_start_idx <= idx <= node_end_idx: + elif idx == node_start_idx: + # For the start line, split at the column position + self.content_before += line[:node_start_col] + + if idx == node_end_idx: + # Node starts and ends on same line + self.content_node += line[node_start_col:node_end_col] + self.content_after += line[node_end_col:] + else: + # Node continues to next line(s) + self.content_node += line[node_start_col:] + elif node_start_idx < idx < node_end_idx: + # Full lines that are part of the node self.content_node += line - else: + elif idx == node_end_idx and idx != node_start_idx: + # Last line of the node + self.content_node += line[:node_end_col] + self.content_after += line[node_end_col:] + elif idx > node_end_idx: self.content_after += line # Remove comment from content_before if present diff --git a/src/oca_pre_commit_hooks/xml_position_parser.py b/src/oca_pre_commit_hooks/xml_position_parser.py new file mode 100644 index 0000000..b743649 --- /dev/null +++ b/src/oca_pre_commit_hooks/xml_position_parser.py @@ -0,0 +1,442 @@ +import html +import re +from dataclasses import dataclass +from typing import List, Optional + +from lxml import etree + + +@dataclass +class 
AttributeInfo: + """Position information for an attribute.""" + + name: str + value: str + start_line: int + start_col: int + end_line: int + end_col: int + + +@dataclass +class ElementInfo: + """Position information for an XML element.""" + + name: str + start_line: int + start_col: int + end_line: int + end_col: int + attributes: List[AttributeInfo] + is_self_closing: bool + + +class XMLPositionParser: + """Parser that finds exact positions of elements and attributes.""" + + def __init__(self, xml_content: bytes): + """ + Initialize the parser. + + Args: + xml_content: XML content as bytes + """ + self.xml_text = xml_content.decode("utf-8") + self.lines = self.xml_text.split("\n") + self.elements: List[ElementInfo] = [] + + def parse(self) -> List[ElementInfo]: + """Parse XML and return position information for all elements.""" + # Robust pattern for multi-line tags + # Matches: or + # Uses greedy matching to capture everything until the closing > + tag_pattern = r"<([a-zA-Z_][\w:.-]*)(.*?)(/?)>" + + for match in re.finditer(tag_pattern, self.xml_text, re.MULTILINE | re.DOTALL): + tag_name = match.group(1) + attributes_str = match.group(2) + is_self_closing = match.group(3) == "/" + + # Skip if this looks like a closing tag + if attributes_str.strip().startswith("/"): + continue + + start_pos = match.start() + start_line, start_col = self._pos_to_line_col(start_pos) + + end_pos = match.end() + end_line, end_col = self._pos_to_line_col(end_pos) + + attrs = self._parse_attributes(attributes_str, start_pos + len(tag_name) + 1) + + element = ElementInfo( + name=tag_name, + start_line=start_line, + start_col=start_col, + end_line=end_line, + end_col=end_col, + attributes=attrs, + is_self_closing=is_self_closing, + ) + + self.elements.append(element) + + return self.elements + + def _pos_to_line_col(self, pos: int) -> tuple: + """Convert an absolute position in text to (line, column).""" + line = 1 + col = 1 + + for i, char in enumerate(self.xml_text): + if i >= pos: + 
break + if char == "\n": + line += 1 + col = 1 + else: + col += 1 + + return line, col + + def _parse_attributes(self, attr_str: str, base_pos: int) -> List[AttributeInfo]: + """Parse attributes from a string and return their positions.""" + attributes = [] + + # Pattern that properly handles escaped quotes inside attribute values + # Matches double quotes: name="value with " inside" + attr_pattern_double = r'([a-zA-Z_][\w:.-]*)\s*=\s*"([^"]*(?:"[^"]*)*)"' + # Matches single quotes: name='value with ' inside' + attr_pattern_single = r"([a-zA-Z_][\w:.-]*)\s*=\s*'([^']*(?:'[^']*)*)'" + + # Try double quotes first + for match in re.finditer(attr_pattern_double, attr_str, re.DOTALL): + attr_name = match.group(1) + attr_value_raw = match.group(2) + + # Decode HTML entities to match what lxml does (" -> ", & -> &, etc.) + attr_value = html.unescape(attr_value_raw) + + attr_start_pos = base_pos + match.start() + attr_end_pos = base_pos + match.end() + + start_line, start_col = self._pos_to_line_col(attr_start_pos) + end_line, end_col = self._pos_to_line_col(attr_end_pos) + + attr_info = AttributeInfo( + name=attr_name, + value=attr_value, + start_line=start_line, + start_col=start_col, + end_line=end_line, + end_col=end_col, + ) + + attributes.append(attr_info) + + # Try single quotes + for match in re.finditer(attr_pattern_single, attr_str, re.DOTALL): + attr_name = match.group(1) + attr_value_raw = match.group(2) + + # Decode HTML entities + attr_value = html.unescape(attr_value_raw) + + attr_start_pos = base_pos + match.start() + attr_end_pos = base_pos + match.end() + + start_line, start_col = self._pos_to_line_col(attr_start_pos) + end_line, end_col = self._pos_to_line_col(attr_end_pos) + + attr_info = AttributeInfo( + name=attr_name, + value=attr_value, + start_line=start_line, + start_col=start_col, + end_line=end_line, + end_col=end_col, + ) + + attributes.append(attr_info) + + return attributes + + +class PositionElement(etree.ElementBase): + """ + Custom lxml 
Element class that includes position information. + + Additional properties: + - start_line: Line where element starts + - start_col: Column where element starts + - end_line: Line where element ends + - end_col: Column where element ends + - is_self_closing: Whether element is self-closing + - position_attributes: List of AttributeInfo objects + """ + + __slots__ = () + + def _set_position_data(self, start_line, start_col, end_line, end_col, is_self_closing, position_attributes): + """Internal method to set position data.""" + self.set("_pos_start_line", str(start_line)) + self.set("_pos_start_col", str(start_col)) + self.set("_pos_end_line", str(end_line)) + self.set("_pos_end_col", str(end_col)) + self.set("_pos_is_self_closing", str(is_self_closing)) + + # Try to store position_attributes directly + try: + object.__setattr__(self, "_pos_attrs_data", position_attributes) + except (AttributeError, TypeError): + pass + + # Also serialize as JSON for reliable storage + import json + + attrs_json = json.dumps( + [ + { + "name": attr.name, + "value": attr.value, + "start_line": attr.start_line, + "start_col": attr.start_col, + "end_line": attr.end_line, + "end_col": attr.end_col, + } + for attr in position_attributes + ] + ) + self.set("_pos_attrs_json", attrs_json) + + @property + def start_line(self) -> Optional[int]: + """Line where element starts.""" + val = self.get("_pos_start_line") + return int(val) if val and val != "None" else None + + @property + def start_col(self) -> Optional[int]: + """Column where element starts.""" + val = self.get("_pos_start_col") + return int(val) if val and val != "None" else None + + @property + def end_line(self) -> Optional[int]: + """Line where element ends.""" + val = self.get("_pos_end_line") + return int(val) if val and val != "None" else None + + @property + def end_col(self) -> Optional[int]: + """Column where element ends.""" + val = self.get("_pos_end_col") + return int(val) if val and val != "None" else None + + 
@property + def is_self_closing(self) -> Optional[bool]: + """Whether element is self-closing.""" + val = self.get("_pos_is_self_closing") + if val is None or val == "None": + return None + return val == "True" + + @property + def position_attributes(self) -> List[AttributeInfo]: + """List of attributes with position information.""" + # Try to get from object attribute first + try: + attrs = object.__getattribute__(self, "_pos_attrs_data") + if attrs: + return attrs + except AttributeError: + pass + + # Fall back to JSON deserialization + import json + + attrs_json = self.get("_pos_attrs_json") + if attrs_json: + try: + attrs_data = json.loads(attrs_json) + return [ + AttributeInfo( + name=a["name"], + value=a["value"], + start_line=a["start_line"], + start_col=a["start_col"], + end_line=a["end_line"], + end_col=a["end_col"], + ) + for a in attrs_data + ] + except (json.JSONDecodeError, KeyError): + pass + + return [] + + +class LXMLPositionEnricher: + """ + Custom lxml parser that enriches elements with position information. + + Usage: + enricher = LXMLPositionEnricher(xml_content) + root = enricher.root + + # Access position info: + element = root.xpath("//record")[0] + print(element.start_line) + print(element.start_col) + print(element.is_self_closing) + print(element.position_attributes[0].start_line) + """ + + def __init__(self, xml_content: bytes): + """ + Initialize the enricher. 
+ + Args: + xml_content: XML content as bytes + """ + self.xml_content = xml_content + + # Create custom parser with PositionElement class + parser = etree.XMLParser() + lookup = etree.ElementDefaultClassLookup(element=PositionElement) + parser.set_element_class_lookup(lookup) + + # Parse with lxml + self.root = etree.fromstring(xml_content, parser) + + # Parse with position parser + self.position_parser = XMLPositionParser(xml_content) + self.position_elements = self.position_parser.parse() + + # Enrich elements with position data + self._enrich_elements() + + def _enrich_elements(self): + """Match and enrich lxml elements with position information.""" + # Collect all lxml elements in document order + lxml_elements = [] + self._traverse_lxml(self.root, lxml_elements) + + # Group position elements by tag name + pos_by_tag = {} + for pos_elem in self.position_elements: + if pos_elem.name not in pos_by_tag: + pos_by_tag[pos_elem.name] = [] + pos_by_tag[pos_elem.name].append(pos_elem) + + # Group lxml elements by tag name + lxml_by_tag = {} + for lxml_elem in lxml_elements: + tag = self._get_tag_name(lxml_elem) + if tag: + if tag not in lxml_by_tag: + lxml_by_tag[tag] = [] + lxml_by_tag[tag].append(lxml_elem) + + # Match and enrich elements with same tag name by order + for tag_name in lxml_by_tag: + if tag_name in pos_by_tag: + lxml_list = lxml_by_tag[tag_name] + pos_list = pos_by_tag[tag_name] + + for i in range(min(len(lxml_list), len(pos_list))): + lxml_elem = lxml_list[i] + pos_elem = pos_list[i] + + if self._attributes_match(lxml_elem, pos_elem): + lxml_elem._set_position_data( + start_line=pos_elem.start_line, + start_col=pos_elem.start_col, + end_line=pos_elem.end_line, + end_col=pos_elem.end_col, + is_self_closing=pos_elem.is_self_closing, + position_attributes=pos_elem.attributes, + ) + + def _traverse_lxml(self, element, elements_list): + """Traverse lxml tree in document order.""" + if isinstance(element.tag, str): + elements_list.append(element) + + for 
child in element: + self._traverse_lxml(child, elements_list) + + def _get_tag_name(self, element) -> Optional[str]: + """Get tag name from lxml element, handling namespaces.""" + tag = element.tag + + if not isinstance(tag, str): + return None + + if "}" in tag: + tag = tag.split("}")[1] + return tag + + def _attributes_match(self, lxml_elem, pos_elem: ElementInfo) -> bool: + """Check if attributes match between lxml element and position element.""" + lxml_attrs = dict(lxml_elem.attrib) + pos_attrs = {attr.name: attr.value for attr in pos_elem.attributes} + + if not lxml_attrs and not pos_attrs: + return True + + if len(lxml_attrs) != len(pos_attrs): + return False + + for key, value in lxml_attrs.items(): + if key not in pos_attrs or pos_attrs[key] != value: + return False + + return True + + +def demo(): + """Demonstration of the enriched lxml parser.""" + xml_test = b""" + + +""" + + print("=== XML Position Parser Demo ===\n") + + enricher = LXMLPositionEnricher(xml_test) + root = enricher.root + + print("Testing element with multi-line attributes:\n") + + strong_elements = root.xpath("//strong") + if strong_elements: + strong = strong_elements[0] + print(f"<{strong.tag}>") + print(f" lxml attributes: {dict(strong.attrib)}") + print(f" Position: ({strong.start_line},{strong.start_col}) → ({strong.end_line},{strong.end_col})") + print(f" Self-closing: {strong.is_self_closing}") + + if strong.position_attributes: + print(f"\n Parsed attributes with positions:") + for attr in strong.position_attributes: + print(f" • {attr.name} = {repr(attr.value)}") + print(f" Position: ({attr.start_line},{attr.start_col}) → ({attr.end_line},{attr.end_col})") + else: + print(f"\n ⚠️ WARNING: position_attributes is empty!") + + print("\n" + "=" * 70) + print("\nAll elements:") + for elem in root.iter(): + if isinstance(elem.tag, str): + print(f" <{elem.tag}> @ line {elem.start_line}, col {elem.start_col}") + + +if __name__ == "__main__": + demo() diff --git 
a/test_repo/odoo18_module/views/deprecated_qweb_directives15.xml b/test_repo/odoo18_module/views/deprecated_qweb_directives15.xml index 8e1e855..d989026 100644 --- a/test_repo/odoo18_module/views/deprecated_qweb_directives15.xml +++ b/test_repo/odoo18_module/views/deprecated_qweb_directives15.xml @@ -1,6 +1,6 @@ - + diff --git a/tests/test_checks.py b/tests/test_checks.py index 65a413a..cca714c 100644 --- a/tests/test_checks.py +++ b/tests/test_checks.py @@ -35,10 +35,10 @@ "xml-dangerous-qweb-replace-low-priority": 9, "xml-deprecated-data-node": 8, "xml-deprecated-openerp-node": 4, - "xml-deprecated-qweb-directive-15": 3, + "xml-deprecated-qweb-directive-15": 4, "xml-deprecated-qweb-directive": 2, "xml-deprecated-tree-attribute": 3, - "xml-double-quotes-py": 3, + "xml-double-quotes-py": 4, "xml-duplicate-fields": 3, "xml-duplicate-record-id": 2, "xml-not-valid-char-link": 2, @@ -258,7 +258,8 @@ def test_autofix(self): with open(t_out, "rb") as f: content = f.read() - assert b"t-out" not in content, "The deprecated t-out was previously fixed" + assert content.count(b"t-esc") > 1, "The deprecated t-esc was previously fixed" + assert content.count(b"t-raw") > 1, "The deprecated t-raw was previously fixed" self.checks_run(self.file_paths, autofix=True, no_exit=True, no_verbose=False) @@ -343,4 +344,6 @@ def test_autofix(self): with open(t_out, "rb") as f: content = f.read() - assert b"t-out" in content, "The deprecated t-out was not fixed" + # comments contain 1 valid deprecated + assert content.count(b"t-esc") == 1, "The deprecated t-esc was not fixed" + assert content.count(b"t-raw") == 1, "The deprecated t-raw was not fixed"