From 0efe726d85bf1720bc3104f7c17de417f65fbef9 Mon Sep 17 00:00:00 2001 From: Caleb Evans Date: Tue, 20 Jan 2026 11:19:58 -0700 Subject: [PATCH 1/2] fix(xml): add escaping, root element, and pretty printing - Add XML special character escaping (&, <, >) to prevent parse errors - Add proper XML declaration and root element - Add pretty printing with indentation for readability - Add test coverage for XML escaping - Update documentation to reflect new XML structure --- docs/architecture.md | 57 +++++++++++++++++------------ src/cordon/postprocess/formatter.py | 28 ++++++++++---- tests/test_integration.py | 4 +- tests/test_postprocess.py | 42 ++++++++++++++++++++- 4 files changed, 99 insertions(+), 32 deletions(-) diff --git a/docs/architecture.md b/docs/architecture.md index 59584b0..8cd53c4 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -301,33 +301,44 @@ Merged block: lines 10-30 (score=max(0.15, 0.18, 0.12) = 0.18) ### 6. Output Formatting -**Generates structured XML output with metadata.** +**Generates pretty-printed, structured XML output with metadata.** ```xml - -[Sun Dec 04 07:18:00 2005] [error] mod_jk child workerEnv in error state 6 -[Sun Dec 04 07:18:00 2005] [notice] workerEnv.init() ok /etc/httpd/conf/workers2.properties -[Sun Dec 04 07:18:00 2005] [error] mod_jk child workerEnv in error state 7 -[Sun Dec 04 07:45:45 2005] [error] [client 63.13.186.196] Directory index forbidden by rule: /var/www/html/ -[Sun Dec 04 08:54:17 2005] [error] [client 147.31.138.75] Directory index forbidden by rule: /var/www/html/ -[Sun Dec 04 09:35:12 2005] [error] [client 207.203.80.15] Directory index forbidden by rule: /var/www/html/ -[Sun Dec 04 10:53:30 2005] [error] [client 218.76.139.20] Directory index forbidden by rule: /var/www/html/ -[Sun Dec 04 11:11:07 2005] [error] [client 24.147.151.74] Directory index forbidden by rule: /var/www/html/ -[Sun Dec 04 11:33:18 2005] [error] [client 211.141.93.88] Directory index forbidden by rule: /var/www/html/ 
-[Sun Dec 04 11:42:43 2005] [error] [client 216.127.124.16] Directory index forbidden by rule: /var/www/html/ -[Sun Dec 04 12:33:13 2005] [error] [client 208.51.151.210] Directory index forbidden by rule: /var/www/html/ -[Sun Dec 04 13:32:32 2005] [error] [client 65.68.235.27] Directory index forbidden by rule: /var/www/html/ -[Sun Dec 04 14:29:00 2005] [error] [client 4.245.93.87] Directory index forbidden by rule: /var/www/html/ -[Sun Dec 04 15:18:36 2005] [error] [client 67.154.58.130] Directory index forbidden by rule: /var/www/html/ -[Sun Dec 04 15:59:01 2005] [error] [client 24.83.37.136] Directory index forbidden by rule: /var/www/html/ -[Sun Dec 04 16:24:03 2005] [notice] jk2_init() Found child 1219 in scoreboard slot 6 -[Sun Dec 04 16:24:05 2005] [error] [client 58.225.62.140] Directory index forbidden by rule: /var/www/html/ -[Sun Dec 04 16:24:06 2005] [notice] workerEnv.init() ok /etc/httpd/conf/workers2.properties -[Sun Dec 04 16:24:06 2005] [error] mod_jk child workerEnv in error state 6 -[Sun Dec 04 16:31:07 2005] [notice] jk2_init() Found child 1248 in scoreboard slot 7 - + + + + + [Sun Dec 04 07:18:00 2005] [error] mod_jk child workerEnv in error state 6 + [Sun Dec 04 07:18:00 2005] [notice] workerEnv.init() ok /etc/httpd/conf/workers2.properties + [Sun Dec 04 07:18:00 2005] [error] mod_jk child workerEnv in error state 7 + [Sun Dec 04 07:45:45 2005] [error] [client 63.13.186.196] Directory index forbidden by rule: /var/www/html/ + [Sun Dec 04 08:54:17 2005] [error] [client 147.31.138.75] Directory index forbidden by rule: /var/www/html/ + [Sun Dec 04 09:35:12 2005] [error] [client 207.203.80.15] Directory index forbidden by rule: /var/www/html/ + [Sun Dec 04 10:53:30 2005] [error] [client 218.76.139.20] Directory index forbidden by rule: /var/www/html/ + [Sun Dec 04 11:11:07 2005] [error] [client 24.147.151.74] Directory index forbidden by rule: /var/www/html/ + [Sun Dec 04 11:33:18 2005] [error] [client 211.141.93.88] Directory index forbidden by 
rule: /var/www/html/ + [Sun Dec 04 11:42:43 2005] [error] [client 216.127.124.16] Directory index forbidden by rule: /var/www/html/ + [Sun Dec 04 12:33:13 2005] [error] [client 208.51.151.210] Directory index forbidden by rule: /var/www/html/ + [Sun Dec 04 13:32:32 2005] [error] [client 65.68.235.27] Directory index forbidden by rule: /var/www/html/ + [Sun Dec 04 14:29:00 2005] [error] [client 4.245.93.87] Directory index forbidden by rule: /var/www/html/ + [Sun Dec 04 15:18:36 2005] [error] [client 67.154.58.130] Directory index forbidden by rule: /var/www/html/ + [Sun Dec 04 15:59:01 2005] [error] [client 24.83.37.136] Directory index forbidden by rule: /var/www/html/ + [Sun Dec 04 16:24:03 2005] [notice] jk2_init() Found child 1219 in scoreboard slot 6 + [Sun Dec 04 16:24:05 2005] [error] [client 58.225.62.140] Directory index forbidden by rule: /var/www/html/ + [Sun Dec 04 16:24:06 2005] [notice] workerEnv.init() ok /etc/httpd/conf/workers2.properties + [Sun Dec 04 16:24:06 2005] [error] mod_jk child workerEnv in error state 6 + [Sun Dec 04 16:31:07 2005] [notice] jk2_init() Found child 1248 in scoreboard slot 7 + + + ``` +**Output structure:** +- Valid XML with proper declaration and root element +- Pretty-printed with indentation for readability +- XML special characters (`&`, `<`, `>`) are automatically escaped +- Each block element contains metadata and original log content + **Metadata included:** - **Line range**: References back to original file - **Anomaly score**: Quantifies unusualness diff --git a/src/cordon/postprocess/formatter.py b/src/cordon/postprocess/formatter.py index 45f68cf..698a014 100644 --- a/src/cordon/postprocess/formatter.py +++ b/src/cordon/postprocess/formatter.py @@ -1,5 +1,6 @@ from collections.abc import Sequence from pathlib import Path +from xml.sax.saxutils import escape from cordon.core.types import MergedBlock @@ -26,10 +27,10 @@ def format_blocks(self, merged_blocks: Sequence[MergedBlock], original_file: Pat Formatted string with XML 
tags and original content """ if not merged_blocks: - return "" + return '\n' # merged blocks are sorted by start_line from the merger - output_parts = [] + output_parts = ['', "", ""] block_idx = 0 current_line = 1 @@ -51,18 +52,31 @@ def format_blocks(self, merged_blocks: Sequence[MergedBlock], original_file: Pat content_lines.append(next_line) current_line += 1 - # format the block + # format the block with indentation tag = ( - f'' ) content = "".join(content_lines) - output_parts.append(f"{tag}\n{content}") + # Escape XML special characters to ensure valid XML output + escaped_content = escape(content) + + # Indent each line of content for pretty printing + # Preserve whitespace-only lines from original content + indented_content = "\n".join( + " " + line if line else line for line in escaped_content.splitlines() + ) + + output_parts.append(f"{tag}\n{indented_content}\n ") + output_parts.append("") # blank line between blocks # move to next block block_idx += 1 current_line += 1 - # join blocks with double newline separator - return "\n\n".join(output_parts) + # close the root element + output_parts.append("") + + # join with newlines + return "\n".join(output_parts) diff --git a/tests/test_integration.py b/tests/test_integration.py index 203533b..a87d749 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -56,7 +56,9 @@ def test_analyze_empty_log(self) -> None: assert result.total_windows == 0 assert result.significant_windows == 0 assert result.merged_blocks == 0 - assert result.output == "" + assert ( + result.output == '\n' + ) finally: temp_path.unlink() diff --git a/tests/test_postprocess.py b/tests/test_postprocess.py index 8ce03ea..8cd7d27 100644 --- a/tests/test_postprocess.py +++ b/tests/test_postprocess.py @@ -136,6 +136,9 @@ def test_format_single_block(self) -> None: formatter = OutputFormatter() output = formatter.format_blocks(blocks, temp_path) + assert '' in output + assert "" in output + assert "" in output assert '' in 
output assert "line 1\n" in output assert "line 2\n" in output @@ -161,6 +164,9 @@ def test_format_multiple_blocks(self) -> None: formatter = OutputFormatter() output = formatter.format_blocks(blocks, temp_path) + assert '' in output + assert "" in output + assert "" in output assert '' in output assert '' in output assert output.count("") == 2 @@ -181,6 +187,40 @@ def test_format_empty_blocks(self) -> None: formatter = OutputFormatter() output = formatter.format_blocks(blocks, temp_path) - assert output == "" + assert output == '\n' + finally: + temp_path.unlink() + + def test_format_escapes_xml_special_chars(self) -> None: + """Test that XML special characters are properly escaped.""" + with NamedTemporaryFile(mode="w", delete=False, suffix=".log") as f: + f.write("command: test |& tee file.txt\n") + f.write("error: x < y && z > 10\n") + f.write("message: \"quoted\" & 'single'\n") + temp_path = Path(f.name) + + try: + from cordon.core.types import MergedBlock + + blocks = [MergedBlock(start_line=1, end_line=3, original_windows=(0,), max_score=0.8)] + + formatter = OutputFormatter() + output = formatter.format_blocks(blocks, temp_path) + + # Verify XML structure + assert '' in output + assert "" in output + assert "" in output + # Verify XML special characters are escaped (& < > must be escaped in text content) + assert "&" in output + assert "<" in output + assert ">" in output + # Verify the content is properly escaped + assert "command: test |& tee file.txt" in output + assert "error: x < y && z > 10" in output + # Verify raw special characters are not present where they should be escaped + assert "|& tee" not in output # & should be escaped + assert "x < y" not in output # < should be escaped + assert "z > 10" not in output # > should be escaped finally: temp_path.unlink() From dbe52663a8c2fae77e9e800a1acd465c2804ac91 Mon Sep 17 00:00:00 2001 From: Caleb Evans Date: Tue, 20 Jan 2026 11:22:01 -0700 Subject: [PATCH 2/2] release: v0.3.3 --- pyproject.toml | 2 +- 
src/cordon/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 372895c..aef6361 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "cordon" -version = "0.3.2" +version = "0.3.3" description = "Semantic anomaly detection for system log files" readme = "README.md" requires-python = ">=3.10" diff --git a/src/cordon/__init__.py b/src/cordon/__init__.py index bdb6e92..d1efde8 100644 --- a/src/cordon/__init__.py +++ b/src/cordon/__init__.py @@ -2,7 +2,7 @@ from cordon.core.types import AnalysisResult, MergedBlock, ScoredWindow, TextWindow from cordon.pipeline import SemanticLogAnalyzer -__version__ = "0.3.2" +__version__ = "0.3.3" __all__ = [ "SemanticLogAnalyzer",