From 7ee346537b4f5f4be8ac71f83ee5abf81bf0afbd Mon Sep 17 00:00:00 2001 From: Mateusz Sterczewski Date: Fri, 23 Jan 2026 14:18:47 +0100 Subject: [PATCH 1/2] CM-57848-Fix UTF encoding when displaying code snippet --- cycode/cli/printers/tables/table_printer.py | 4 +- .../cli/printers/utils/code_snippet_syntax.py | 4 +- cycode/cli/printers/utils/rich_helpers.py | 4 +- cycode/cli/utils/string_utils.py | 9 ++ .../printers/utils/test_rich_encoding_fix.py | 85 +++++++++++++++++++ 5 files changed, 103 insertions(+), 3 deletions(-) create mode 100644 tests/cli/printers/utils/test_rich_encoding_fix.py diff --git a/cycode/cli/printers/tables/table_printer.py b/cycode/cli/printers/tables/table_printer.py index 6a5dd198..4468ef9f 100644 --- a/cycode/cli/printers/tables/table_printer.py +++ b/cycode/cli/printers/tables/table_printer.py @@ -8,7 +8,7 @@ from cycode.cli.printers.tables.table_printer_base import TablePrinterBase from cycode.cli.printers.utils import is_git_diff_based_scan from cycode.cli.printers.utils.detection_ordering.common_ordering import sort_and_group_detections_from_scan_result -from cycode.cli.utils.string_utils import get_position_in_line, obfuscate_text +from cycode.cli.utils.string_utils import get_position_in_line, obfuscate_text, sanitize_text_for_encoding if TYPE_CHECKING: from cycode.cli.models import LocalScanResult @@ -96,6 +96,8 @@ def _enrich_table_with_detection_code_segment_values( if not self.show_secret: violation = obfuscate_text(violation) + violation = sanitize_text_for_encoding(violation) + table.add_cell(LINE_NUMBER_COLUMN, str(detection_line)) table.add_cell(COLUMN_NUMBER_COLUMN, str(detection_column)) table.add_cell(VIOLATION_LENGTH_COLUMN, f'{violation_length} chars') diff --git a/cycode/cli/printers/utils/code_snippet_syntax.py b/cycode/cli/printers/utils/code_snippet_syntax.py index 20f94d4e..57bc084e 100644 --- a/cycode/cli/printers/utils/code_snippet_syntax.py +++ b/cycode/cli/printers/utils/code_snippet_syntax.py @@ -5,7 +5,7 @@ from cycode.cli import consts from cycode.cli.console import _SYNTAX_HIGHLIGHT_THEME from cycode.cli.printers.utils import is_git_diff_based_scan -from cycode.cli.utils.string_utils import get_position_in_line, obfuscate_text +from cycode.cli.utils.string_utils import get_position_in_line, obfuscate_text, sanitize_text_for_encoding if TYPE_CHECKING: from cycode.cli.models import Document @@ -72,6 +72,7 @@ def _get_code_snippet_syntax_from_file( code_lines_to_render.append(line_content) code_to_render = '\n'.join(code_lines_to_render) + code_to_render = sanitize_text_for_encoding(code_to_render) return _get_syntax_highlighted_code( code=code_to_render, lexer=Syntax.guess_lexer(document.path, code=code_to_render), @@ -94,6 +95,7 @@ def _get_code_snippet_syntax_from_git_diff( violation = line_content[detection_position_in_line : detection_position_in_line + violation_length] line_content = line_content.replace(violation, obfuscate_text(violation)) + line_content = sanitize_text_for_encoding(line_content) return _get_syntax_highlighted_code( code=line_content, lexer='diff', diff --git a/cycode/cli/printers/utils/rich_helpers.py b/cycode/cli/printers/utils/rich_helpers.py index 52d2a0f2..6049b211 100644 --- a/cycode/cli/printers/utils/rich_helpers.py +++ b/cycode/cli/printers/utils/rich_helpers.py @@ -5,6 +5,7 @@ from rich.panel import Panel from cycode.cli.console import console +from cycode.cli.utils.string_utils import sanitize_text_for_encoding if TYPE_CHECKING: from rich.console import RenderableType @@ -20,8 +21,9 @@ def get_panel(renderable: 'RenderableType', title: str) -> Panel: def get_markdown_panel(markdown_text: str, title: str) -> Panel: + sanitized_text = sanitize_text_for_encoding(markdown_text.strip()) return get_panel( - Markdown(markdown_text.strip()), + Markdown(sanitized_text), title=title, ) diff --git a/cycode/cli/utils/string_utils.py b/cycode/cli/utils/string_utils.py index c3c0c6c6..ac3987f4 100644 --- a/cycode/cli/utils/string_utils.py +++ b/cycode/cli/utils/string_utils.py @@ -65,3 +65,12 @@ def shortcut_dependency_paths(dependency_paths_list: str) -> str: result += '\n' return result.rstrip().rstrip(',') + + +def sanitize_text_for_encoding(text: str) -> str: + """Sanitize text by replacing surrogate characters and invalid UTF-8 sequences. + + This prevents encoding errors when Rich tries to display the content, especially on Windows. + Surrogate characters (U+D800 to U+DFFF) cannot be encoded to UTF-8 and will cause errors. + """ + return text.encode('utf-8', errors='replace').decode('utf-8') diff --git a/tests/cli/printers/utils/test_rich_encoding_fix.py b/tests/cli/printers/utils/test_rich_encoding_fix.py new file mode 100644 index 00000000..e735b9c7 --- /dev/null +++ b/tests/cli/printers/utils/test_rich_encoding_fix.py @@ -0,0 +1,85 @@ +"""Tests for Rich encoding fix to handle surrogate characters.""" + +from io import StringIO +from unittest.mock import MagicMock + +from rich.console import Console + +from cycode.cli import consts +from cycode.cli.models import Document +from cycode.cli.printers.rich_printer import RichPrinter +from cycode.cyclient.models import Detection + + +def create_strict_encoding_console() -> tuple[Console, StringIO]: + """Create a Console that enforces strict UTF-8 encoding, simulating Windows console behavior. + + When Rich writes to the console, the file object needs to encode strings to bytes. + With errors='strict' (default for TextIOWrapper), this raises UnicodeEncodeError on surrogates. + This function simulates that behavior to test the encoding fix. + """ + buffer = StringIO() + + class StrictEncodingWrapper: + def __init__(self, file_obj: StringIO) -> None: + self._file = file_obj + + def write(self, text: str) -> int: + """Validate encoding before writing to simulate strict encoding behavior.""" + text.encode('utf-8') + return self._file.write(text) + + def flush(self) -> None: + self._file.flush() + + def isatty(self) -> bool: + return False + + def __getattr__(self, name: str): + # Delegate all other attributes to the underlying file + return getattr(self._file, name) + + strict_file = StrictEncodingWrapper(buffer) + console = Console(file=strict_file, width=80, force_terminal=False) + return console, buffer + + +def test_rich_printer_handles_surrogate_characters_in_violation_card() -> None: + """Test that RichPrinter._print_violation_card() handles surrogate characters without errors. + + The error occurs in Rich's console._write_buffer() -> write() when console.print() is called. + On Windows with strict encoding, this raises UnicodeEncodeError on surrogates. + """ + surrogate_char = chr(0xDC96) + document_content = 'A' * 1236 + surrogate_char + 'B' * 100 + document = Document( + path='test.py', + content=document_content, + is_git_diff_format=False, + ) + + detection = Detection( + detection_type_id='test-id', + type='test-type', + message='Test message', + detection_details={ + 'description': 'Summary with ' + surrogate_char + ' surrogate character', + 'policy_display_name': 'Test Policy', + 'start_position': 1236, + 'length': 1, + 'line': 0, + }, + detection_rule_id='test-rule-id', + severity='Medium', + ) + + mock_ctx = MagicMock() + mock_ctx.obj = { + 'scan_type': consts.SAST_SCAN_TYPE, + 'show_secret': False, + } + mock_ctx.info_name = consts.SAST_SCAN_TYPE + + console, _ = create_strict_encoding_console() + printer = RichPrinter(mock_ctx, console, console) + printer._print_violation_card(document, detection, 1, 1) From ed93814177a4b07fd6cccc3c238247fd4f22d9a1 Mon Sep 17 00:00:00 2001 From: Mateusz Sterczewski Date: Fri, 23 Jan 2026 14:23:32 +0100 Subject: [PATCH 2/2] CM-57848-Fixes --- cycode/cli/utils/string_utils.py | 2 +- tests/cli/printers/__init__.py | 0 tests/cli/printers/utils/__init__.py | 0 .../printers/utils/test_rich_encoding_fix.py | 25 ++++++++++--------- 4 files changed, 14 insertions(+), 13 deletions(-) create mode 100644 tests/cli/printers/__init__.py create mode 100644 tests/cli/printers/utils/__init__.py diff --git a/cycode/cli/utils/string_utils.py b/cycode/cli/utils/string_utils.py index ac3987f4..06d3a51c 100644 --- a/cycode/cli/utils/string_utils.py +++ b/cycode/cli/utils/string_utils.py @@ -69,7 +69,7 @@ def shortcut_dependency_paths(dependency_paths_list: str) -> str: def sanitize_text_for_encoding(text: str) -> str: """Sanitize text by replacing surrogate characters and invalid UTF-8 sequences. - + This prevents encoding errors when Rich tries to display the content, especially on Windows. Surrogate characters (U+D800 to U+DFFF) cannot be encoded to UTF-8 and will cause errors. """ diff --git a/tests/cli/printers/__init__.py b/tests/cli/printers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/cli/printers/utils/__init__.py b/tests/cli/printers/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/cli/printers/utils/test_rich_encoding_fix.py b/tests/cli/printers/utils/test_rich_encoding_fix.py index e735b9c7..721f1c6a 100644 --- a/tests/cli/printers/utils/test_rich_encoding_fix.py +++ b/tests/cli/printers/utils/test_rich_encoding_fix.py @@ -1,6 +1,7 @@ """Tests for Rich encoding fix to handle surrogate characters.""" from io import StringIO +from typing import Any from unittest.mock import MagicMock from rich.console import Console @@ -13,32 +14,32 @@ def create_strict_encoding_console() -> tuple[Console, StringIO]: """Create a Console that enforces strict UTF-8 encoding, simulating Windows console behavior. - + When Rich writes to the console, the file object needs to encode strings to bytes. With errors='strict' (default for TextIOWrapper), this raises UnicodeEncodeError on surrogates. This function simulates that behavior to test the encoding fix. """ buffer = StringIO() - + class StrictEncodingWrapper: def __init__(self, file_obj: StringIO) -> None: self._file = file_obj - + def write(self, text: str) -> int: """Validate encoding before writing to simulate strict encoding behavior.""" text.encode('utf-8') return self._file.write(text) - + def flush(self) -> None: self._file.flush() - + def isatty(self) -> bool: return False - - def __getattr__(self, name: str): + + def __getattr__(self, name: str) -> Any: # Delegate all other attributes to the underlying file return getattr(self._file, name) - + strict_file = StrictEncodingWrapper(buffer) console = Console(file=strict_file, width=80, force_terminal=False) return console, buffer @@ -46,7 +47,7 @@ def __getattr__(self, name: str): def test_rich_printer_handles_surrogate_characters_in_violation_card() -> None: """Test that RichPrinter._print_violation_card() handles surrogate characters without errors. - + The error occurs in Rich's console._write_buffer() -> write() when console.print() is called. On Windows with strict encoding, this raises UnicodeEncodeError on surrogates. """ @@ -57,7 +58,7 @@ def test_rich_printer_handles_surrogate_characters_in_violation_card() -> None: content=document_content, is_git_diff_format=False, ) - + detection = Detection( detection_type_id='test-id', type='test-type', @@ -72,14 +73,14 @@ def test_rich_printer_handles_surrogate_characters_in_violation_card() -> None: detection_rule_id='test-rule-id', severity='Medium', ) - + mock_ctx = MagicMock() mock_ctx.obj = { 'scan_type': consts.SAST_SCAN_TYPE, 'show_secret': False, } mock_ctx.info_name = consts.SAST_SCAN_TYPE - + console, _ = create_strict_encoding_console() printer = RichPrinter(mock_ctx, console, console) printer._print_violation_card(document, detection, 1, 1)