From ad5570c8d82ff4128ff77c8240955ac6f610392f Mon Sep 17 00:00:00 2001 From: Romke van Dijk Date: Tue, 20 May 2025 10:40:56 +0200 Subject: [PATCH 1/2] Adding fix for infinite loop when combining add parameters --- bin/demeuk.py | 29 +++++++++++++++++++---------- tests/conftest.py | 4 ++++ tests/test_app.py | 25 +++++++++++++++++++++++-- tox.ini | 1 + 4 files changed, 47 insertions(+), 12 deletions(-) diff --git a/bin/demeuk.py b/bin/demeuk.py index 3381db0..dd4b693 100755 --- a/bin/demeuk.py +++ b/bin/demeuk.py @@ -142,6 +142,7 @@ check-replacement-character, check-empty-line """ from binascii import hexlify, unhexlify +from collections import deque from glob import glob from html import unescape from inspect import cleandoc @@ -1016,8 +1017,16 @@ def clean_up(lines): """ results = [] log = [] + processed_lines = set() + work_queue = deque(lines) + + while work_queue: + line = work_queue.popleft() + + if line in processed_lines: + continue + processed_lines.add(line) - for line in lines: # Check if the limit is set, if so minus 1 and if 0 is reached lets quit. if type(config['limit']) is int: if config['limit'] > 0: @@ -1057,7 +1066,7 @@ def clean_up(lines): if status: # Lines contains hex, this function will return binary string, so add it back to # our undecoded lines - lines.append(line_decoded) + work_queue.append(line_decoded) if config['debug']: log.append(f'Clean_hex; replaced $HEX[], added to queue and quiting; {line}{linesep}') # Aborting future processing of this line. @@ -1069,7 +1078,7 @@ def clean_up(lines): if status: # Line contains html string, because this can be binary data (linefeeds etc) # convert back to binary string and add to queue again. 
- lines.append(line_decoded.encode()) + work_queue.append(line_decoded.encode()) if config['debug']: log.append(f'Clean_html; replaced html, added to queue and quiting; {line_decoded}{linesep}') stop = True @@ -1283,49 +1292,49 @@ def clean_up(lines): for modified_line in modified_lines: if config['debug']: log.append(f'Add_split; new line because of split; {modified_line}{linesep}') - lines.append(modified_line.encode()) + work_queue.append(modified_line.encode()) if config.get('add-lower'): modified_line = add_lower(line_decoded) if modified_line: if config['debug']: log.append(f'Add_lower; new line; {modified_line}{linesep}') - lines.append(modified_line.encode()) + work_queue.append(modified_line.encode()) if config.get('add-first-upper'): modified_line = add_first_upper(line_decoded) if modified_line: if config['debug']: log.append(f'Add_first_upper; new line; {modified_line}{linesep}') - lines.append(modified_line.encode()) + work_queue.append(modified_line.encode()) if config.get('add-title-case'): modified_line = add_title_case(line_decoded) if modified_line: if config['debug']: log.append(f'Add_title_case; new line; {modified_line}{linesep}') - lines.append(modified_line.encode()) + work_queue.append(modified_line.encode()) if config.get('add-latin-ligatures'): modified_line = add_latin_ligatures(line_decoded) if modified_line: if config['debug']: log.append(f'Add_latin_ligatures; new line; {modified_line}{linesep}') - lines.append(modified_line.encode()) + work_queue.append(modified_line.encode()) if config.get('add-umlaut'): status, modified_line = clean_add_umlaut(line_decoded) if status: if config['debug']: log.append(f'Add_umlaut; new line; {modified_line}{linesep}') - lines.append(modified_line.encode()) + work_queue.append(modified_line.encode()) if config.get('add-without-punctuation'): modified_line = add_without_punctuation(line_decoded, config.get('punctuation')) if modified_line: if config['debug']: log.append(f'Add_without_punctuation; new 
line; {modified_line}{linesep}') - lines.append(modified_line.encode()) + work_queue.append(modified_line.encode()) if config['debug']: log.append(f'----End---- {line_decoded}{linesep}{linesep}') diff --git a/tests/conftest.py b/tests/conftest.py index 52b75c6..0a2d6a6 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -393,3 +393,7 @@ file.write(f'_amsterdam {linesep}') file.write(f'ROTTERDAM_ {linesep}') file.write(f'Cookie Monster {linesep}') + +with open('testdata/input54', 'w') as file: + file.write(f'Golf Trip{linesep}') + file.write(f'Sequences{linesep}') diff --git a/tests/test_app.py b/tests/test_app.py index f57ab7f..f603ca8 100644 --- a/tests/test_app.py +++ b/tests/test_app.py @@ -2,7 +2,7 @@ from subprocess import PIPE, run from unittest.mock import patch -from pytest import raises +from pytest import raises, mark from bin.demeuk import main @@ -188,7 +188,7 @@ def test_language_processing(): with patch.object(sys, 'argv', testargs): main() line_num_output = calculate_line_numbers('testdata/output11') - assert line_num_output == 29 + assert line_num_output == 21 with open('testdata/output11') as f: filecontent = f.read() assert 'cijfer\n' in filecontent @@ -982,3 +982,24 @@ def test_check_contains(): assert '_amsterdam' not in filecontent assert 'ROTTERDAM_' not in filecontent assert 'Cookie Monster' in filecontent + + +@mark.timeout(1) +def test_infinite_loop(): + testargs = [ + 'demeuk', '-i', 'testdata/input54', '-o', 'testdata/output54', '-l', 'testdata/log54', + '--add-lower', '--add-title-case', + ] + + with patch.object(sys, 'argv', testargs): + main() + + with open('testdata/output54') as f: + filecontent = f.read() + + line_num_output = calculate_line_numbers('testdata/output54') + assert line_num_output == 4 + assert 'Golf Trip' in filecontent + assert 'Sequences' in filecontent + assert 'golf trip' in filecontent + assert 'sequences' in filecontent diff --git a/tox.ini b/tox.ini index 8c7fe31..5fdac49 100644 --- a/tox.ini +++ 
b/tox.ini @@ -13,6 +13,7 @@ deps = -rrequirements.txt pytest flake8 + pytest-timeout commands = pytest flake8 From ed3e0da2a372f847ab5b302edc314eb2b817fb05 Mon Sep 17 00:00:00 2001 From: Romke van Dijk Date: Tue, 20 May 2025 10:41:46 +0200 Subject: [PATCH 2/2] Bumping version --- bin/demeuk.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/demeuk.py b/bin/demeuk.py index dd4b693..117ba1c 100755 --- a/bin/demeuk.py +++ b/bin/demeuk.py @@ -172,7 +172,7 @@ from unidecode import unidecode -version = '4.5.0' +version = '4.5.1' # Search from start to finish for the string $HEX[], with block of a-f0-9 with even number # of hex chars. The first match group is repeated.