From ea7c05ad843f4bbe748ce4aac4b998ca1a78d919 Mon Sep 17 00:00:00 2001 From: Marcin Wilk Date: Thu, 13 Mar 2025 16:00:48 +0100 Subject: [PATCH] Don't fail on MaxSearchableLineLengthReached When searchkit looks for line feeds in a file, it will raise the MaxSearchableLineLengthReached after reading 1048576 chars and not finding one. Now we catch the exception, log a message and skip searching the rest of the file. This approach allows for successful completion of the parallel searches. Little refactoring in 'apply_to_file' was needed to avoid pylint 'too-many-return-statements / R0911' complaint. Closes-issue: https://github.com/dosaboy/searchkit/issues/21 Signed-off-by: Marcin Wilk --- searchkit/constraints.py | 27 +++++++++++++++------------ tests/unit/test_search_constraints.py | 19 +++++++++++++++++++ 2 files changed, 34 insertions(+), 12 deletions(-) diff --git a/searchkit/constraints.py b/searchkit/constraints.py index f7044f2..2251456 100644 --- a/searchkit/constraints.py +++ b/searchkit/constraints.py @@ -923,6 +923,7 @@ def apply_to_file(self, fd, destructive=True): log.debug("using cached offset") return self._results[fd.name] + newpos = 0 log.debug("c:%s: starting binary seek search to %s in file %s " "(destructive=True)", self.id, self.since_date, fd.name) try: @@ -941,26 +942,28 @@ def apply_to_file(self, fd, destructive=True): self._results[fd.name] = new_offset except NoTimestampsFoundInFile: log.debug("c:%s no timestamp found in file", self.id) - fd.seek(0) - return fd.tell() + newpos = fd.seek(0) except NoValidLinesFoundInFile: log.debug("c:%s no date after %s found in file - seeking to end", self.since_date, self.id) - fd.seek(0, 2) - return fd.tell() + newpos = fd.seek(0, 2) except TooManyLinesWithoutDate as exc: log.warning("c:%s failed to find a line containing a date: %s", self.id, exc) - fd.seek(0) - return fd.tell() + newpos = fd.seek(0) except MaxSearchableLineLengthReached as exc: - log.error("c:%s exceeded allowed line length search limit " - "before finding line feed: %s", self.id, exc) - raise + log.warning("c:%s exceeded allowed line length search limit " + "before finding line feed: %s" + " - moving to EOF to skip searching this file", + self.id, exc) + newpos = fd.seek(0, 2) + else: + # seek completed without issues + log.debug("c:%s: finished binary seek search in file %s, " + "offset %d", self.id, fd.name, self._results[fd.name]) + newpos = self._results[fd.name] - log.debug("c:%s: finished binary seek search in file %s, offset %d", - self.id, fd.name, self._results[fd.name]) - return self._results[fd.name] + return newpos def stats(self): _stats = {'lines_searched': self._lines_searched, diff --git a/tests/unit/test_search_constraints.py b/tests/unit/test_search_constraints.py index c6a7ba4..2073d53 100644 --- a/tests/unit/test_search_constraints.py +++ b/tests/unit/test_search_constraints.py @@ -4,8 +4,10 @@ import tempfile import shutil import subprocess +import logging from datetime import datetime from unittest import mock +from unittest.mock import patch from io import BytesIO from searchkit.constraints import ( @@ -206,6 +208,23 @@ def test_binary_search_4(self): stats = {'line': {'fail': 0, 'pass': 0}, 'lines_searched': 0} self.assertEqual(c.stats(), stats) + @patch.object(LogFileDateSinceSeeker, 'run', + side_effect=MaxSearchableLineLengthReached) + @patch.object(logging.Logger, "warning") + @utils.create_files({'f1': LOGS_W_TS}) + def test_apply_to_file_throw_max_line_len_err(self, mock_log, mock_lfdss): + self.current_date = self.get_date('Tue Jan 03 00:00:01 UTC 2022') + _file = os.path.join(self.data_root, 'f1') + c = SearchConstraintSearchSince(current_date=self.current_date, + ts_matcher_cls=TimestampSimple, days=7) + with open(_file, 'rb') as fd: + c.apply_to_file(fd) + args, kwargs = mock_log.call_args + self.assertTrue("c:%s exceeded allowed line length search limit " + "before finding line feed: %s" + " - moving to EOF to skip searching this file" + in args) + class TestSearchState(TestSearchKitBase):