From 7c661b28c1e40185066d0a7909b243e5850e6ea3 Mon Sep 17 00:00:00 2001 From: ringoldsdev Date: Fri, 11 Jul 2025 20:37:50 +0000 Subject: [PATCH 01/17] feat: implemented error handling --- efemel/pipeline.py | 497 ++++++++++++++++++++++++----------------- tests/test_pipeline.py | 219 +++++++++++++++++- 2 files changed, 507 insertions(+), 209 deletions(-) diff --git a/efemel/pipeline.py b/efemel/pipeline.py index aea2baa..d88f5fa 100644 --- a/efemel/pipeline.py +++ b/efemel/pipeline.py @@ -1,5 +1,5 @@ """ -Pipeline module for functional data processing with chunked processing. +Pipeline module for functional data processing with chunked processing and robust error handling. This module provides a Pipeline class that enables functional programming patterns for data transformation and processing, using internal chunking and optional @@ -19,52 +19,72 @@ from typing import Union from typing import overload -T = TypeVar("T") # Type variable for the elements in the pipeline -U = TypeVar("U") # Type variable for transformed elements +# Type variable for the value part of the internal tuple +T = TypeVar("T") +# Type variable for the transformed value part +U = TypeVar("U") + +# Define a type for the internal data structure: a tuple of (value, error) +PipelineItem = tuple[T, Exception | None] + + +class CompoundError(Exception): + """An exception that aggregates multiple underlying errors from a pipeline.""" + + def __init__(self, errors: list[Exception]): + self.errors = errors + super().__init__(self._format_message()) + + def _format_message(self) -> str: + """Create a summary message for all the contained errors.""" + message = f"{len(self.errors)} error(s) occurred in the pipeline:\n" + for i, error in enumerate(self.errors, 1): + message += f" {i}. {type(error).__name__}: {error}\n" + return message + + def __repr__(self) -> str: + return f"CompoundError(errors={self.errors!r})" class Pipeline[T]: """ - A functional pipeline for data processing with internal chunked processing. + A functional pipeline for data processing with internal chunked processing and error handling. The Pipeline class wraps an iterable and provides a fluent interface for - applying transformations, filters, and reductions. Internally, it processes - data in chunks for improved performance and supports concurrent execution. + applying transformations and filters. It processes data in chunks for performance, + captures exceptions without halting execution, and reports all errors at the end. Type Parameters: - T: The type of elements in the pipeline + T: The type of elements being processed in the pipeline. Attributes: - generator: Generator yielding chunks (lists) of elements + generator: A generator yielding chunks of pipeline items. + chunk_size: The number of elements to process in each chunk. """ - generator: Generator[list[T], None, None] + generator: Generator[list[PipelineItem[T]], None, None] def __init__(self, source: Union[Iterable[T], "Pipeline[T]"], chunk_size: int = 1000) -> None: """ Initialize a new Pipeline with the given data source. Args: - source: An iterable that provides the data for the pipeline. - If source is another Pipeline, it will be efficiently composed. - chunk_size: Number of elements per chunk (default: 1000) + source: An iterable or another Pipeline. + chunk_size: The number of elements per chunk (default: 1000). 
""" - match source: - case Pipeline(): - # If source is already a Pipeline, we can use its generator directly - self.generator = source.generator - case Iterable(): - # If source is an iterable, we will chunk it - self.generator = self._chunked(source, chunk_size) - - self.chunk_size = chunk_size + if isinstance(source, Pipeline): + self.generator = source.generator + self.chunk_size = source.chunk_size + else: + self.generator = self._chunked_and_wrap(source, chunk_size) + self.chunk_size = chunk_size @staticmethod - def _chunked(iterable: Iterable[T], size: int) -> Generator[list[T], None, None]: - """Break an iterable into chunks of specified size.""" - chunk = [] + def _chunked_and_wrap(iterable: Iterable[T], size: int) -> Generator[list[PipelineItem[T]], None, None]: + """Break an iterable into chunks of specified size and wrap items.""" + chunk: list[PipelineItem[T]] = [] for item in iterable: - chunk.append(item) + chunk.append((item, None)) # Wrap each item in a (value, error) tuple if len(chunk) == size: yield chunk chunk = [] @@ -72,95 +92,170 @@ def _chunked(iterable: Iterable[T], size: int) -> Generator[list[T], None, None] yield chunk @classmethod - def _from_chunks(cls, chunks: Iterable[list[T]], chunk_size: int = 1000) -> "Pipeline[T]": + def _from_chunks(cls, chunks: Iterable[list[PipelineItem[T]]], chunk_size: int) -> "Pipeline[T]": """Create a pipeline directly from an iterable of chunks.""" - p = cls([]) + p = cls([], chunk_size=chunk_size) p.generator = (chunk for chunk in chunks) - p.chunk_size = chunk_size return p - @classmethod - def from_pipeline(cls, pipeline: "Pipeline[T]") -> "Pipeline[T]": + def __iter__(self) -> Generator[T, None, None]: """ - Create a new Pipeline from another Pipeline. - - This method provides an explicit way to create a new Pipeline from an existing one, - preserving the chunked structure for optimal performance. - - Args: - pipeline: The source Pipeline to copy from - - Returns: - A new Pipeline that will process the same data as the source + Iterate over elements, raising a CompoundError at the end if any exceptions were caught. """ - new_pipeline = cls([]) - new_pipeline.generator = pipeline.generator - return new_pipeline - - def __iter__(self) -> Generator[T, None, None]: - """Iterate over elements by flattening chunks.""" - return (item for chunk in self.generator for item in chunk) + errors: list[Exception] = [] + for chunk in self.generator: + for value, error in chunk: + if error: + errors.append(error) + else: + yield value + if errors: + raise CompoundError(errors) def to_list(self) -> list[T]: - """Convert the pipeline to a list by concatenating all chunks.""" - return [item for chunk in self.generator for item in chunk] + """ + Convert the pipeline to a list, raising a CompoundError if any exceptions were caught. + """ + results: list[T] = [] + errors: list[Exception] = [] + for chunk in self.generator: + for value, error in chunk: + if error: + errors.append(error) + else: + results.append(value) + if errors: + raise CompoundError(errors) + return results def first(self) -> T: - """Get the first element from the pipeline.""" - item = next(self.generator, None) - - if item is None: + """ + Get the first element, raising a CompoundError if any exceptions are found anywhere in the pipeline. + """ + # Note: This is a terminal operation and must consume the generator to find all errors. 
+ items = self.to_list() + if not items: raise StopIteration("Pipeline is empty") + return items[0] + + def map[U](self, function: Callable[[T], U]) -> "Pipeline[U]": + """Transform elements, capturing any exceptions that occur.""" + + def map_generator() -> Generator[list[PipelineItem[U]], None, None]: + for chunk in self.generator: + new_chunk: list[PipelineItem[U]] = [] + for value, error in chunk: + if error: + # Pass through existing errors + new_chunk.append((value, error)) + else: + try: + # Apply the function; on success, error is None + new_chunk.append((function(value), None)) + except Exception as e: + # On failure, capture the exception + new_chunk.append((value, e)) + yield new_chunk - return item.pop(0) + return Pipeline._from_chunks(map_generator(), self.chunk_size) def filter(self, predicate: Callable[[T], bool]) -> "Pipeline[T]": - """Filter elements using a predicate, applied per chunk.""" - - return Pipeline._from_chunks( - ([x for x in chunk if predicate(x)] for chunk in self.generator), - self.chunk_size, - ) - - def map(self, function: Callable[[T], U]) -> "Pipeline[U]": - """Transform elements using a function, applied per chunk.""" - return Pipeline._from_chunks( - ([function(x) for x in chunk] for chunk in self.generator), - self.chunk_size, - ) - - def reduce(self, function: Callable[[U, T], U], initial: U) -> "Pipeline[U]": - """Reduce elements to a single value using the given function.""" - acc = initial - for chunk in self.generator: - for item in chunk: - acc = function(acc, item) - return Pipeline([acc]) + """Filter elements, capturing any exceptions from the predicate.""" + + def filter_generator() -> Generator[list[PipelineItem[T]], None, None]: + for chunk in self.generator: + new_chunk: list[PipelineItem[T]] = [] + for value, error in chunk: + if error: + # Pass through existing errors + new_chunk.append((value, error)) + continue + try: + # Keep item if predicate is true + if predicate(value): + new_chunk.append((value, None)) + except Exception as e: + # If predicate fails, capture the error + new_chunk.append((value, e)) + yield new_chunk - def tap(self, function: Callable[[T], Any]) -> Self: - """Apply side effect to each element without modifying data.""" + return Pipeline._from_chunks(filter_generator(), self.chunk_size) - def tap_chunk(chunk: list[T]) -> list[T]: - return [item for item in chunk if function(item) or True] + def reduce[U](self, function: Callable[[U, T], U], initial: U) -> "Pipeline[U]": + """ + Reduce elements to a single value (intermediate operation). + Errored items are skipped in the reduction and passed through. + """ - return Pipeline._from_chunks(tap_chunk(chunk) for chunk in self.generator) + def reduce_generator() -> Generator[list[PipelineItem[U]], None, None]: + acc = initial + error_items: list[PipelineItem[U]] = [] + for chunk in self.generator: + for value, error in chunk: + if error: + error_items.append((value, error)) + else: + acc = function(acc, value) + # Yield all collected errors first, then the final result. + if error_items: + yield error_items + yield [(acc, None)] + + return Pipeline._from_chunks(reduce_generator(), self.chunk_size) def each(self, function: Callable[[T], Any]) -> None: - """Apply function to each element (terminal operation).""" - deque((function(item) for chunk in self.generator for item in chunk), maxlen=0) + """ + Apply a function to each element (terminal operation). 
- def noop(self) -> None: - """Consume the pipeline without any operation.""" - # Consume all elements in the pipeline without any operation - deque(self.generator, maxlen=0) + Raises a CompoundError at the end if any exceptions were caught. + """ + errors: list[Exception] = [] + for chunk in self.generator: + for value, error in chunk: + if error: + errors.append(error) + else: + function(value) + if errors: + raise CompoundError(errors) + + def apply[U](self, function: Callable[[Self], "Pipeline[U]"]) -> "Pipeline[U]": + """Apply a sequence of transformations to the pipeline.""" + return function(self) - def passthrough(self) -> Self: - """Return the pipeline unchanged (identity operation).""" - return self + def catch[U]( + self, + pipeline_func: Callable[["Pipeline[T]"], "Pipeline[U]"], + on_error: Callable[[Exception], Any], + ) -> "Pipeline[U]": + """ + Apply a pipeline function and handle any captured errors with a callback. - def apply(self, function: Callable[[Self], "Pipeline[U]"]) -> "Pipeline[U]": - """Apply sequence of transformation functions.""" - return function(self) + This method is for inspecting or logging errors mid-stream. It does not + recover from them; the errors remain in the pipeline. + + Args: + pipeline_func: A function that takes a pipeline and returns a new one. + on_error: A function to call with any exception captured in the `pipeline_func`. + + Returns: + A new pipeline that continues with the (potentially errored) items. + """ + processed_pipeline = pipeline_func(self) + + def error_tapping_generator() -> Generator[list[PipelineItem[U]], None, None]: + for chunk in processed_pipeline.generator: + for _, error in chunk: + if error: + try: + on_error(error) + except Exception as e: + # To prevent the handler itself from stopping the pipeline, + # we could log this as a meta-error, but for now, we'll print it. + print(f"Error in `on_error` handler: {e}") + yield chunk # Pass the original chunk through + + return Pipeline._from_chunks(error_tapping_generator(), self.chunk_size) @overload def flatten(self: "Pipeline[list[U]]") -> "Pipeline[U]": ... @@ -171,31 +266,35 @@ def flatten(self: "Pipeline[tuple[U, ...]]") -> "Pipeline[U]": ... @overload def flatten(self: "Pipeline[set[U]]") -> "Pipeline[U]": ... - def flatten( - self: Union["Pipeline[list[U]]", "Pipeline[tuple[U, ...]]", "Pipeline[set[U]]"], - ) -> "Pipeline[Any]": - """Flatten iterable chunks into a single pipeline of elements. - - This method flattens each chunk of iterables and maintains the chunked - structure to avoid memory issues with large datasets. After flattening, - the data is re-chunked to maintain the original chunk_size. - - Example: - [[1, 2], [3, 4]] -> [1, 2, 3, 4] - [(1, 2), (3, 4)] -> [1, 2, 3, 4] - - Note: - We need to overload this method to handle different iterable types because - using Iterable[U] does not preserve the type information for the flattened elements. - It returns Pipeline[Any] instead of Pipeline[U], which is incorrect. 
- """ - - def flatten_generator() -> Generator[Any, None, None]: - """Generator that yields individual flattened items.""" - return (item for chunk in self.generator for iterable in chunk for item in iterable) - - # Re-chunk the flattened stream to maintain consistent chunk size - return Pipeline._from_chunks(self._chunked(flatten_generator(), self.chunk_size), self.chunk_size) + def flatten(self: Union["Pipeline[list[U]]", "Pipeline[tuple[U, ...]]", "Pipeline[set[U]]"]) -> "Pipeline[Any]": + """Flatten nested iterables within the pipeline.""" + + def flatten_generator() -> Generator[list[PipelineItem[Any]], None, None]: + current_chunk: list[PipelineItem[Any]] = [] + for chunk in self.generator: + for value, error in chunk: + if error: + # Pass through errored items directly + current_chunk.append((value, error)) + elif isinstance(value, list | tuple | set): + # Flatten the value from a successful item + for item in value: + current_chunk.append((item, None)) + if len(current_chunk) == self.chunk_size: + yield current_chunk + current_chunk = [] + # Non-iterable, non-errored items are passed through as-is + else: + current_chunk.append((value, None)) + + if len(current_chunk) == self.chunk_size: + yield current_chunk + current_chunk = [] + + if current_chunk: + yield current_chunk + + return Pipeline._from_chunks(flatten_generator(), self.chunk_size) def concurrent( self, @@ -203,108 +302,90 @@ def concurrent( max_workers: int, ordered: bool = True, ) -> "Pipeline[U]": - """ - Applies a pipeline function to each chunk in parallel. - - This method processes chunks concurrently using a thread pool, which can - significantly speed up I/O-bound or GIL-releasing tasks. The provided - function is applied to a new mini-pipeline created from each chunk. + """Applies a pipeline function to each chunk in parallel.""" - Args: - pipeline_func: A function that takes a `Pipeline` and returns a - transformed `Pipeline`. - max_workers: The maximum number of threads to use. - ordered: If True (default), the output chunks will be in the same - order as the input. Setting to False can improve performance - by yielding results as they complete. - - Returns: - A new Pipeline containing the elements from the concurrently - processed chunks. - """ - - def apply_to_chunk(chunk: list[T]) -> list[U]: - # Create a mini-pipeline for the chunk that treats it as a single unit - chunk_pipeline = Pipeline(chunk, chunk_size=len(chunk)) - # Apply the user's pipeline function and collect the results + def apply_to_chunk(chunk: list[PipelineItem[T]]) -> list[PipelineItem[U]]: + # Create a mini-pipeline from the single chunk of (value, error) tuples. + # The chunk size is len(chunk) to ensure it's processed as one unit. 
+ chunk_pipeline = Pipeline._from_chunks([chunk], len(chunk)) processed_pipeline = pipeline_func(chunk_pipeline) - return processed_pipeline.to_list() - - def ordered_generator() -> Generator[list[U], None, None]: - """Yields results in the order they were submitted.""" - with ThreadPoolExecutor(max_workers=max_workers) as executor: - source_iterator = iter(self.generator) - futures = deque() - - # Prime the executor with the initial set of tasks - for _ in range(max_workers): - try: - chunk = next(source_iterator) - futures.append(executor.submit(apply_to_chunk, chunk)) - except StopIteration: - break # No more chunks in the source - - # As tasks complete, yield results and submit new tasks - while futures: - completed_future = futures.popleft() - yield completed_future.result() - - try: - chunk = next(source_iterator) - futures.append(executor.submit(apply_to_chunk, chunk)) - except StopIteration: - continue # All chunks have been submitted + # The result is already a list of chunks of tuples, so we flatten it by one level. + return [item for processed_chunk in processed_pipeline.generator for item in processed_chunk] - def unordered_generator() -> Generator[list[U], None, None]: - """Yields results as soon as they complete.""" + def process_in_pool( + gen_func: Callable[[], Generator[list[PipelineItem[U]], None, None]], + ) -> Generator[list[PipelineItem[U]], None, None]: with ThreadPoolExecutor(max_workers=max_workers) as executor: - source_iterator = iter(self.generator) - futures = set() - - # Prime the executor with the initial set of tasks - for _ in range(max_workers): - try: - chunk = next(source_iterator) - futures.add(executor.submit(apply_to_chunk, chunk)) - except StopIteration: - break - - while futures: - # Wait for the first available result - done, futures = wait(futures, return_when=FIRST_COMPLETED) - - for completed_future in done: - yield completed_future.result() - # Refill the pool with a new task - try: - chunk = next(source_iterator) - futures.add(executor.submit(apply_to_chunk, chunk)) - except StopIteration: - continue - - gen = ordered_generator() if ordered else unordered_generator() - return Pipeline._from_chunks(gen, self.chunk_size) + yield from gen_func(executor) + + def ordered_generator( + executor: ThreadPoolExecutor, + ) -> Generator[list[PipelineItem[U]], None, None]: + futures = deque(executor.submit(apply_to_chunk, chunk) for chunk in self.generator) + while futures: + yield futures.popleft().result() + + def unordered_generator( + executor: ThreadPoolExecutor, + ) -> Generator[list[PipelineItem[U]], None, None]: + futures = {executor.submit(apply_to_chunk, chunk) for chunk in self.generator} + while futures: + done, futures = wait(futures, return_when=FIRST_COMPLETED) + for f in done: + yield f.result() + + def gen(executor): + if ordered: + return ordered_generator(executor) + else: + return unordered_generator(executor) + + return Pipeline._from_chunks(process_in_pool(gen), self.chunk_size) + + # All other methods like tap, noop, etc. can be adapted similarly + # to handle the (value, error) tuple structure. + def noop(self) -> None: + """Consume the pipeline, raising a CompoundError if any exceptions were caught.""" + errors = [error for chunk in self.generator for _, error in chunk if error] + if errors: + raise CompoundError(errors) @classmethod def chain(cls, *pipelines: "Pipeline[T]") -> "Pipeline[T]": - """ - Chain multiple pipelines together sequentially. 
- - Args: - *pipelines: Variable number of Pipeline instances to chain - - Returns: - A new Pipeline that processes all input pipelines in sequence - """ + """Chain multiple pipelines together sequentially.""" def chain_generator(): for pipeline in pipelines: yield from pipeline.generator - # chain preserves chunk structure exactly (just concatenates generators) - # Use chunk_size from the first pipeline, or default if no pipelines chunk_size = pipelines[0].chunk_size if pipelines else 1000 - new_pipeline = cls.__new__(cls) - new_pipeline.generator = chain_generator() - new_pipeline.chunk_size = chunk_size + return cls._from_chunks(chain_generator(), chunk_size) + + def tap(self, function: Callable[[T], Any]) -> Self: + """Apply a side-effect function to each element without modifying the data.""" + + def tap_generator() -> Generator[list[PipelineItem[T]], None, None]: + for chunk in self.generator: + for value, error in chunk: + # Apply tap only to non-errored items + if not error: + try: + function(value) + except Exception as e: + # If the tap function itself fails, we should not halt the pipeline. + # For now, we print a warning. A more advanced logger could be used here. + print(f"Warning: Exception in tap function ignored: {e}") + yield chunk # Pass the original chunk through unchanged + + return Pipeline._from_chunks(tap_generator(), self.chunk_size) + + @classmethod + def from_pipeline(cls, pipeline: "Pipeline[T]") -> "Pipeline[T]": + """Create a new Pipeline from another Pipeline, preserving its state.""" + new_pipeline = cls([], chunk_size=pipeline.chunk_size) + new_pipeline.generator = pipeline.generator return new_pipeline + + def passthrough(self) -> Self: + """Return the pipeline unchanged (identity operation).""" + return self diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index 261d971..db91e9b 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -7,6 +7,7 @@ import pytest +from efemel.pipeline import CompoundError from efemel.pipeline import Pipeline @@ -459,7 +460,20 @@ def test_flatten_maintains_chunk_size(self): result = [] for chunk in chunks: result.extend(chunk) - assert result == [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + + # This is showing the internal representation of the data wheresecond value of the Tuple is the exception + assert result == [ + (1, None), + (2, None), + (3, None), + (4, None), + (5, None), + (6, None), + (7, None), + (8, None), + (9, None), + (10, None), + ] class TestPipelineEdgeCases: @@ -720,3 +734,206 @@ def dummy_consumer(p: Pipeline[int]): # Check if all numbers are consumed assert items == [0, 1, 2, 3, 4] + + +class TestPipelineCatch: + """Test Pipeline catch functionality and CompoundError handling.""" + + def test_catch_basic_error_handling(self): + """Test basic catch functionality with error callback.""" + captured_errors = [] + + def error_prone_pipeline(p): + return p.map(lambda x: 10 / x if x != 0 else 1 / 0) + + def error_handler(error): + captured_errors.append(error) + + pipeline = Pipeline([2, 0, 4, 0, 6]) + + with pytest.raises(CompoundError) as exc_info: + pipeline.catch(error_prone_pipeline, error_handler).to_list() + + # Should have captured 2 ZeroDivisionErrors + assert len(captured_errors) == 2 + assert all(isinstance(e, ZeroDivisionError) for e in captured_errors) + + # CompoundError should also contain the same errors + assert len(exc_info.value.errors) == 2 + assert all(isinstance(e, ZeroDivisionError) for e in exc_info.value.errors) + + def test_catch_with_successful_items(self): + """Test catch 
with mix of successful and failed items.""" + error_logs = [] + + def selective_error_pipeline(p): + return p.map(lambda x: x * 2 if x != 3 else 1 / 0) + + def log_error(error): + error_logs.append(f"Caught: {type(error).__name__}") + + pipeline = Pipeline([1, 2, 3, 4, 5]) + + with pytest.raises(CompoundError) as exc_info: + pipeline.catch(selective_error_pipeline, log_error).to_list() + + # Should have logged one error + assert len(error_logs) == 1 + assert "ZeroDivisionError" in error_logs[0] + + # CompoundError should contain one error + assert len(exc_info.value.errors) == 1 + + def test_catch_with_reduce_operation(self): + """Test catch with reduce operation that has errors.""" + errors_seen = [] + + def error_prone_reduce_pipeline(p: Pipeline[int]): + # First map some values to cause errors, then reduce + return p.map(lambda x: x if x != 2 else 1 / 0).reduce(lambda acc, x: acc + x, 0) + + def collect_errors(error): + errors_seen.append(error) + + pipeline = Pipeline([1, 2, 3, 4, 5]) + + with pytest.raises(CompoundError) as exc_info: + result = pipeline.catch(error_prone_reduce_pipeline, collect_errors).first() + + # Should have seen the error during catch + assert len(errors_seen) == 1 + assert isinstance(errors_seen[0], ZeroDivisionError) + + # CompoundError should also have the error + assert len(exc_info.value.errors) == 1 + + def test_catch_multiple_operation_errors(self): + """Test catch with multiple operations that can error.""" + all_errors = [] + + def multi_error_pipeline(p: Pipeline[int]): + return ( + p.map(lambda x: x / 2 if x != 4 else 1 / 0) # Error on 4 + .filter( + lambda x: x > 0 if x != 1 else 1 / 0 + ) # Error on 1 (after division = 0.5, but let's say error on result 1) + .map(lambda x: x * 3 if x != 3 else 1 / 0) + ) # Error on 3 (after division = 1.5, won't hit) + + def track_all_errors(error): + all_errors.append(type(error).__name__) + + pipeline = Pipeline([2, 4, 6, 8]) + + with pytest.raises(CompoundError) as exc_info: + pipeline.catch(multi_error_pipeline, track_all_errors).to_list() + + # Should have tracked at least one error + assert len(all_errors) >= 1 + assert len(exc_info.value.errors) >= 1 + + def test_catch_with_no_errors(self): + """Test catch when no errors occur.""" + error_handler_called = [] + + def safe_pipeline(p): + return p.map(lambda x: x * 2).filter(lambda x: x > 5) + + def should_not_be_called(error): + error_handler_called.append(error) + + pipeline = Pipeline([1, 2, 3, 4, 5]) + result = pipeline.catch(safe_pipeline, should_not_be_called).to_list() + + # No errors, so handler should not be called + assert len(error_handler_called) == 0 + # Should get successful results + assert result == [6, 8, 10] + + def test_compound_error_formatting(self): + """Test CompoundError message formatting.""" + + def always_error(x): + if x == 1: + raise ValueError("First error") + elif x == 2: + raise TypeError("Second error") + else: + raise RuntimeError("Third error") + + pipeline = Pipeline([1, 2, 3]) + + with pytest.raises(CompoundError) as exc_info: + pipeline.map(always_error).to_list() + + error_message = str(exc_info.value) + + # Check message contains error count + assert "3 error(s) occurred in the pipeline" in error_message + + # Check individual error messages are included + assert "ValueError: First error" in error_message + assert "TypeError: Second error" in error_message + assert "RuntimeError: Third error" in error_message + + def test_compound_error_with_single_error(self): + """Test CompoundError with just one error.""" + + def 
single_error(x): + if x == 3: + raise ValueError("Only error") + return x * 2 + + pipeline = Pipeline([1, 2, 3, 4]) + + with pytest.raises(CompoundError) as exc_info: + pipeline.map(single_error).to_list() + + # Should have exactly one error + assert len(exc_info.value.errors) == 1 + assert isinstance(exc_info.value.errors[0], ValueError) + + # Message should reflect single error + error_message = str(exc_info.value) + assert "1 error(s) occurred in the pipeline" in error_message + assert "ValueError: Only error" in error_message + + def test_catch_with_chained_operations(self): + """Test catch with complex chained operations.""" + logged_errors = [] + + def complex_pipeline(p): + return ( + p.filter(lambda x: x > 0 if x != 2 else 1 / 0) # Error on 2 + .map(lambda x: x * 2 if x != 4 else 1 / 0) # Error on 4 + .reduce(lambda acc, x: acc + x, 0) + ) + + def error_logger(error): + logged_errors.append(f"Error: {error}") + + pipeline = Pipeline([1, 2, 3, 4, 5]) + + with pytest.raises(CompoundError) as exc_info: + pipeline.catch(complex_pipeline, error_logger).first() + + # Should have logged errors from the operations + assert len(logged_errors) >= 1 + # CompoundError should also contain the errors + assert len(exc_info.value.errors) >= 1 + + def test_catch_error_handler_exception(self): + """Test catch when error handler itself raises exception.""" + + def error_pipeline(p): + return p.map(lambda x: 1 / 0 if x == 2 else x) + + def failing_handler(error): + raise RuntimeError("Handler failed!") + + pipeline = Pipeline([1, 2, 3]) + + # Should still work despite handler failure + with pytest.raises(CompoundError): + # The failing handler should not stop the pipeline + pipeline.catch(error_pipeline, failing_handler).to_list() From f0743bab93f24b79347d7c7d88198362639d20e2 Mon Sep 17 00:00:00 2001 From: ringoldsdev Date: Fri, 11 Jul 2025 20:52:55 +0000 Subject: [PATCH 02/17] feat: implemented global context --- efemel/pipeline.py | 98 ++++++++++++++++++++++++++---------------- tests/test_pipeline.py | 21 ++++----- 2 files changed, 72 insertions(+), 47 deletions(-) diff --git a/efemel/pipeline.py b/efemel/pipeline.py index d88f5fa..961f98e 100644 --- a/efemel/pipeline.py +++ b/efemel/pipeline.py @@ -14,6 +14,7 @@ from concurrent.futures import ThreadPoolExecutor from concurrent.futures import wait from typing import Any +from typing import NamedTuple from typing import Self from typing import TypeVar from typing import Union @@ -24,8 +25,14 @@ # Type variable for the transformed value part U = TypeVar("U") + # Define a type for the internal data structure: a tuple of (value, error) -PipelineItem = tuple[T, Exception | None] +class PipelineItem(NamedTuple): + """Represents an item in the pipeline.""" + + value: T + error: Exception | None + context: dict[str, Any] class CompoundError(Exception): @@ -60,9 +67,11 @@ class Pipeline[T]: Attributes: generator: A generator yielding chunks of pipeline items. chunk_size: The number of elements to process in each chunk. + context: A dictionary for global pipeline state. 
""" generator: Generator[list[PipelineItem[T]], None, None] + context: dict[str, Any] def __init__(self, source: Union[Iterable[T], "Pipeline[T]"], chunk_size: int = 1000) -> None: """ @@ -75,16 +84,17 @@ def __init__(self, source: Union[Iterable[T], "Pipeline[T]"], chunk_size: int = if isinstance(source, Pipeline): self.generator = source.generator self.chunk_size = source.chunk_size + self.context = source.context else: + self.context = {"has_errors": False} self.generator = self._chunked_and_wrap(source, chunk_size) self.chunk_size = chunk_size - @staticmethod - def _chunked_and_wrap(iterable: Iterable[T], size: int) -> Generator[list[PipelineItem[T]], None, None]: + def _chunked_and_wrap(self, iterable: Iterable[T], size: int) -> Generator[list[PipelineItem[T]], None, None]: """Break an iterable into chunks of specified size and wrap items.""" chunk: list[PipelineItem[T]] = [] for item in iterable: - chunk.append((item, None)) # Wrap each item in a (value, error) tuple + chunk.append(PipelineItem(item, None, self.context)) # Wrap each item if len(chunk) == size: yield chunk chunk = [] @@ -92,10 +102,11 @@ def _chunked_and_wrap(iterable: Iterable[T], size: int) -> Generator[list[Pipeli yield chunk @classmethod - def _from_chunks(cls, chunks: Iterable[list[PipelineItem[T]]], chunk_size: int) -> "Pipeline[T]": + def _from_chunks(cls, chunks: Iterable[list[PipelineItem[T]]], chunk_size: int, context: dict) -> "Pipeline[T]": """Create a pipeline directly from an iterable of chunks.""" p = cls([], chunk_size=chunk_size) p.generator = (chunk for chunk in chunks) + p.context = context return p def __iter__(self) -> Generator[T, None, None]: @@ -104,9 +115,10 @@ def __iter__(self) -> Generator[T, None, None]: """ errors: list[Exception] = [] for chunk in self.generator: - for value, error in chunk: + for value, error, _ in chunk: if error: errors.append(error) + self.context["has_errors"] = True else: yield value if errors: @@ -119,9 +131,10 @@ def to_list(self) -> list[T]: results: list[T] = [] errors: list[Exception] = [] for chunk in self.generator: - for value, error in chunk: + for value, error, _ in chunk: if error: errors.append(error) + self.context["has_errors"] = True else: results.append(value) if errors: @@ -144,20 +157,22 @@ def map[U](self, function: Callable[[T], U]) -> "Pipeline[U]": def map_generator() -> Generator[list[PipelineItem[U]], None, None]: for chunk in self.generator: new_chunk: list[PipelineItem[U]] = [] - for value, error in chunk: + for value, error, context in chunk: if error: # Pass through existing errors - new_chunk.append((value, error)) + new_chunk.append(PipelineItem(value, error, context)) + self.context["has_errors"] = True else: try: # Apply the function; on success, error is None - new_chunk.append((function(value), None)) + new_chunk.append(PipelineItem(function(value), None, context)) except Exception as e: # On failure, capture the exception - new_chunk.append((value, e)) + new_chunk.append(PipelineItem(value, e, context)) + self.context["has_errors"] = True yield new_chunk - return Pipeline._from_chunks(map_generator(), self.chunk_size) + return Pipeline._from_chunks(map_generator(), self.chunk_size, self.context) def filter(self, predicate: Callable[[T], bool]) -> "Pipeline[T]": """Filter elements, capturing any exceptions from the predicate.""" @@ -165,21 +180,23 @@ def filter(self, predicate: Callable[[T], bool]) -> "Pipeline[T]": def filter_generator() -> Generator[list[PipelineItem[T]], None, None]: for chunk in self.generator: new_chunk: 
list[PipelineItem[T]] = [] - for value, error in chunk: - if error: + for item in chunk: + if item.error: # Pass through existing errors - new_chunk.append((value, error)) + new_chunk.append(item) + self.context["has_errors"] = True continue try: # Keep item if predicate is true - if predicate(value): - new_chunk.append((value, None)) + if predicate(item.value): + new_chunk.append(item) except Exception as e: # If predicate fails, capture the error - new_chunk.append((value, e)) + new_chunk.append(PipelineItem(item.value, e, item.context)) + self.context["has_errors"] = True yield new_chunk - return Pipeline._from_chunks(filter_generator(), self.chunk_size) + return Pipeline._from_chunks(filter_generator(), self.chunk_size, self.context) def reduce[U](self, function: Callable[[U, T], U], initial: U) -> "Pipeline[U]": """ @@ -191,17 +208,18 @@ def reduce_generator() -> Generator[list[PipelineItem[U]], None, None]: acc = initial error_items: list[PipelineItem[U]] = [] for chunk in self.generator: - for value, error in chunk: + for value, error, context in chunk: if error: - error_items.append((value, error)) + error_items.append(PipelineItem(value, error, context)) + self.context["has_errors"] = True else: acc = function(acc, value) # Yield all collected errors first, then the final result. if error_items: yield error_items - yield [(acc, None)] + yield [PipelineItem(acc, None, self.context)] - return Pipeline._from_chunks(reduce_generator(), self.chunk_size) + return Pipeline._from_chunks(reduce_generator(), self.chunk_size, self.context) def each(self, function: Callable[[T], Any]) -> None: """ @@ -211,9 +229,10 @@ def each(self, function: Callable[[T], Any]) -> None: """ errors: list[Exception] = [] for chunk in self.generator: - for value, error in chunk: + for value, error, _ in chunk: if error: errors.append(error) + self.context["has_errors"] = True else: function(value) if errors: @@ -245,8 +264,9 @@ def catch[U]( def error_tapping_generator() -> Generator[list[PipelineItem[U]], None, None]: for chunk in processed_pipeline.generator: - for _, error in chunk: + for _, error, _ in chunk: if error: + self.context["has_errors"] = True try: on_error(error) except Exception as e: @@ -255,7 +275,7 @@ def error_tapping_generator() -> Generator[list[PipelineItem[U]], None, None]: print(f"Error in `on_error` handler: {e}") yield chunk # Pass the original chunk through - return Pipeline._from_chunks(error_tapping_generator(), self.chunk_size) + return Pipeline._from_chunks(error_tapping_generator(), self.chunk_size, self.context) @overload def flatten(self: "Pipeline[list[U]]") -> "Pipeline[U]": ... 
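
A rough usage sketch of the shared context this patch threads through every PipelineItem (the sample data and the failing lambda are illustrative, not taken from the test suite): because each operation writes to the same context dict, a caller can check the has_errors flag on the original pipeline object after draining it.

    from efemel.pipeline import CompoundError, Pipeline

    pipeline = Pipeline([1, 0, 2])
    try:
        # 1 / 0 raises ZeroDivisionError for the second element; the error is
        # captured per item instead of aborting the whole run.
        pipeline.map(lambda x: 1 / x).to_list()
    except CompoundError as err:
        # map() recorded the failure and flipped the shared flag in place.
        assert pipeline.context["has_errors"] is True
        assert isinstance(err.errors[0], ZeroDivisionError)
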
@@ -272,20 +292,21 @@ def flatten(self: Union["Pipeline[list[U]]", "Pipeline[tuple[U, ...]]", "Pipelin def flatten_generator() -> Generator[list[PipelineItem[Any]], None, None]: current_chunk: list[PipelineItem[Any]] = [] for chunk in self.generator: - for value, error in chunk: + for value, error, context in chunk: if error: # Pass through errored items directly - current_chunk.append((value, error)) + current_chunk.append(PipelineItem(value, error, context)) + self.context["has_errors"] = True elif isinstance(value, list | tuple | set): # Flatten the value from a successful item for item in value: - current_chunk.append((item, None)) + current_chunk.append(PipelineItem(item, None, context)) if len(current_chunk) == self.chunk_size: yield current_chunk current_chunk = [] # Non-iterable, non-errored items are passed through as-is else: - current_chunk.append((value, None)) + current_chunk.append(PipelineItem(value, None, context)) if len(current_chunk) == self.chunk_size: yield current_chunk @@ -294,7 +315,7 @@ def flatten_generator() -> Generator[list[PipelineItem[Any]], None, None]: if current_chunk: yield current_chunk - return Pipeline._from_chunks(flatten_generator(), self.chunk_size) + return Pipeline._from_chunks(flatten_generator(), self.chunk_size, self.context) def concurrent( self, @@ -307,7 +328,7 @@ def concurrent( def apply_to_chunk(chunk: list[PipelineItem[T]]) -> list[PipelineItem[U]]: # Create a mini-pipeline from the single chunk of (value, error) tuples. # The chunk size is len(chunk) to ensure it's processed as one unit. - chunk_pipeline = Pipeline._from_chunks([chunk], len(chunk)) + chunk_pipeline = Pipeline._from_chunks([chunk], len(chunk), self.context) processed_pipeline = pipeline_func(chunk_pipeline) # The result is already a list of chunks of tuples, so we flatten it by one level. return [item for processed_chunk in processed_pipeline.generator for item in processed_chunk] @@ -340,14 +361,15 @@ def gen(executor): else: return unordered_generator(executor) - return Pipeline._from_chunks(process_in_pool(gen), self.chunk_size) + return Pipeline._from_chunks(process_in_pool(gen), self.chunk_size, self.context) # All other methods like tap, noop, etc. can be adapted similarly # to handle the (value, error) tuple structure. 
def noop(self) -> None: """Consume the pipeline, raising a CompoundError if any exceptions were caught.""" - errors = [error for chunk in self.generator for _, error in chunk if error] + errors = [error for chunk in self.generator for _, error, _ in chunk if error] if errors: + self.context["has_errors"] = True raise CompoundError(errors) @classmethod @@ -359,14 +381,15 @@ def chain_generator(): yield from pipeline.generator chunk_size = pipelines[0].chunk_size if pipelines else 1000 - return cls._from_chunks(chain_generator(), chunk_size) + context = pipelines[0].context if pipelines else {} + return cls._from_chunks(chain_generator(), chunk_size, context) def tap(self, function: Callable[[T], Any]) -> Self: """Apply a side-effect function to each element without modifying the data.""" def tap_generator() -> Generator[list[PipelineItem[T]], None, None]: for chunk in self.generator: - for value, error in chunk: + for value, error, _ in chunk: # Apply tap only to non-errored items if not error: try: @@ -377,13 +400,14 @@ def tap_generator() -> Generator[list[PipelineItem[T]], None, None]: print(f"Warning: Exception in tap function ignored: {e}") yield chunk # Pass the original chunk through unchanged - return Pipeline._from_chunks(tap_generator(), self.chunk_size) + return Pipeline._from_chunks(tap_generator(), self.chunk_size, self.context) @classmethod def from_pipeline(cls, pipeline: "Pipeline[T]") -> "Pipeline[T]": """Create a new Pipeline from another Pipeline, preserving its state.""" new_pipeline = cls([], chunk_size=pipeline.chunk_size) new_pipeline.generator = pipeline.generator + new_pipeline.context = pipeline.context return new_pipeline def passthrough(self) -> Self: diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index db91e9b..88f5869 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -9,6 +9,7 @@ from efemel.pipeline import CompoundError from efemel.pipeline import Pipeline +from efemel.pipeline import PipelineItem class TestPipelineBasics: @@ -463,16 +464,16 @@ def test_flatten_maintains_chunk_size(self): # This is showing the internal representation of the data wheresecond value of the Tuple is the exception assert result == [ - (1, None), - (2, None), - (3, None), - (4, None), - (5, None), - (6, None), - (7, None), - (8, None), - (9, None), - (10, None), + PipelineItem(1, None, {"has_errors": False}), + PipelineItem(2, None, {"has_errors": False}), + PipelineItem(3, None, {"has_errors": False}), + PipelineItem(4, None, {"has_errors": False}), + PipelineItem(5, None, {"has_errors": False}), + PipelineItem(6, None, {"has_errors": False}), + PipelineItem(7, None, {"has_errors": False}), + PipelineItem(8, None, {"has_errors": False}), + PipelineItem(9, None, {"has_errors": False}), + PipelineItem(10, None, {"has_errors": False}), ] From ff3d4ec2e3c775c5237630b1e40100becc2450bd Mon Sep 17 00:00:00 2001 From: ringoldsdev Date: Fri, 11 Jul 2025 21:19:55 +0000 Subject: [PATCH 03/17] chore: added context dict --- efemel/pipeline.py | 59 ++++++++++++++++++++++++++++++------------ tests/test_pipeline.py | 38 +++++++++++++++++++++++++++ 2 files changed, 80 insertions(+), 17 deletions(-) diff --git a/efemel/pipeline.py b/efemel/pipeline.py index 961f98e..f18304d 100644 --- a/efemel/pipeline.py +++ b/efemel/pipeline.py @@ -16,6 +16,7 @@ from typing import Any from typing import NamedTuple from typing import Self +from typing import TypedDict from typing import TypeVar from typing import Union from typing import overload @@ -26,13 +27,25 @@ U = TypeVar("U") 
+class PipelineContext(TypedDict): + """Global context available to all pipeline operations.""" + has_errors: bool + errors: list[dict[str, Any]] + + +class FunctionInput(TypedDict): + """Input passed to user functions containing value and context.""" + value: Any + context: PipelineContext + + # Define a type for the internal data structure: a tuple of (value, error) class PipelineItem(NamedTuple): """Represents an item in the pipeline.""" value: T error: Exception | None - context: dict[str, Any] + context: PipelineContext class CompoundError(Exception): @@ -71,7 +84,7 @@ class Pipeline[T]: """ generator: Generator[list[PipelineItem[T]], None, None] - context: dict[str, Any] + context: PipelineContext def __init__(self, source: Union[Iterable[T], "Pipeline[T]"], chunk_size: int = 1000) -> None: """ @@ -86,7 +99,7 @@ def __init__(self, source: Union[Iterable[T], "Pipeline[T]"], chunk_size: int = self.chunk_size = source.chunk_size self.context = source.context else: - self.context = {"has_errors": False} + self.context = PipelineContext(has_errors=False, errors=[]) self.generator = self._chunked_and_wrap(source, chunk_size) self.chunk_size = chunk_size @@ -102,7 +115,9 @@ def _chunked_and_wrap(self, iterable: Iterable[T], size: int) -> Generator[list[ yield chunk @classmethod - def _from_chunks(cls, chunks: Iterable[list[PipelineItem[T]]], chunk_size: int, context: dict) -> "Pipeline[T]": + def _from_chunks( + cls, chunks: Iterable[list[PipelineItem[T]]], chunk_size: int, context: PipelineContext + ) -> "Pipeline[T]": """Create a pipeline directly from an iterable of chunks.""" p = cls([], chunk_size=chunk_size) p.generator = (chunk for chunk in chunks) @@ -151,7 +166,7 @@ def first(self) -> T: raise StopIteration("Pipeline is empty") return items[0] - def map[U](self, function: Callable[[T], U]) -> "Pipeline[U]": + def map[U](self, function: Callable[[FunctionInput], U]) -> "Pipeline[U]": """Transform elements, capturing any exceptions that occur.""" def map_generator() -> Generator[list[PipelineItem[U]], None, None]: @@ -164,8 +179,9 @@ def map_generator() -> Generator[list[PipelineItem[U]], None, None]: self.context["has_errors"] = True else: try: - # Apply the function; on success, error is None - new_chunk.append(PipelineItem(function(value), None, context)) + # Apply the function with value and context; on success, error is None + input_dict = FunctionInput(value=value, context=self.context) + new_chunk.append(PipelineItem(function(input_dict), None, context)) except Exception as e: # On failure, capture the exception new_chunk.append(PipelineItem(value, e, context)) @@ -174,7 +190,7 @@ def map_generator() -> Generator[list[PipelineItem[U]], None, None]: return Pipeline._from_chunks(map_generator(), self.chunk_size, self.context) - def filter(self, predicate: Callable[[T], bool]) -> "Pipeline[T]": + def filter(self, predicate: Callable[[FunctionInput], bool]) -> "Pipeline[T]": """Filter elements, capturing any exceptions from the predicate.""" def filter_generator() -> Generator[list[PipelineItem[T]], None, None]: @@ -188,7 +204,8 @@ def filter_generator() -> Generator[list[PipelineItem[T]], None, None]: continue try: # Keep item if predicate is true - if predicate(item.value): + input_dict = FunctionInput(value=item.value, context=self.context) + if predicate(input_dict): new_chunk.append(item) except Exception as e: # If predicate fails, capture the error @@ -198,7 +215,7 @@ def filter_generator() -> Generator[list[PipelineItem[T]], None, None]: return 
Pipeline._from_chunks(filter_generator(), self.chunk_size, self.context) - def reduce[U](self, function: Callable[[U, T], U], initial: U) -> "Pipeline[U]": + def reduce[U](self, function: Callable[[U, FunctionInput], U], initial: U) -> "Pipeline[U]": """ Reduce elements to a single value (intermediate operation). Errored items are skipped in the reduction and passed through. @@ -213,7 +230,8 @@ def reduce_generator() -> Generator[list[PipelineItem[U]], None, None]: error_items.append(PipelineItem(value, error, context)) self.context["has_errors"] = True else: - acc = function(acc, value) + input_dict = FunctionInput(value=value, context=self.context) + acc = function(acc, input_dict) # Yield all collected errors first, then the final result. if error_items: yield error_items @@ -221,7 +239,7 @@ def reduce_generator() -> Generator[list[PipelineItem[U]], None, None]: return Pipeline._from_chunks(reduce_generator(), self.chunk_size, self.context) - def each(self, function: Callable[[T], Any]) -> None: + def each(self, function: Callable[[FunctionInput], Any]) -> None: """ Apply a function to each element (terminal operation). @@ -234,7 +252,8 @@ def each(self, function: Callable[[T], Any]) -> None: errors.append(error) self.context["has_errors"] = True else: - function(value) + input_dict = FunctionInput(value=value, context=self.context) + function(input_dict) if errors: raise CompoundError(errors) @@ -260,13 +279,18 @@ def catch[U]( Returns: A new pipeline that continues with the (potentially errored) items. """ + # Initialize errors list in context if not present + if "errors" not in self.context: + self.context["errors"] = [] + processed_pipeline = pipeline_func(self) def error_tapping_generator() -> Generator[list[PipelineItem[U]], None, None]: for chunk in processed_pipeline.generator: - for _, error, _ in chunk: + for value, error, _ in chunk: if error: self.context["has_errors"] = True + self.context["errors"].append({"value": value, "error": error}) try: on_error(error) except Exception as e: @@ -381,10 +405,10 @@ def chain_generator(): yield from pipeline.generator chunk_size = pipelines[0].chunk_size if pipelines else 1000 - context = pipelines[0].context if pipelines else {} + context = pipelines[0].context if pipelines else PipelineContext(has_errors=False, errors=[]) return cls._from_chunks(chain_generator(), chunk_size, context) - def tap(self, function: Callable[[T], Any]) -> Self: + def tap(self, function: Callable[[FunctionInput], Any]) -> Self: """Apply a side-effect function to each element without modifying the data.""" def tap_generator() -> Generator[list[PipelineItem[T]], None, None]: @@ -393,7 +417,8 @@ def tap_generator() -> Generator[list[PipelineItem[T]], None, None]: # Apply tap only to non-errored items if not error: try: - function(value) + input_dict = FunctionInput(value=value, context=self.context) + function(input_dict) except Exception as e: # If the tap function itself fails, we should not halt the pipeline. # For now, we print a warning. A more advanced logger could be used here. 
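
A minimal sketch of the calling convention map and filter use in this revision (the double helper and the sample values are illustrative): user callables receive a FunctionInput mapping with "value" and "context" keys instead of the bare element.

    from efemel.pipeline import Pipeline

    def double(item):
        # item is a FunctionInput dict: {"value": <element>, "context": <PipelineContext>}
        if item["context"]["has_errors"]:
            ...  # the shared context can be consulted (or updated) here
        return item["value"] * 2

    assert Pipeline([1, 2, 3]).map(double).to_list() == [2, 4, 6]
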
diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index 88f5869..cf74342 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -938,3 +938,41 @@ def failing_handler(error): with pytest.raises(CompoundError): # The failing handler should not stop the pipeline pipeline.catch(error_pipeline, failing_handler).to_list() + + +class TestPipelineContext: + """Test Pipeline global context functionality.""" + + def test_map_sets_context_value_propagated_to_all_items(self): + """Test that when map function sets a value in context once, it gets passed to all items.""" + pipeline = Pipeline([1, 2, 3, 4, 5]) + + context_access_count = 0 + + def set_context_once(value, context): + nonlocal context_access_count + context_access_count += 1 + + # Set a value in context only on the first call + if context_access_count == 1: + context["shared_value"] = "set_by_first_item" + + # All items should be able to access the shared value + shared = context.get("shared_value", "not_found") + return f"{value}_{shared}" + + result = pipeline.map(set_context_once).to_list() + + # Verify that all items got access to the shared value + expected = [ + "1_set_by_first_item", + "2_set_by_first_item", + "3_set_by_first_item", + "4_set_by_first_item", + "5_set_by_first_item", + ] + + assert result == expected + + # Verify the function was called for each item + assert context_access_count == 5 From afc459c9059856097ed3b226be967ecc1b9c7b2f Mon Sep 17 00:00:00 2001 From: ringoldsdev Date: Fri, 11 Jul 2025 21:32:29 +0000 Subject: [PATCH 04/17] chore: overloaded context --- efemel/pipeline.py | 77 ++++++++++++++++++++++++++++++------------ tests/test_pipeline.py | 20 +++++------ 2 files changed, 65 insertions(+), 32 deletions(-) diff --git a/efemel/pipeline.py b/efemel/pipeline.py index f18304d..67a08a3 100644 --- a/efemel/pipeline.py +++ b/efemel/pipeline.py @@ -13,6 +13,7 @@ from concurrent.futures import FIRST_COMPLETED from concurrent.futures import ThreadPoolExecutor from concurrent.futures import wait +import inspect from typing import Any from typing import NamedTuple from typing import Self @@ -27,18 +28,39 @@ U = TypeVar("U") +def _accepts_context(func: Callable) -> bool: + """ + Check if a function accepts context as a second parameter. + Returns True if the function takes 2+ parameters, False if it takes 1 parameter. + """ + try: + sig = inspect.signature(func) + param_count = len([p for p in sig.parameters.values() if p.kind in (p.POSITIONAL_ONLY, p.POSITIONAL_OR_KEYWORD)]) + return param_count >= 2 + except (ValueError, TypeError): + # Fall back to assuming single parameter for lambda or built-in functions + return False + + +def _call_user_function(func: Callable, value: Any, context: "PipelineContext") -> Any: + """ + Call a user function with the appropriate signature. + If the function accepts context, call with (value, context). + Otherwise, call with just (value). 
+ """ + if _accepts_context(func): + return func(value, context) + else: + return func(value) + + class PipelineContext(TypedDict): """Global context available to all pipeline operations.""" + has_errors: bool errors: list[dict[str, Any]] -class FunctionInput(TypedDict): - """Input passed to user functions containing value and context.""" - value: Any - context: PipelineContext - - # Define a type for the internal data structure: a tuple of (value, error) class PipelineItem(NamedTuple): """Represents an item in the pipeline.""" @@ -166,7 +188,7 @@ def first(self) -> T: raise StopIteration("Pipeline is empty") return items[0] - def map[U](self, function: Callable[[FunctionInput], U]) -> "Pipeline[U]": + def map[U](self, function: Callable[[T], U] | Callable[[T, PipelineContext], U]) -> "Pipeline[U]": """Transform elements, capturing any exceptions that occur.""" def map_generator() -> Generator[list[PipelineItem[U]], None, None]: @@ -179,9 +201,9 @@ def map_generator() -> Generator[list[PipelineItem[U]], None, None]: self.context["has_errors"] = True else: try: - # Apply the function with value and context; on success, error is None - input_dict = FunctionInput(value=value, context=self.context) - new_chunk.append(PipelineItem(function(input_dict), None, context)) + # Apply the function with appropriate signature + result = _call_user_function(function, value, self.context) + new_chunk.append(PipelineItem(result, None, context)) except Exception as e: # On failure, capture the exception new_chunk.append(PipelineItem(value, e, context)) @@ -190,7 +212,7 @@ def map_generator() -> Generator[list[PipelineItem[U]], None, None]: return Pipeline._from_chunks(map_generator(), self.chunk_size, self.context) - def filter(self, predicate: Callable[[FunctionInput], bool]) -> "Pipeline[T]": + def filter(self, predicate: Callable[[T], bool] | Callable[[T, PipelineContext], bool]) -> "Pipeline[T]": """Filter elements, capturing any exceptions from the predicate.""" def filter_generator() -> Generator[list[PipelineItem[T]], None, None]: @@ -204,8 +226,8 @@ def filter_generator() -> Generator[list[PipelineItem[T]], None, None]: continue try: # Keep item if predicate is true - input_dict = FunctionInput(value=item.value, context=self.context) - if predicate(input_dict): + result = _call_user_function(predicate, item.value, self.context) + if result: new_chunk.append(item) except Exception as e: # If predicate fails, capture the error @@ -215,7 +237,9 @@ def filter_generator() -> Generator[list[PipelineItem[T]], None, None]: return Pipeline._from_chunks(filter_generator(), self.chunk_size, self.context) - def reduce[U](self, function: Callable[[U, FunctionInput], U], initial: U) -> "Pipeline[U]": + def reduce[U]( + self, function: Callable[[U, T], U] | Callable[[U, T, PipelineContext], U], initial: U + ) -> "Pipeline[U]": """ Reduce elements to a single value (intermediate operation). Errored items are skipped in the reduction and passed through. 
@@ -230,8 +254,19 @@ def reduce_generator() -> Generator[list[PipelineItem[U]], None, None]: error_items.append(PipelineItem(value, error, context)) self.context["has_errors"] = True else: - input_dict = FunctionInput(value=value, context=self.context) - acc = function(acc, input_dict) + # For reduce, we need to check for 3 parameters (acc, value, context) + try: + sig = inspect.signature(function) + param_count = len( + [p for p in sig.parameters.values() if p.kind in (p.POSITIONAL_ONLY, p.POSITIONAL_OR_KEYWORD)] + ) + if param_count >= 3: + acc = function(acc, value, self.context) + else: + acc = function(acc, value) + except (ValueError, TypeError): + # Fall back to 2-parameter call for lambda or built-in functions + acc = function(acc, value) # Yield all collected errors first, then the final result. if error_items: yield error_items @@ -239,7 +274,7 @@ def reduce_generator() -> Generator[list[PipelineItem[U]], None, None]: return Pipeline._from_chunks(reduce_generator(), self.chunk_size, self.context) - def each(self, function: Callable[[FunctionInput], Any]) -> None: + def each(self, function: Callable[[T], Any] | Callable[[T, PipelineContext], Any]) -> None: """ Apply a function to each element (terminal operation). @@ -252,8 +287,7 @@ def each(self, function: Callable[[FunctionInput], Any]) -> None: errors.append(error) self.context["has_errors"] = True else: - input_dict = FunctionInput(value=value, context=self.context) - function(input_dict) + _call_user_function(function, value, self.context) if errors: raise CompoundError(errors) @@ -408,7 +442,7 @@ def chain_generator(): context = pipelines[0].context if pipelines else PipelineContext(has_errors=False, errors=[]) return cls._from_chunks(chain_generator(), chunk_size, context) - def tap(self, function: Callable[[FunctionInput], Any]) -> Self: + def tap(self, function: Callable[[T], Any] | Callable[[T, PipelineContext], Any]) -> Self: """Apply a side-effect function to each element without modifying the data.""" def tap_generator() -> Generator[list[PipelineItem[T]], None, None]: @@ -417,8 +451,7 @@ def tap_generator() -> Generator[list[PipelineItem[T]], None, None]: # Apply tap only to non-errored items if not error: try: - input_dict = FunctionInput(value=value, context=self.context) - function(input_dict) + _call_user_function(function, value, self.context) except Exception as e: # If the tap function itself fails, we should not halt the pipeline. # For now, we print a warning. A more advanced logger could be used here. 
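
An illustrative sketch of the dual signatures this change supports (double and tag are example helpers, not part of the patch): map, filter, each and tap now inspect the callable and pass the shared context only when a second parameter is declared.

    from efemel.pipeline import Pipeline

    def double(x):
        # One-parameter form: receives only the value.
        return x * 2

    def tag(x, context):
        # Two-parameter form: also receives the shared PipelineContext.
        suffix = "err" if context["has_errors"] else "ok"
        return f"{x}-{suffix}"

    result = Pipeline([1, 2, 3]).map(double).map(tag).to_list()
    assert result == ["2-ok", "4-ok", "6-ok"]
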
diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index cf74342..6d318a8 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -464,16 +464,16 @@ def test_flatten_maintains_chunk_size(self): # This is showing the internal representation of the data wheresecond value of the Tuple is the exception assert result == [ - PipelineItem(1, None, {"has_errors": False}), - PipelineItem(2, None, {"has_errors": False}), - PipelineItem(3, None, {"has_errors": False}), - PipelineItem(4, None, {"has_errors": False}), - PipelineItem(5, None, {"has_errors": False}), - PipelineItem(6, None, {"has_errors": False}), - PipelineItem(7, None, {"has_errors": False}), - PipelineItem(8, None, {"has_errors": False}), - PipelineItem(9, None, {"has_errors": False}), - PipelineItem(10, None, {"has_errors": False}), + PipelineItem(1, None, {"has_errors": False, "errors": []}), + PipelineItem(2, None, {"has_errors": False, "errors": []}), + PipelineItem(3, None, {"has_errors": False, "errors": []}), + PipelineItem(4, None, {"has_errors": False, "errors": []}), + PipelineItem(5, None, {"has_errors": False, "errors": []}), + PipelineItem(6, None, {"has_errors": False, "errors": []}), + PipelineItem(7, None, {"has_errors": False, "errors": []}), + PipelineItem(8, None, {"has_errors": False, "errors": []}), + PipelineItem(9, None, {"has_errors": False, "errors": []}), + PipelineItem(10, None, {"has_errors": False, "errors": []}), ] From 60aa252f82b6f88a6451a24bcd65d0d91f6cd360 Mon Sep 17 00:00:00 2001 From: ringoldsdev Date: Fri, 11 Jul 2025 21:36:54 +0000 Subject: [PATCH 05/17] fix: check function param count once --- efemel/pipeline.py | 83 +++++++++++++++++++++++++++++----------------- 1 file changed, 52 insertions(+), 31 deletions(-) diff --git a/efemel/pipeline.py b/efemel/pipeline.py index 67a08a3..843a0f8 100644 --- a/efemel/pipeline.py +++ b/efemel/pipeline.py @@ -28,30 +28,17 @@ U = TypeVar("U") -def _accepts_context(func: Callable) -> bool: +def _get_function_arity(func: Callable) -> int: """ - Check if a function accepts context as a second parameter. - Returns True if the function takes 2+ parameters, False if it takes 1 parameter. + Determine how many parameters a function accepts. + Returns the number of positional parameters the function expects. """ try: sig = inspect.signature(func) - param_count = len([p for p in sig.parameters.values() if p.kind in (p.POSITIONAL_ONLY, p.POSITIONAL_OR_KEYWORD)]) - return param_count >= 2 + return len([p for p in sig.parameters.values() if p.kind in (p.POSITIONAL_ONLY, p.POSITIONAL_OR_KEYWORD)]) except (ValueError, TypeError): # Fall back to assuming single parameter for lambda or built-in functions - return False - - -def _call_user_function(func: Callable, value: Any, context: "PipelineContext") -> Any: - """ - Call a user function with the appropriate signature. - If the function accepts context, call with (value, context). - Otherwise, call with just (value). 
- """ - if _accepts_context(func): - return func(value, context) - else: - return func(value) + return 1 class PipelineContext(TypedDict): @@ -191,6 +178,10 @@ def first(self) -> T: def map[U](self, function: Callable[[T], U] | Callable[[T, PipelineContext], U]) -> "Pipeline[U]": """Transform elements, capturing any exceptions that occur.""" + # Check function arity once upfront for efficiency + func_arity = _get_function_arity(function) + accepts_context = func_arity >= 2 + def map_generator() -> Generator[list[PipelineItem[U]], None, None]: for chunk in self.generator: new_chunk: list[PipelineItem[U]] = [] @@ -202,7 +193,10 @@ def map_generator() -> Generator[list[PipelineItem[U]], None, None]: else: try: # Apply the function with appropriate signature - result = _call_user_function(function, value, self.context) + if accepts_context: + result = function(value, self.context) + else: + result = function(value) new_chunk.append(PipelineItem(result, None, context)) except Exception as e: # On failure, capture the exception @@ -215,6 +209,10 @@ def map_generator() -> Generator[list[PipelineItem[U]], None, None]: def filter(self, predicate: Callable[[T], bool] | Callable[[T, PipelineContext], bool]) -> "Pipeline[T]": """Filter elements, capturing any exceptions from the predicate.""" + # Check function arity once upfront for efficiency + func_arity = _get_function_arity(predicate) + accepts_context = func_arity >= 2 + def filter_generator() -> Generator[list[PipelineItem[T]], None, None]: for chunk in self.generator: new_chunk: list[PipelineItem[T]] = [] @@ -226,7 +224,10 @@ def filter_generator() -> Generator[list[PipelineItem[T]], None, None]: continue try: # Keep item if predicate is true - result = _call_user_function(predicate, item.value, self.context) + if accepts_context: + result = predicate(item.value, self.context) + else: + result = predicate(item.value) if result: new_chunk.append(item) except Exception as e: @@ -248,25 +249,26 @@ def reduce[U]( def reduce_generator() -> Generator[list[PipelineItem[U]], None, None]: acc = initial error_items: list[PipelineItem[U]] = [] + + # Check function arity once upfront for efficiency + func_arity = _get_function_arity(function) + accepts_context = func_arity >= 3 # For reduce: acc, value, context + for chunk in self.generator: for value, error, context in chunk: if error: error_items.append(PipelineItem(value, error, context)) self.context["has_errors"] = True else: - # For reduce, we need to check for 3 parameters (acc, value, context) try: - sig = inspect.signature(function) - param_count = len( - [p for p in sig.parameters.values() if p.kind in (p.POSITIONAL_ONLY, p.POSITIONAL_OR_KEYWORD)] - ) - if param_count >= 3: + if accepts_context: acc = function(acc, value, self.context) else: acc = function(acc, value) - except (ValueError, TypeError): - # Fall back to 2-parameter call for lambda or built-in functions - acc = function(acc, value) + except Exception as e: + # If reduce function fails, we still need to handle it + error_items.append(PipelineItem(value, e, context)) + self.context["has_errors"] = True # Yield all collected errors first, then the final result. if error_items: yield error_items @@ -280,6 +282,11 @@ def each(self, function: Callable[[T], Any] | Callable[[T, PipelineContext], Any Raises a CompoundError at the end if any exceptions were caught. 
""" + + # Check function arity once upfront for efficiency + func_arity = _get_function_arity(function) + accepts_context = func_arity >= 2 + errors: list[Exception] = [] for chunk in self.generator: for value, error, _ in chunk: @@ -287,7 +294,14 @@ def each(self, function: Callable[[T], Any] | Callable[[T, PipelineContext], Any errors.append(error) self.context["has_errors"] = True else: - _call_user_function(function, value, self.context) + try: + if accepts_context: + function(value, self.context) + else: + function(value) + except Exception as e: + errors.append(e) + self.context["has_errors"] = True if errors: raise CompoundError(errors) @@ -445,13 +459,20 @@ def chain_generator(): def tap(self, function: Callable[[T], Any] | Callable[[T, PipelineContext], Any]) -> Self: """Apply a side-effect function to each element without modifying the data.""" + # Check function arity once upfront for efficiency + func_arity = _get_function_arity(function) + accepts_context = func_arity >= 2 + def tap_generator() -> Generator[list[PipelineItem[T]], None, None]: for chunk in self.generator: for value, error, _ in chunk: # Apply tap only to non-errored items if not error: try: - _call_user_function(function, value, self.context) + if accepts_context: + function(value, self.context) + else: + function(value) except Exception as e: # If the tap function itself fails, we should not halt the pipeline. # For now, we print a warning. A more advanced logger could be used here. From cbf5edd792d9a02e05ba62203855a2e839201976 Mon Sep 17 00:00:00 2001 From: ringoldsdev Date: Fri, 11 Jul 2025 21:40:48 +0000 Subject: [PATCH 06/17] fix: more standardisation --- efemel/pipeline.py | 111 ++++++++++++++++++++++++++++----------------- 1 file changed, 69 insertions(+), 42 deletions(-) diff --git a/efemel/pipeline.py b/efemel/pipeline.py index 843a0f8..4c41aad 100644 --- a/efemel/pipeline.py +++ b/efemel/pipeline.py @@ -28,17 +28,64 @@ U = TypeVar("U") -def _get_function_arity(func: Callable) -> int: +def _create_context_aware_function(func: Callable, context: "PipelineContext") -> Callable[[Any], Any]: """ - Determine how many parameters a function accepts. - Returns the number of positional parameters the function expects. + Create a standardized function that always takes just a value parameter. + + This builder analyzes the user function once and returns a wrapper that: + - Calls func(value) if the original function takes 1 parameter + - Calls func(value, context) if the original function takes 2+ parameters + + Args: + func: The user-provided function + context: The pipeline context to pass if needed + + Returns: + A function that takes only a value parameter """ try: sig = inspect.signature(func) - return len([p for p in sig.parameters.values() if p.kind in (p.POSITIONAL_ONLY, p.POSITIONAL_OR_KEYWORD)]) + param_count = len([p for p in sig.parameters.values() if p.kind in (p.POSITIONAL_ONLY, p.POSITIONAL_OR_KEYWORD)]) + + if param_count >= 2: + # Function accepts context + return lambda value: func(value, context) + else: + # Function takes only value + return lambda value: func(value) + except (ValueError, TypeError): + # Fall back to single parameter for lambda or built-in functions + return lambda value: func(value) + + +def _create_reduce_function(func: Callable, context: "PipelineContext") -> Callable[[Any, Any], Any]: + """ + Create a standardized reduce function that always takes (accumulator, value) parameters. 
+ + This builder analyzes the user function once and returns a wrapper that: + - Calls func(acc, value) if the original function takes 2 parameters + - Calls func(acc, value, context) if the original function takes 3+ parameters + + Args: + func: The user-provided reduce function + context: The pipeline context to pass if needed + + Returns: + A function that takes (accumulator, value) parameters + """ + try: + sig = inspect.signature(func) + param_count = len([p for p in sig.parameters.values() if p.kind in (p.POSITIONAL_ONLY, p.POSITIONAL_OR_KEYWORD)]) + + if param_count >= 3: + # Function accepts context as third parameter + return lambda acc, value: func(acc, value, context) + else: + # Function takes only (acc, value) + return lambda acc, value: func(acc, value) except (ValueError, TypeError): - # Fall back to assuming single parameter for lambda or built-in functions - return 1 + # Fall back to 2-parameter call for lambda or built-in functions + return lambda acc, value: func(acc, value) class PipelineContext(TypedDict): @@ -178,9 +225,8 @@ def first(self) -> T: def map[U](self, function: Callable[[T], U] | Callable[[T, PipelineContext], U]) -> "Pipeline[U]": """Transform elements, capturing any exceptions that occur.""" - # Check function arity once upfront for efficiency - func_arity = _get_function_arity(function) - accepts_context = func_arity >= 2 + # Create a standardized function that always takes just value + std_function = _create_context_aware_function(function, self.context) def map_generator() -> Generator[list[PipelineItem[U]], None, None]: for chunk in self.generator: @@ -192,11 +238,8 @@ def map_generator() -> Generator[list[PipelineItem[U]], None, None]: self.context["has_errors"] = True else: try: - # Apply the function with appropriate signature - if accepts_context: - result = function(value, self.context) - else: - result = function(value) + # Apply the standardized function + result = std_function(value) new_chunk.append(PipelineItem(result, None, context)) except Exception as e: # On failure, capture the exception @@ -209,9 +252,8 @@ def map_generator() -> Generator[list[PipelineItem[U]], None, None]: def filter(self, predicate: Callable[[T], bool] | Callable[[T, PipelineContext], bool]) -> "Pipeline[T]": """Filter elements, capturing any exceptions from the predicate.""" - # Check function arity once upfront for efficiency - func_arity = _get_function_arity(predicate) - accepts_context = func_arity >= 2 + # Create a standardized predicate that always takes just value + std_predicate = _create_context_aware_function(predicate, self.context) def filter_generator() -> Generator[list[PipelineItem[T]], None, None]: for chunk in self.generator: @@ -224,10 +266,7 @@ def filter_generator() -> Generator[list[PipelineItem[T]], None, None]: continue try: # Keep item if predicate is true - if accepts_context: - result = predicate(item.value, self.context) - else: - result = predicate(item.value) + result = std_predicate(item.value) if result: new_chunk.append(item) except Exception as e: @@ -250,9 +289,8 @@ def reduce_generator() -> Generator[list[PipelineItem[U]], None, None]: acc = initial error_items: list[PipelineItem[U]] = [] - # Check function arity once upfront for efficiency - func_arity = _get_function_arity(function) - accepts_context = func_arity >= 3 # For reduce: acc, value, context + # Create a standardized reduce function that always takes (acc, value) + std_function = _create_reduce_function(function, self.context) for chunk in self.generator: for value, 
error, context in chunk: @@ -261,10 +299,7 @@ def reduce_generator() -> Generator[list[PipelineItem[U]], None, None]: self.context["has_errors"] = True else: try: - if accepts_context: - acc = function(acc, value, self.context) - else: - acc = function(acc, value) + acc = std_function(acc, value) except Exception as e: # If reduce function fails, we still need to handle it error_items.append(PipelineItem(value, e, context)) @@ -283,9 +318,8 @@ def each(self, function: Callable[[T], Any] | Callable[[T, PipelineContext], Any Raises a CompoundError at the end if any exceptions were caught. """ - # Check function arity once upfront for efficiency - func_arity = _get_function_arity(function) - accepts_context = func_arity >= 2 + # Create a standardized function that always takes just value + std_function = _create_context_aware_function(function, self.context) errors: list[Exception] = [] for chunk in self.generator: @@ -295,10 +329,7 @@ def each(self, function: Callable[[T], Any] | Callable[[T, PipelineContext], Any self.context["has_errors"] = True else: try: - if accepts_context: - function(value, self.context) - else: - function(value) + std_function(value) except Exception as e: errors.append(e) self.context["has_errors"] = True @@ -459,9 +490,8 @@ def chain_generator(): def tap(self, function: Callable[[T], Any] | Callable[[T, PipelineContext], Any]) -> Self: """Apply a side-effect function to each element without modifying the data.""" - # Check function arity once upfront for efficiency - func_arity = _get_function_arity(function) - accepts_context = func_arity >= 2 + # Create a standardized function that always takes just value + std_function = _create_context_aware_function(function, self.context) def tap_generator() -> Generator[list[PipelineItem[T]], None, None]: for chunk in self.generator: @@ -469,10 +499,7 @@ def tap_generator() -> Generator[list[PipelineItem[T]], None, None]: # Apply tap only to non-errored items if not error: try: - if accepts_context: - function(value, self.context) - else: - function(value) + std_function(value) except Exception as e: # If the tap function itself fails, we should not halt the pipeline. # For now, we print a warning. A more advanced logger could be used here. From dcbb270c2f77fed3086e0db862b029e0a96d2315 Mon Sep 17 00:00:00 2001 From: ringoldsdev Date: Fri, 11 Jul 2025 21:51:31 +0000 Subject: [PATCH 07/17] fix: context is now thread safe --- efemel/pipeline.py | 35 +++++++++++++++---- test_concurrent_context.py | 71 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+), 6 deletions(-) create mode 100644 test_concurrent_context.py diff --git a/efemel/pipeline.py b/efemel/pipeline.py index 4c41aad..8da2c4a 100644 --- a/efemel/pipeline.py +++ b/efemel/pipeline.py @@ -14,6 +14,7 @@ from concurrent.futures import ThreadPoolExecutor from concurrent.futures import wait import inspect +import threading from typing import Any from typing import NamedTuple from typing import Self @@ -428,13 +429,28 @@ def concurrent( ) -> "Pipeline[U]": """Applies a pipeline function to each chunk in parallel.""" + # Thread-safe context merging + context_lock = threading.Lock() + merged_errors: list[dict[str, Any]] = [] + def apply_to_chunk(chunk: list[PipelineItem[T]]) -> list[PipelineItem[U]]: - # Create a mini-pipeline from the single chunk of (value, error) tuples. - # The chunk size is len(chunk) to ensure it's processed as one unit. 
- chunk_pipeline = Pipeline._from_chunks([chunk], len(chunk), self.context) + # Create an isolated context for this thread to avoid race conditions + isolated_context = PipelineContext(has_errors=False, errors=[]) + + # Create a mini-pipeline from the single chunk with isolated context + chunk_pipeline = Pipeline._from_chunks([chunk], len(chunk), isolated_context) processed_pipeline = pipeline_func(chunk_pipeline) - # The result is already a list of chunks of tuples, so we flatten it by one level. - return [item for processed_chunk in processed_pipeline.generator for item in processed_chunk] + + # Collect results and merge context thread-safely + result = [item for processed_chunk in processed_pipeline.generator for item in processed_chunk] + + # Thread-safe context merging + if isolated_context["has_errors"] or isolated_context["errors"]: + with context_lock: + self.context["has_errors"] = True + merged_errors.extend(isolated_context["errors"]) + + return result def process_in_pool( gen_func: Callable[[], Generator[list[PipelineItem[U]], None, None]], @@ -464,7 +480,14 @@ def gen(executor): else: return unordered_generator(executor) - return Pipeline._from_chunks(process_in_pool(gen), self.chunk_size, self.context) + # Process all chunks and collect results + result_pipeline = Pipeline._from_chunks(process_in_pool(gen), self.chunk_size, self.context) + + # Final context merge after all threads complete + with context_lock: + self.context["errors"].extend(merged_errors) + + return result_pipeline # All other methods like tap, noop, etc. can be adapted similarly # to handle the (value, error) tuple structure. diff --git a/test_concurrent_context.py b/test_concurrent_context.py new file mode 100644 index 0000000..db928e6 --- /dev/null +++ b/test_concurrent_context.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python3 +""" +Test script to verify that the concurrent method handles context properly +and doesn't have race conditions. 
+""" + +from efemel.pipeline import Pipeline + + +def test_concurrent_context_isolation(): + """Test that concurrent processing properly isolates and merges contexts.""" + + # Create a pipeline with some data + data = list(range(100)) # Large enough to ensure multiple chunks + pipeline = Pipeline(data, chunk_size=10) # Force multiple chunks + + def error_prone_pipeline(p): + """A pipeline that will cause errors on certain values.""" + return p.map(lambda x: x if x % 7 != 0 else 1 / 0) # Error on multiples of 7 + + def safe_pipeline(p): + """A pipeline that should work without errors.""" + return p.map(lambda x: x * 2) + + print("Testing concurrent pipeline with errors...") + + # Test with error-prone pipeline + try: + result = pipeline.concurrent(error_prone_pipeline, max_workers=4).to_list() + print("ERROR: Should have raised CompoundError!") + except Exception as e: + print(f"✓ Correctly caught exception: {type(e).__name__}") + print(f" Pipeline context has_errors: {pipeline.context['has_errors']}") + + # Test with safe pipeline + print("\nTesting concurrent pipeline without errors...") + safe_data = list(range(50)) + safe_pipeline_obj = Pipeline(safe_data, chunk_size=5) + + result = safe_pipeline_obj.concurrent(safe_pipeline, max_workers=4).to_list() + expected = [x * 2 for x in safe_data] + + print(f"✓ Results match expected: {result == expected}") + print(f" Pipeline context has_errors: {safe_pipeline_obj.context['has_errors']}") + print(f" Result length: {len(result)}") + + +def test_concurrent_vs_sequential(): + """Test that concurrent and sequential processing produce the same results.""" + + data = list(range(20)) + + def transform_pipeline(p): + return p.map(lambda x: x * 3).filter(lambda x: x % 2 == 0) + + # Sequential + sequential_result = Pipeline(data).apply(transform_pipeline).to_list() + + # Concurrent + concurrent_result = Pipeline(data, chunk_size=5).concurrent(transform_pipeline, max_workers=3).to_list() + + print("\nSequential vs Concurrent comparison:") + print(f"✓ Results match: {sequential_result == concurrent_result}") + print(f" Sequential: {sequential_result}") + print(f" Concurrent: {concurrent_result}") + + +if __name__ == "__main__": + test_concurrent_context_isolation() + test_concurrent_vs_sequential() + print("\n✓ All concurrent context tests passed!") From 038926e21168ee6bc5c3bfcf15cc3574ff0180ed Mon Sep 17 00:00:00 2001 From: ringoldsdev Date: Fri, 11 Jul 2025 22:00:40 +0000 Subject: [PATCH 08/17] chore: refactored tests --- test_concurrent_context.py | 71 --- tests/test_pipeline.py | 978 ----------------------------- tests/test_pipeline_basics.py | 77 +++ tests/test_pipeline_catch.py | 214 +++++++ tests/test_pipeline_composition.py | 100 +++ tests/test_pipeline_concurrency.py | 134 ++++ tests/test_pipeline_context.py | 46 ++ tests/test_pipeline_each.py | 49 ++ tests/test_pipeline_edge_cases.py | 77 +++ tests/test_pipeline_filtering.py | 60 ++ tests/test_pipeline_integration.py | 79 +++ tests/test_pipeline_map_filter.py | 41 ++ tests/test_pipeline_mapping.py | 64 ++ tests/test_pipeline_reduce.py | 52 ++ tests/test_pipeline_tap.py | 55 ++ tests/test_pipeline_utility.py | 134 ++++ 16 files changed, 1182 insertions(+), 1049 deletions(-) delete mode 100644 test_concurrent_context.py delete mode 100644 tests/test_pipeline.py create mode 100644 tests/test_pipeline_basics.py create mode 100644 tests/test_pipeline_catch.py create mode 100644 tests/test_pipeline_composition.py create mode 100644 tests/test_pipeline_concurrency.py create mode 100644 
tests/test_pipeline_context.py create mode 100644 tests/test_pipeline_each.py create mode 100644 tests/test_pipeline_edge_cases.py create mode 100644 tests/test_pipeline_filtering.py create mode 100644 tests/test_pipeline_integration.py create mode 100644 tests/test_pipeline_map_filter.py create mode 100644 tests/test_pipeline_mapping.py create mode 100644 tests/test_pipeline_reduce.py create mode 100644 tests/test_pipeline_tap.py create mode 100644 tests/test_pipeline_utility.py diff --git a/test_concurrent_context.py b/test_concurrent_context.py deleted file mode 100644 index db928e6..0000000 --- a/test_concurrent_context.py +++ /dev/null @@ -1,71 +0,0 @@ -#!/usr/bin/env python3 -""" -Test script to verify that the concurrent method handles context properly -and doesn't have race conditions. -""" - -from efemel.pipeline import Pipeline - - -def test_concurrent_context_isolation(): - """Test that concurrent processing properly isolates and merges contexts.""" - - # Create a pipeline with some data - data = list(range(100)) # Large enough to ensure multiple chunks - pipeline = Pipeline(data, chunk_size=10) # Force multiple chunks - - def error_prone_pipeline(p): - """A pipeline that will cause errors on certain values.""" - return p.map(lambda x: x if x % 7 != 0 else 1 / 0) # Error on multiples of 7 - - def safe_pipeline(p): - """A pipeline that should work without errors.""" - return p.map(lambda x: x * 2) - - print("Testing concurrent pipeline with errors...") - - # Test with error-prone pipeline - try: - result = pipeline.concurrent(error_prone_pipeline, max_workers=4).to_list() - print("ERROR: Should have raised CompoundError!") - except Exception as e: - print(f"✓ Correctly caught exception: {type(e).__name__}") - print(f" Pipeline context has_errors: {pipeline.context['has_errors']}") - - # Test with safe pipeline - print("\nTesting concurrent pipeline without errors...") - safe_data = list(range(50)) - safe_pipeline_obj = Pipeline(safe_data, chunk_size=5) - - result = safe_pipeline_obj.concurrent(safe_pipeline, max_workers=4).to_list() - expected = [x * 2 for x in safe_data] - - print(f"✓ Results match expected: {result == expected}") - print(f" Pipeline context has_errors: {safe_pipeline_obj.context['has_errors']}") - print(f" Result length: {len(result)}") - - -def test_concurrent_vs_sequential(): - """Test that concurrent and sequential processing produce the same results.""" - - data = list(range(20)) - - def transform_pipeline(p): - return p.map(lambda x: x * 3).filter(lambda x: x % 2 == 0) - - # Sequential - sequential_result = Pipeline(data).apply(transform_pipeline).to_list() - - # Concurrent - concurrent_result = Pipeline(data, chunk_size=5).concurrent(transform_pipeline, max_workers=3).to_list() - - print("\nSequential vs Concurrent comparison:") - print(f"✓ Results match: {sequential_result == concurrent_result}") - print(f" Sequential: {sequential_result}") - print(f" Concurrent: {concurrent_result}") - - -if __name__ == "__main__": - test_concurrent_context_isolation() - test_concurrent_vs_sequential() - print("\n✓ All concurrent context tests passed!") diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py deleted file mode 100644 index 6d318a8..0000000 --- a/tests/test_pipeline.py +++ /dev/null @@ -1,978 +0,0 @@ -""" -Test suite for the Pipeline class. - -This module contains comprehensive tests for the Pipeline class functionality -including all methods, edge cases, and error conditions. 
-""" - -import pytest - -from efemel.pipeline import CompoundError -from efemel.pipeline import Pipeline -from efemel.pipeline import PipelineItem - - -class TestPipelineBasics: - """Test basic Pipeline functionality.""" - - def test_pipeline_initialization(self): - """Test Pipeline initialization with various iterables.""" - # Test with list - pipeline = Pipeline([1, 2, 3, 4, 5]) - assert isinstance(pipeline, Pipeline) - # Generator should be a generator object, not the original list - from types import GeneratorType - - assert isinstance(pipeline.generator, GeneratorType) - - # Test with tuple - pipeline = Pipeline((1, 2, 3)) - assert list(pipeline) == [1, 2, 3] - - # Test with generator - def gen(): - yield from range(1, 4) - - pipeline = Pipeline(gen()) - assert list(pipeline) == [1, 2, 3] - - # Test with empty list - pipeline = Pipeline([]) - assert list(pipeline) == [] - - def test_pipeline_iteration(self): - """Test Pipeline iteration behavior.""" - pipeline = Pipeline([1, 2, 3, 4, 5]) - - # Test iteration - result = [] - for item in pipeline: - result.append(item) - - assert result == [1, 2, 3, 4, 5] - - def test_to_list(self): - """Test Pipeline.to_list() method.""" - pipeline = Pipeline([1, 2, 3, 4, 5]) - result = pipeline.to_list() - - assert result == [1, 2, 3, 4, 5] - assert isinstance(result, list) - - # Test with empty pipeline - empty_pipeline = Pipeline([]) - assert empty_pipeline.to_list() == [] - - def test_first(self): - """Test Pipeline.first() method.""" - pipeline = Pipeline([1, 2, 3, 4, 5]) - assert pipeline.first() == 1 - - # Test with string - str_pipeline = Pipeline(["hello", "world"]) - assert str_pipeline.first() == "hello" - - def test_first_empty_pipeline(self): - """Test Pipeline.first() with empty pipeline.""" - empty_pipeline = Pipeline([]) - with pytest.raises(StopIteration): - empty_pipeline.first() - - -class TestPipelineFiltering: - """Test Pipeline filtering functionality.""" - - def test_filter_basic(self): - """Test basic filtering operations.""" - # Filter even numbers - pipeline1 = Pipeline([1, 2, 3, 4, 5]) - even_pipeline = pipeline1.filter(lambda x: x % 2 == 0) - assert even_pipeline.to_list() == [2, 4] - - # Filter numbers greater than 3 - use fresh pipeline - pipeline2 = Pipeline([1, 2, 3, 4, 5]) - gt3_pipeline = pipeline2.filter(lambda x: x > 3) - assert gt3_pipeline.to_list() == [4, 5] - - def test_filter_with_strings(self): - """Test filtering with string data.""" - # Filter strings longer than 4 characters - pipeline1 = Pipeline(["hello", "world", "python", "test"]) - long_strings = pipeline1.filter(lambda s: len(s) > 4) - assert long_strings.to_list() == ["hello", "world", "python"] - - # Filter strings starting with 'p' - use fresh pipeline - pipeline2 = Pipeline(["hello", "world", "python", "test"]) - p_strings = pipeline2.filter(lambda s: s.startswith("p")) - assert p_strings.to_list() == ["python"] - - def test_filter_empty_result(self): - """Test filter that results in empty pipeline.""" - pipeline = Pipeline([1, 2, 3, 4, 5]) - - # Filter that matches nothing - empty_result = pipeline.filter(lambda x: x > 10) - assert empty_result.to_list() == [] - - def test_filter_all_pass(self): - """Test filter where all items pass.""" - pipeline = Pipeline([1, 2, 3, 4, 5]) - - # Filter that passes everything - all_pass = pipeline.filter(lambda x: True) - assert all_pass.to_list() == [1, 2, 3, 4, 5] - - def test_filter_chaining(self): - """Test chaining multiple filters.""" - pipeline = Pipeline([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) - - # Chain 
filters: even numbers and greater than 5 - result = pipeline.filter(lambda x: x % 2 == 0).filter(lambda x: x > 5) - assert result.to_list() == [6, 8, 10] - - -class TestPipelineMapping: - """Test Pipeline mapping functionality.""" - - def test_map_basic(self): - """Test basic mapping operations.""" - # Double each number - pipeline1 = Pipeline([1, 2, 3, 4, 5]) - doubled = pipeline1.map(lambda x: x * 2) - assert doubled.to_list() == [2, 4, 6, 8, 10] - - # Square each number - use fresh pipeline - pipeline2 = Pipeline([1, 2, 3, 4, 5]) - squared = pipeline2.map(lambda x: x**2) - assert squared.to_list() == [1, 4, 9, 16, 25] - - def test_map_type_transformation(self): - """Test mapping with type transformation.""" - # Convert numbers to strings - pipeline1 = Pipeline([1, 2, 3, 4, 5]) - str_pipeline = pipeline1.map(str) - assert str_pipeline.to_list() == ["1", "2", "3", "4", "5"] - - # Convert to boolean (non-zero is True) - use fresh pipeline - pipeline2 = Pipeline([1, 2, 3, 4, 5]) - bool_pipeline = pipeline2.map(bool) - assert bool_pipeline.to_list() == [True, True, True, True, True] - - def test_map_with_strings(self): - """Test mapping with string data.""" - # Convert to uppercase - pipeline1 = Pipeline(["hello", "world", "python"]) - upper_pipeline = pipeline1.map(str.upper) - assert upper_pipeline.to_list() == ["HELLO", "WORLD", "PYTHON"] - - # Get string lengths - use fresh pipeline - pipeline2 = Pipeline(["hello", "world", "python"]) - len_pipeline = pipeline2.map(len) - assert len_pipeline.to_list() == [5, 5, 6] - - def test_map_chaining(self): - """Test chaining multiple map operations.""" - pipeline = Pipeline([1, 2, 3, 4, 5]) - - # Chain maps: double, then add 1 - result = pipeline.map(lambda x: x * 2).map(lambda x: x + 1) - assert result.to_list() == [3, 5, 7, 9, 11] - - def test_map_empty_pipeline(self): - """Test mapping on empty pipeline.""" - empty_pipeline = Pipeline([]) - - # Map should return empty result - result = empty_pipeline.map(lambda x: x * 2) - assert result.to_list() == [] - - -class TestPipelineMapFilter: - """Test Pipeline map and filter combinations.""" - - def test_map_then_filter(self): - """Test mapping then filtering.""" - pipeline = Pipeline([1, 2, 3, 4, 5]) - - # Double numbers, then filter even results - result = pipeline.map(lambda x: x * 2).filter(lambda x: x % 2 == 0) - assert result.to_list() == [2, 4, 6, 8, 10] - - def test_filter_then_map(self): - """Test filtering then mapping.""" - pipeline = Pipeline([1, 2, 3, 4, 5]) - - # Filter even numbers, then double them - result = pipeline.filter(lambda x: x % 2 == 0).map(lambda x: x * 2) - assert result.to_list() == [4, 8] - - def test_complex_map_filter_chain(self): - """Test complex chaining of map and filter operations.""" - pipeline = Pipeline([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) - - # Complex chain: filter odd, multiply by 3, filter > 10 - result = ( - pipeline.filter(lambda x: x % 2 == 1) # [1, 3, 5, 7, 9] - .map(lambda x: x * 3) # [3, 9, 15, 21, 27] - .filter(lambda x: x > 10) - ) # [15, 21, 27] - - assert result.to_list() == [15, 21, 27] - - -class TestPipelineReduce: - """Test Pipeline reduce functionality.""" - - def test_reduce_basic(self): - """Test basic reduce operations.""" - # Sum all numbers - pipeline1 = Pipeline([1, 2, 3, 4, 5]) - sum_result = pipeline1.reduce(lambda acc, x: acc + x, 0) - assert sum_result.first() == 15 - - # Multiply all numbers - use fresh pipeline - pipeline2 = Pipeline([1, 2, 3, 4, 5]) - product_result = pipeline2.reduce(lambda acc, x: acc * x, 1) - assert 
product_result.first() == 120 - - def test_reduce_with_strings(self): - """Test reduce with string data.""" - # Concatenate strings - pipeline1 = Pipeline(["hello", "world", "python"]) - concat_result = pipeline1.reduce(lambda acc, x: acc + " " + x, "") - assert concat_result.first() == " hello world python" - - # Join with commas - use fresh pipeline - pipeline2 = Pipeline(["hello", "world", "python"]) - join_result = pipeline2.reduce(lambda acc, x: acc + "," + x if acc else x, "") - assert join_result.first() == "hello,world,python" - - def test_reduce_empty_pipeline(self): - """Test reduce on empty pipeline.""" - empty_pipeline = Pipeline([]) - - # Reduce should return initial value - result = empty_pipeline.reduce(lambda acc, x: acc + x, 10) - assert result.first() == 10 - - def test_reduce_single_item(self): - """Test reduce with single item.""" - single_pipeline = Pipeline([42]) - - # Should combine initial value with single item - result = single_pipeline.reduce(lambda acc, x: acc + x, 10) - assert result.first() == 52 - - -class TestPipelineTap: - """Test Pipeline tap functionality.""" - - def test_tap_basic(self): - """Test basic tap operations.""" - pipeline = Pipeline([1, 2, 3, 4, 5]) - side_effects = [] - - # Tap to collect side effects - result = pipeline.tap(side_effects.append) - result_list = result.to_list() - - assert result_list == [1, 2, 3, 4, 5] - assert side_effects == [1, 2, 3, 4, 5] - - def test_tap_with_print(self): - """Test tap with print (no assertion needed, just verify it works).""" - pipeline = Pipeline([1, 2, 3]) - - # This should not raise any exceptions - result = pipeline.tap(lambda x: None) # Mock print - assert result.to_list() == [1, 2, 3] - - def test_tap_chaining(self): - """Test tap in a chain of operations.""" - pipeline = Pipeline([1, 2, 3, 4, 5]) - side_effects = [] - - # Tap in middle of chain - result = pipeline.map(lambda x: x * 2).tap(side_effects.append).filter(lambda x: x > 5) - - result_list = result.to_list() - - assert result_list == [6, 8, 10] - assert side_effects == [2, 4, 6, 8, 10] - - def test_tap_doesnt_modify_pipeline(self): - """Test that tap doesn't modify the pipeline data.""" - pipeline = Pipeline([1, 2, 3, 4, 5]) - - # Tap with function that would modify if it could - result = pipeline.tap(lambda x: x * 1000) - - # Data should be unchanged - assert result.to_list() == [1, 2, 3, 4, 5] - - -class TestPipelineEach: - """Test Pipeline each functionality.""" - - def test_each_basic(self): - """Test basic each operations.""" - pipeline = Pipeline([1, 2, 3, 4, 5]) - side_effects = [] - - # Each should execute function for each item - pipeline.each(side_effects.append) - - assert side_effects == [1, 2, 3, 4, 5] - - def test_each_returns_none(self): - """Test that each returns None.""" - pipeline = Pipeline([1, 2, 3, 4, 5]) - - result = pipeline.each(lambda x: None) - assert result is None - - def test_each_with_empty_pipeline(self): - """Test each with empty pipeline.""" - empty_pipeline = Pipeline([]) - side_effects = [] - - # Should not execute function - empty_pipeline.each(side_effects.append) - - assert side_effects == [] - - def test_each_terminal_operation(self): - """Test that each is a terminal operation.""" - pipeline = Pipeline([1, 2, 3, 4, 5]) - side_effects = [] - - # After each, pipeline should be consumed - pipeline.each(side_effects.append) - - assert side_effects == [1, 2, 3, 4, 5] - - -class TestPipelineUtility: - """Test Pipeline utility methods.""" - - def test_passthrough(self): - """Test passthrough method.""" - 
pipeline = Pipeline([1, 2, 3, 4, 5]) - - # Passthrough should return the same pipeline - result = pipeline.passthrough() - assert result is pipeline - - # Data should be unchanged - assert result.to_list() == [1, 2, 3, 4, 5] - - def test_apply_function(self): - """Test apply with single function.""" - pipeline = Pipeline([1, 2, 3, 4, 5]) - - def double_pipeline(p): - return p.map(lambda x: x * 2) - - result = pipeline.apply(double_pipeline) - assert result.to_list() == [2, 4, 6, 8, 10] - - def test_flatten_basic(self): - """Test basic flatten operation.""" - pipeline = Pipeline([[1, 2], [3, 4], [5]]) - - result = pipeline.flatten() - assert result.to_list() == [1, 2, 3, 4, 5] - - def test_flatten_empty_lists(self): - """Test flatten with empty lists.""" - pipeline = Pipeline([[], [1, 2], [], [3]]) - - result = pipeline.flatten() - assert result.to_list() == [1, 2, 3] - - def test_flatten_nested_tuples(self): - """Test flatten with nested tuples.""" - pipeline = Pipeline([(1, 2), (3, 4, 5), (6,)]) - - result = pipeline.flatten() - assert result.to_list() == [1, 2, 3, 4, 5, 6] - - def test_flatten_chunked_processing(self): - """Test that flatten maintains chunked processing and doesn't consume entire pipeline.""" - - # Create a large dataset that would cause OOM if consumed all at once - def generate_nested_data(): - for i in range(1000): - yield [i * 2, i * 2 + 1] - - # Use small chunk size to verify chunked processing - pipeline = Pipeline(generate_nested_data(), chunk_size=10) - result = pipeline.flatten() - - # Verify first few elements without consuming the entire pipeline - first_10 = [] - count = 0 - for item in result: - first_10.append(item) - count += 1 - if count >= 10: - break - - assert first_10 == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] - - def test_flatten_preserves_chunk_structure(self): - """Test that flatten preserves chunked structure.""" - # Create pipeline with known chunk structure - data = [[1, 2], [3, 4], [5, 6], [7, 8]] - pipeline = Pipeline(data, chunk_size=2) # 2 sublists per chunk - - result = pipeline.flatten() - - # Verify the result is correct - assert result.to_list() == [1, 2, 3, 4, 5, 6, 7, 8] - - def test_flatten_maintains_chunk_size(self): - """Test that flatten maintains the original chunk_size setting.""" - # Create test data with varying sizes of sublists - data = [[1, 2], [3, 4, 5], [6], [7, 8, 9, 10]] - chunk_size = 3 - pipeline = Pipeline(data, chunk_size=chunk_size) - - # Flatten the pipeline - flattened = pipeline.flatten() - - # Verify chunk_size is preserved - assert flattened.chunk_size == chunk_size - - # Collect chunks to verify structure - chunks = [] - for chunk in flattened.generator: - chunks.append(chunk) - - # Verify all chunks except the last have the expected size - for i, chunk in enumerate(chunks[:-1]): - assert len(chunk) == chunk_size, f"Chunk {i} has size {len(chunk)}, expected {chunk_size}" - - # Last chunk can be smaller than chunk_size - if chunks: - assert len(chunks[-1]) <= chunk_size, f"Last chunk has size {len(chunks[-1])}, should be <= {chunk_size}" - - # Verify the final result is correct - result = [] - for chunk in chunks: - result.extend(chunk) - - # This is showing the internal representation of the data wheresecond value of the Tuple is the exception - assert result == [ - PipelineItem(1, None, {"has_errors": False, "errors": []}), - PipelineItem(2, None, {"has_errors": False, "errors": []}), - PipelineItem(3, None, {"has_errors": False, "errors": []}), - PipelineItem(4, None, {"has_errors": False, "errors": []}), - 
PipelineItem(5, None, {"has_errors": False, "errors": []}), - PipelineItem(6, None, {"has_errors": False, "errors": []}), - PipelineItem(7, None, {"has_errors": False, "errors": []}), - PipelineItem(8, None, {"has_errors": False, "errors": []}), - PipelineItem(9, None, {"has_errors": False, "errors": []}), - PipelineItem(10, None, {"has_errors": False, "errors": []}), - ] - - -class TestPipelineEdgeCases: - """Test Pipeline edge cases and error conditions.""" - - def test_pipeline_with_none_values(self): - """Test pipeline with None values.""" - # Should handle None values properly - pipeline1 = Pipeline([1, None, 3, None, 5]) - result = pipeline1.to_list() - assert result == [1, None, 3, None, 5] - - # Filter out None values - use fresh pipeline - pipeline2 = Pipeline([1, None, 3, None, 5]) - no_none = pipeline2.filter(lambda x: x is not None) - assert no_none.to_list() == [1, 3, 5] - - def test_pipeline_with_mixed_types(self): - """Test pipeline with mixed data types.""" - # Should handle mixed types - pipeline1 = Pipeline([1, "hello", 3.14, True, None]) - result = pipeline1.to_list() - assert result == [1, "hello", 3.14, True, None] - - # Filter by type (excluding boolean which is a subclass of int) - use fresh pipeline - pipeline2 = Pipeline([1, "hello", 3.14, True, None]) - numbers = pipeline2.filter(lambda x: isinstance(x, int | float) and not isinstance(x, bool) and x is not None) - assert numbers.to_list() == [1, 3.14] - - def test_pipeline_reuse(self): - """Test that pipelines can be reused.""" - original_data = [1, 2, 3, 4, 5] - pipeline = Pipeline(original_data) - - # First use - result1 = pipeline.map(lambda x: x * 2).to_list() - assert result1 == [2, 4, 6, 8, 10] - - # Create new pipeline from same data - pipeline2 = Pipeline(original_data) - result2 = pipeline2.filter(lambda x: x % 2 == 0).to_list() - assert result2 == [2, 4] - - def test_pipeline_method_chaining_returns_new_instances(self): - """Test that pipeline methods return new instances.""" - original = Pipeline([1, 2, 3, 4, 5]) - - # Methods should return new instances (except passthrough) - mapped = original.map(lambda x: x * 2) - filtered = original.filter(lambda x: x % 2 == 0) - - assert mapped is not original - assert filtered is not original - assert mapped is not filtered - - def test_pipeline_with_generator_exhaustion(self): - """Test pipeline behavior with generator exhaustion.""" - - def number_generator(): - yield from range(3) - - pipeline = Pipeline(number_generator()) - - # First consumption - result1 = pipeline.to_list() - assert result1 == [0, 1, 2] - - # Generator should be exhausted now - # This behavior depends on the implementation - - -class TestPipelineIntegration: - """Integration tests for Pipeline class.""" - - def test_data_processing_pipeline(self): - """Test a realistic data processing pipeline.""" - # Simulate processing a list of user data - users = [ - {"name": "Alice", "age": 30, "active": True}, - {"name": "Bob", "age": 25, "active": False}, - {"name": "Charlie", "age": 35, "active": True}, - {"name": "Diana", "age": 28, "active": True}, - {"name": "Eve", "age": 22, "active": False}, - ] - - pipeline = Pipeline(users) - - # Process: filter active users, extract names, convert to uppercase - result = pipeline.filter(lambda user: user["active"]).map(lambda user: user["name"]).map(str.upper) - - assert result.to_list() == ["ALICE", "CHARLIE", "DIANA"] - - def test_number_processing_pipeline(self): - """Test a number processing pipeline.""" - numbers = range(1, 21) # 1 to 20 - - pipeline = 
Pipeline(numbers) - - # Process: filter even numbers, square them, filter > 50, sum - result = ( - pipeline.filter(lambda x: x % 2 == 0) # [2, 4, 6, 8, 10, 12, 14, 16, 18, 20] - .map(lambda x: x**2) # [4, 16, 36, 64, 100, 144, 196, 256, 324, 400] - .filter(lambda x: x > 50) # [64, 100, 144, 196, 256, 324, 400] - .reduce(lambda acc, x: acc + x, 0) - ) # 1484 - - assert result.first() == 1484 - - def test_text_processing_pipeline(self): - """Test a text processing pipeline.""" - text = "Hello world! This is a test. Python is amazing." - words = text.split() - - pipeline = Pipeline(words) - - # Process: filter words > 3 chars, remove punctuation, lowercase, get unique - result = ( - pipeline.filter(lambda word: len(word) > 3) - .map(lambda word: word.strip(".,!")) - .map(str.lower) - .filter(lambda word: word not in ["this"]) - ) # Simple "unique" filter - - expected = ["hello", "world", "test", "python", "amazing"] - assert result.to_list() == expected - - def test_nested_data_processing(self): - """Test processing nested data structures.""" - data = [[1, 2, 3], [4, 5, 6], [7, 8, 9]] - - pipeline = Pipeline(data) - - # Flatten, filter odd numbers, square them - result = ( - pipeline.flatten() # [1, 2, 3, 4, 5, 6, 7, 8, 9] - .filter(lambda x: x % 2 == 1) # [1, 3, 5, 7, 9] - .map(lambda x: x**2) - ) # [1, 9, 25, 49, 81] - - assert result.to_list() == [1, 9, 25, 49, 81] - - -class TestPipelineComposition: - """Test Pipeline composition functionality.""" - - def test_pipeline_from_pipeline_init(self): - """Test creating a Pipeline from another Pipeline using __init__.""" - # Create source pipeline - source_pipeline = Pipeline([1, 2, 3, 4, 5]) - - # Create new pipeline from the source - new_pipeline = Pipeline(source_pipeline) - - # Test that new pipeline works (source gets exhausted, so test new one) - result = new_pipeline.to_list() - assert result == [1, 2, 3, 4, 5] - - def test_from_pipeline_class_method(self): - """Test creating a Pipeline using from_pipeline class method.""" - # Create source pipeline - source_pipeline = Pipeline([1, 2, 3, 4, 5]) - - # Create new pipeline using class method - new_pipeline = Pipeline.from_pipeline(source_pipeline) - - # Test that they produce the same results - assert new_pipeline.to_list() == [1, 2, 3, 4, 5] - - def test_chain_method(self): - """Test chaining multiple pipelines.""" - pipeline1 = Pipeline([1, 2, 3]) - pipeline2 = Pipeline([4, 5, 6]) - pipeline3 = Pipeline([7, 8, 9]) - - # Chain them together - chained = Pipeline.chain(pipeline1, pipeline2, pipeline3) - - # Should contain all elements in sequence - assert chained.to_list() == [1, 2, 3, 4, 5, 6, 7, 8, 9] - - def test_chain_empty_pipelines(self): - """Test chaining with empty pipelines.""" - pipeline1 = Pipeline([1, 2]) - empty_pipeline = Pipeline([]) - pipeline2 = Pipeline([3, 4]) - - chained = Pipeline.chain(pipeline1, empty_pipeline, pipeline2) - assert chained.to_list() == [1, 2, 3, 4] - - def test_chain_single_pipeline(self): - """Test chaining with a single pipeline.""" - pipeline = Pipeline([1, 2, 3]) - chained = Pipeline.chain(pipeline) - - assert chained.to_list() == [1, 2, 3] - - def test_pipeline_composition_with_operations(self): - """Test that pipeline composition works with transformations.""" - # Create source pipeline with transformations - source = Pipeline([1, 2, 3, 4, 5]).map(lambda x: x * 2) - - # Create new pipeline from transformed source - new_pipeline = Pipeline.from_pipeline(source) - filtered = new_pipeline.filter(lambda x: x > 5) - - assert filtered.to_list() == [6, 
8, 10] - - def test_chain_with_different_types(self): - """Test chaining pipelines with different but compatible types.""" - numbers = Pipeline([1, 2, 3]) - strings = Pipeline(["4", "5", "6"]) - - # Chain and then transform to make types consistent - chained = Pipeline.chain(numbers, strings) - all_strings = chained.map(str) - - assert all_strings.to_list() == ["1", "2", "3", "4", "5", "6"] - - def test_complex_pipeline_composition(self): - """Test complex pipeline composition scenario.""" - # Create multiple source pipelines - evens = Pipeline([2, 4, 6, 8]) - odds = Pipeline([1, 3, 5, 7]) - - # Chain them - all_numbers = Pipeline.chain(evens, odds) - - # Apply transformations - result = all_numbers.filter(lambda x: x > 3).map(lambda x: x**2).reduce(lambda acc, x: acc + x, 0) - - # Expected: [4, 6, 8, 5, 7] -> [16, 36, 64, 25, 49] -> 190 - assert result.first() == 190 - - -class TestPipelineConcurrency: - """Test Pipeline concurrency functionality.""" - - def test_concurrent(self): - # Create a simple pipeline - - def _double_pipeline(p: Pipeline[int]): - return p.map(lambda x: x * 2) - - result = Pipeline(range(0, 5), 1).concurrent(_double_pipeline, 5).to_list() - - # Check if all numbers are doubled - assert result == [0, 2, 4, 6, 8] - - def test_concurrent_consumer(self): - # Create a simple pipeline - - items = [] - - def dummy_consumer(p: Pipeline[int]): - return p.tap(lambda x: items.append(x)) - - Pipeline(range(0, 5), 1).concurrent(dummy_consumer, 5).noop() - - # Check if all numbers are consumed - assert items == [0, 1, 2, 3, 4] - - -class TestPipelineCatch: - """Test Pipeline catch functionality and CompoundError handling.""" - - def test_catch_basic_error_handling(self): - """Test basic catch functionality with error callback.""" - captured_errors = [] - - def error_prone_pipeline(p): - return p.map(lambda x: 10 / x if x != 0 else 1 / 0) - - def error_handler(error): - captured_errors.append(error) - - pipeline = Pipeline([2, 0, 4, 0, 6]) - - with pytest.raises(CompoundError) as exc_info: - pipeline.catch(error_prone_pipeline, error_handler).to_list() - - # Should have captured 2 ZeroDivisionErrors - assert len(captured_errors) == 2 - assert all(isinstance(e, ZeroDivisionError) for e in captured_errors) - - # CompoundError should also contain the same errors - assert len(exc_info.value.errors) == 2 - assert all(isinstance(e, ZeroDivisionError) for e in exc_info.value.errors) - - def test_catch_with_successful_items(self): - """Test catch with mix of successful and failed items.""" - error_logs = [] - - def selective_error_pipeline(p): - return p.map(lambda x: x * 2 if x != 3 else 1 / 0) - - def log_error(error): - error_logs.append(f"Caught: {type(error).__name__}") - - pipeline = Pipeline([1, 2, 3, 4, 5]) - - with pytest.raises(CompoundError) as exc_info: - pipeline.catch(selective_error_pipeline, log_error).to_list() - - # Should have logged one error - assert len(error_logs) == 1 - assert "ZeroDivisionError" in error_logs[0] - - # CompoundError should contain one error - assert len(exc_info.value.errors) == 1 - - def test_catch_with_reduce_operation(self): - """Test catch with reduce operation that has errors.""" - errors_seen = [] - - def error_prone_reduce_pipeline(p: Pipeline[int]): - # First map some values to cause errors, then reduce - return p.map(lambda x: x if x != 2 else 1 / 0).reduce(lambda acc, x: acc + x, 0) - - def collect_errors(error): - errors_seen.append(error) - - pipeline = Pipeline([1, 2, 3, 4, 5]) - - with pytest.raises(CompoundError) as exc_info: - 
result = pipeline.catch(error_prone_reduce_pipeline, collect_errors).first() - - # Should have seen the error during catch - assert len(errors_seen) == 1 - assert isinstance(errors_seen[0], ZeroDivisionError) - - # CompoundError should also have the error - assert len(exc_info.value.errors) == 1 - - def test_catch_multiple_operation_errors(self): - """Test catch with multiple operations that can error.""" - all_errors = [] - - def multi_error_pipeline(p: Pipeline[int]): - return ( - p.map(lambda x: x / 2 if x != 4 else 1 / 0) # Error on 4 - .filter( - lambda x: x > 0 if x != 1 else 1 / 0 - ) # Error on 1 (after division = 0.5, but let's say error on result 1) - .map(lambda x: x * 3 if x != 3 else 1 / 0) - ) # Error on 3 (after division = 1.5, won't hit) - - def track_all_errors(error): - all_errors.append(type(error).__name__) - - pipeline = Pipeline([2, 4, 6, 8]) - - with pytest.raises(CompoundError) as exc_info: - pipeline.catch(multi_error_pipeline, track_all_errors).to_list() - - # Should have tracked at least one error - assert len(all_errors) >= 1 - assert len(exc_info.value.errors) >= 1 - - def test_catch_with_no_errors(self): - """Test catch when no errors occur.""" - error_handler_called = [] - - def safe_pipeline(p): - return p.map(lambda x: x * 2).filter(lambda x: x > 5) - - def should_not_be_called(error): - error_handler_called.append(error) - - pipeline = Pipeline([1, 2, 3, 4, 5]) - result = pipeline.catch(safe_pipeline, should_not_be_called).to_list() - - # No errors, so handler should not be called - assert len(error_handler_called) == 0 - # Should get successful results - assert result == [6, 8, 10] - - def test_compound_error_formatting(self): - """Test CompoundError message formatting.""" - - def always_error(x): - if x == 1: - raise ValueError("First error") - elif x == 2: - raise TypeError("Second error") - else: - raise RuntimeError("Third error") - - pipeline = Pipeline([1, 2, 3]) - - with pytest.raises(CompoundError) as exc_info: - pipeline.map(always_error).to_list() - - error_message = str(exc_info.value) - - # Check message contains error count - assert "3 error(s) occurred in the pipeline" in error_message - - # Check individual error messages are included - assert "ValueError: First error" in error_message - assert "TypeError: Second error" in error_message - assert "RuntimeError: Third error" in error_message - - def test_compound_error_with_single_error(self): - """Test CompoundError with just one error.""" - - def single_error(x): - if x == 3: - raise ValueError("Only error") - return x * 2 - - pipeline = Pipeline([1, 2, 3, 4]) - - with pytest.raises(CompoundError) as exc_info: - pipeline.map(single_error).to_list() - - # Should have exactly one error - assert len(exc_info.value.errors) == 1 - assert isinstance(exc_info.value.errors[0], ValueError) - - # Message should reflect single error - error_message = str(exc_info.value) - assert "1 error(s) occurred in the pipeline" in error_message - assert "ValueError: Only error" in error_message - - def test_catch_with_chained_operations(self): - """Test catch with complex chained operations.""" - logged_errors = [] - - def complex_pipeline(p): - return ( - p.filter(lambda x: x > 0 if x != 2 else 1 / 0) # Error on 2 - .map(lambda x: x * 2 if x != 4 else 1 / 0) # Error on 4 - .reduce(lambda acc, x: acc + x, 0) - ) - - def error_logger(error): - logged_errors.append(f"Error: {error}") - - pipeline = Pipeline([1, 2, 3, 4, 5]) - - with pytest.raises(CompoundError) as exc_info: - pipeline.catch(complex_pipeline, 
error_logger).first() - - # Should have logged errors from the operations - assert len(logged_errors) >= 1 - # CompoundError should also contain the errors - assert len(exc_info.value.errors) >= 1 - - def test_catch_error_handler_exception(self): - """Test catch when error handler itself raises exception.""" - - def error_pipeline(p): - return p.map(lambda x: 1 / 0 if x == 2 else x) - - def failing_handler(error): - raise RuntimeError("Handler failed!") - - pipeline = Pipeline([1, 2, 3]) - - # Should still work despite handler failure - with pytest.raises(CompoundError): - # The failing handler should not stop the pipeline - pipeline.catch(error_pipeline, failing_handler).to_list() - - -class TestPipelineContext: - """Test Pipeline global context functionality.""" - - def test_map_sets_context_value_propagated_to_all_items(self): - """Test that when map function sets a value in context once, it gets passed to all items.""" - pipeline = Pipeline([1, 2, 3, 4, 5]) - - context_access_count = 0 - - def set_context_once(value, context): - nonlocal context_access_count - context_access_count += 1 - - # Set a value in context only on the first call - if context_access_count == 1: - context["shared_value"] = "set_by_first_item" - - # All items should be able to access the shared value - shared = context.get("shared_value", "not_found") - return f"{value}_{shared}" - - result = pipeline.map(set_context_once).to_list() - - # Verify that all items got access to the shared value - expected = [ - "1_set_by_first_item", - "2_set_by_first_item", - "3_set_by_first_item", - "4_set_by_first_item", - "5_set_by_first_item", - ] - - assert result == expected - - # Verify the function was called for each item - assert context_access_count == 5 diff --git a/tests/test_pipeline_basics.py b/tests/test_pipeline_basics.py new file mode 100644 index 0000000..d89101a --- /dev/null +++ b/tests/test_pipeline_basics.py @@ -0,0 +1,77 @@ +""" +Test basic Pipeline functionality. + +This module contains tests for basic Pipeline class functionality +including initialization, iteration, and core methods. 
+""" + +import pytest + +from efemel.pipeline import Pipeline + + +class TestPipelineBasics: + """Test basic Pipeline functionality.""" + + def test_pipeline_initialization(self): + """Test Pipeline initialization with various iterables.""" + # Test with list + pipeline = Pipeline([1, 2, 3, 4, 5]) + assert isinstance(pipeline, Pipeline) + # Generator should be a generator object, not the original list + from types import GeneratorType + + assert isinstance(pipeline.generator, GeneratorType) + + # Test with tuple + pipeline = Pipeline((1, 2, 3)) + assert list(pipeline) == [1, 2, 3] + + # Test with generator + def gen(): + yield from range(1, 4) + + pipeline = Pipeline(gen()) + assert list(pipeline) == [1, 2, 3] + + # Test with empty list + pipeline = Pipeline([]) + assert list(pipeline) == [] + + def test_pipeline_iteration(self): + """Test Pipeline iteration behavior.""" + pipeline = Pipeline([1, 2, 3, 4, 5]) + + # Test iteration + result = [] + for item in pipeline: + result.append(item) + + assert result == [1, 2, 3, 4, 5] + + def test_to_list(self): + """Test Pipeline.to_list() method.""" + pipeline = Pipeline([1, 2, 3, 4, 5]) + result = pipeline.to_list() + + assert result == [1, 2, 3, 4, 5] + assert isinstance(result, list) + + # Test with empty pipeline + empty_pipeline = Pipeline([]) + assert empty_pipeline.to_list() == [] + + def test_first(self): + """Test Pipeline.first() method.""" + pipeline = Pipeline([1, 2, 3, 4, 5]) + assert pipeline.first() == 1 + + # Test with string + str_pipeline = Pipeline(["hello", "world"]) + assert str_pipeline.first() == "hello" + + def test_first_empty_pipeline(self): + """Test Pipeline.first() with empty pipeline.""" + empty_pipeline = Pipeline([]) + with pytest.raises(StopIteration): + empty_pipeline.first() diff --git a/tests/test_pipeline_catch.py b/tests/test_pipeline_catch.py new file mode 100644 index 0000000..d805789 --- /dev/null +++ b/tests/test_pipeline_catch.py @@ -0,0 +1,214 @@ +""" +Test Pipeline catch functionality and CompoundError handling. + +This module contains tests for Pipeline error handling including +catch operations, CompoundError formatting, and error propagation. 
+""" + +import pytest + +from efemel.pipeline import CompoundError +from efemel.pipeline import Pipeline + + +class TestPipelineCatch: + """Test Pipeline catch functionality and CompoundError handling.""" + + def test_catch_basic_error_handling(self): + """Test basic catch functionality with error callback.""" + captured_errors = [] + + def error_prone_pipeline(p): + return p.map(lambda x: 10 / x if x != 0 else 1 / 0) + + def error_handler(error): + captured_errors.append(error) + + pipeline = Pipeline([2, 0, 4, 0, 6]) + + with pytest.raises(CompoundError) as exc_info: + pipeline.catch(error_prone_pipeline, error_handler).to_list() + + # Should have captured 2 ZeroDivisionErrors + assert len(captured_errors) == 2 + assert all(isinstance(e, ZeroDivisionError) for e in captured_errors) + + # CompoundError should also contain the same errors + assert len(exc_info.value.errors) == 2 + assert all(isinstance(e, ZeroDivisionError) for e in exc_info.value.errors) + + def test_catch_with_successful_items(self): + """Test catch with mix of successful and failed items.""" + error_logs = [] + + def selective_error_pipeline(p): + return p.map(lambda x: x * 2 if x != 3 else 1 / 0) + + def log_error(error): + error_logs.append(f"Caught: {type(error).__name__}") + + pipeline = Pipeline([1, 2, 3, 4, 5]) + + with pytest.raises(CompoundError) as exc_info: + pipeline.catch(selective_error_pipeline, log_error).to_list() + + # Should have logged one error + assert len(error_logs) == 1 + assert "ZeroDivisionError" in error_logs[0] + + # CompoundError should contain one error + assert len(exc_info.value.errors) == 1 + + def test_catch_with_reduce_operation(self): + """Test catch with reduce operation that has errors.""" + errors_seen = [] + + def error_prone_reduce_pipeline(p: Pipeline[int]): + # First map some values to cause errors, then reduce + return p.map(lambda x: x if x != 2 else 1 / 0).reduce(lambda acc, x: acc + x, 0) + + def collect_errors(error): + errors_seen.append(error) + + pipeline = Pipeline([1, 2, 3, 4, 5]) + + with pytest.raises(CompoundError) as exc_info: + pipeline.catch(error_prone_reduce_pipeline, collect_errors).first() + + # Should have seen the error during catch + assert len(errors_seen) == 1 + assert isinstance(errors_seen[0], ZeroDivisionError) + + # CompoundError should also have the error + assert len(exc_info.value.errors) == 1 + + def test_catch_multiple_operation_errors(self): + """Test catch with multiple operations that can error.""" + all_errors = [] + + def multi_error_pipeline(p: Pipeline[int]): + return ( + p.map(lambda x: x / 2 if x != 4 else 1 / 0) # Error on 4 + .filter( + lambda x: x > 0 if x != 1 else 1 / 0 + ) # Error on 1 (after division = 0.5, but let's say error on result 1) + .map(lambda x: x * 3 if x != 3 else 1 / 0) + ) # Error on 3 (after division = 1.5, won't hit) + + def track_all_errors(error): + all_errors.append(type(error).__name__) + + pipeline = Pipeline([2, 4, 6, 8]) + + with pytest.raises(CompoundError) as exc_info: + pipeline.catch(multi_error_pipeline, track_all_errors).to_list() + + # Should have tracked at least one error + assert len(all_errors) >= 1 + assert len(exc_info.value.errors) >= 1 + + def test_catch_with_no_errors(self): + """Test catch when no errors occur.""" + error_handler_called = [] + + def safe_pipeline(p): + return p.map(lambda x: x * 2).filter(lambda x: x > 5) + + def should_not_be_called(error): + error_handler_called.append(error) + + pipeline = Pipeline([1, 2, 3, 4, 5]) + result = pipeline.catch(safe_pipeline, 
should_not_be_called).to_list() + + # No errors, so handler should not be called + assert len(error_handler_called) == 0 + # Should get successful results + assert result == [6, 8, 10] + + def test_compound_error_formatting(self): + """Test CompoundError message formatting.""" + + def always_error(x): + if x == 1: + raise ValueError("First error") + elif x == 2: + raise TypeError("Second error") + else: + raise RuntimeError("Third error") + + pipeline = Pipeline([1, 2, 3]) + + with pytest.raises(CompoundError) as exc_info: + pipeline.map(always_error).to_list() + + error_message = str(exc_info.value) + + # Check message contains error count + assert "3 error(s) occurred in the pipeline" in error_message + + # Check individual error messages are included + assert "ValueError: First error" in error_message + assert "TypeError: Second error" in error_message + assert "RuntimeError: Third error" in error_message + + def test_compound_error_with_single_error(self): + """Test CompoundError with just one error.""" + + def single_error(x): + if x == 3: + raise ValueError("Only error") + return x * 2 + + pipeline = Pipeline([1, 2, 3, 4]) + + with pytest.raises(CompoundError) as exc_info: + pipeline.map(single_error).to_list() + + # Should have exactly one error + assert len(exc_info.value.errors) == 1 + assert isinstance(exc_info.value.errors[0], ValueError) + + # Message should reflect single error + error_message = str(exc_info.value) + assert "1 error(s) occurred in the pipeline" in error_message + assert "ValueError: Only error" in error_message + + def test_catch_with_chained_operations(self): + """Test catch with complex chained operations.""" + logged_errors = [] + + def complex_pipeline(p): + return ( + p.filter(lambda x: x > 0 if x != 2 else 1 / 0) # Error on 2 + .map(lambda x: x * 2 if x != 4 else 1 / 0) # Error on 4 + .reduce(lambda acc, x: acc + x, 0) + ) + + def error_logger(error): + logged_errors.append(f"Error: {error}") + + pipeline = Pipeline([1, 2, 3, 4, 5]) + + with pytest.raises(CompoundError) as exc_info: + pipeline.catch(complex_pipeline, error_logger).first() + + # Should have logged errors from the operations + assert len(logged_errors) >= 1 + # CompoundError should also contain the errors + assert len(exc_info.value.errors) >= 1 + + def test_catch_error_handler_exception(self): + """Test catch when error handler itself raises exception.""" + + def error_pipeline(p): + return p.map(lambda x: 1 / 0 if x == 2 else x) + + def failing_handler(error): + raise RuntimeError("Handler failed!") + + pipeline = Pipeline([1, 2, 3]) + + # Should still work despite handler failure + with pytest.raises(CompoundError): + # The failing handler should not stop the pipeline + pipeline.catch(error_pipeline, failing_handler).to_list() diff --git a/tests/test_pipeline_composition.py b/tests/test_pipeline_composition.py new file mode 100644 index 0000000..0ad0a44 --- /dev/null +++ b/tests/test_pipeline_composition.py @@ -0,0 +1,100 @@ +""" +Test Pipeline composition functionality. + +This module contains tests for Pipeline composition including +pipeline chaining, from_pipeline method, and complex compositions. 
+""" + +from efemel.pipeline import Pipeline + + +class TestPipelineComposition: + """Test Pipeline composition functionality.""" + + def test_pipeline_from_pipeline_init(self): + """Test creating a Pipeline from another Pipeline using __init__.""" + # Create source pipeline + source_pipeline = Pipeline([1, 2, 3, 4, 5]) + + # Create new pipeline from the source + new_pipeline = Pipeline(source_pipeline) + + # Test that new pipeline works (source gets exhausted, so test new one) + result = new_pipeline.to_list() + assert result == [1, 2, 3, 4, 5] + + def test_from_pipeline_class_method(self): + """Test creating a Pipeline using from_pipeline class method.""" + # Create source pipeline + source_pipeline = Pipeline([1, 2, 3, 4, 5]) + + # Create new pipeline using class method + new_pipeline = Pipeline.from_pipeline(source_pipeline) + + # Test that they produce the same results + assert new_pipeline.to_list() == [1, 2, 3, 4, 5] + + def test_chain_method(self): + """Test chaining multiple pipelines.""" + pipeline1 = Pipeline([1, 2, 3]) + pipeline2 = Pipeline([4, 5, 6]) + pipeline3 = Pipeline([7, 8, 9]) + + # Chain them together + chained = Pipeline.chain(pipeline1, pipeline2, pipeline3) + + # Should contain all elements in sequence + assert chained.to_list() == [1, 2, 3, 4, 5, 6, 7, 8, 9] + + def test_chain_empty_pipelines(self): + """Test chaining with empty pipelines.""" + pipeline1 = Pipeline([1, 2]) + empty_pipeline = Pipeline([]) + pipeline2 = Pipeline([3, 4]) + + chained = Pipeline.chain(pipeline1, empty_pipeline, pipeline2) + assert chained.to_list() == [1, 2, 3, 4] + + def test_chain_single_pipeline(self): + """Test chaining with a single pipeline.""" + pipeline = Pipeline([1, 2, 3]) + chained = Pipeline.chain(pipeline) + + assert chained.to_list() == [1, 2, 3] + + def test_pipeline_composition_with_operations(self): + """Test that pipeline composition works with transformations.""" + # Create source pipeline with transformations + source = Pipeline([1, 2, 3, 4, 5]).map(lambda x: x * 2) + + # Create new pipeline from transformed source + new_pipeline = Pipeline.from_pipeline(source) + filtered = new_pipeline.filter(lambda x: x > 5) + + assert filtered.to_list() == [6, 8, 10] + + def test_chain_with_different_types(self): + """Test chaining pipelines with different but compatible types.""" + numbers = Pipeline([1, 2, 3]) + strings = Pipeline(["4", "5", "6"]) + + # Chain and then transform to make types consistent + chained = Pipeline.chain(numbers, strings) + all_strings = chained.map(str) + + assert all_strings.to_list() == ["1", "2", "3", "4", "5", "6"] + + def test_complex_pipeline_composition(self): + """Test complex pipeline composition scenario.""" + # Create multiple source pipelines + evens = Pipeline([2, 4, 6, 8]) + odds = Pipeline([1, 3, 5, 7]) + + # Chain them + all_numbers = Pipeline.chain(evens, odds) + + # Apply transformations + result = all_numbers.filter(lambda x: x > 3).map(lambda x: x**2).reduce(lambda acc, x: acc + x, 0) + + # Expected: [4, 6, 8, 5, 7] -> [16, 36, 64, 25, 49] -> 190 + assert result.first() == 190 diff --git a/tests/test_pipeline_concurrency.py b/tests/test_pipeline_concurrency.py new file mode 100644 index 0000000..39002d6 --- /dev/null +++ b/tests/test_pipeline_concurrency.py @@ -0,0 +1,134 @@ +""" +Test Pipeline concurrency functionality. + +This module contains tests for Pipeline concurrent operations +including basic concurrency and context isolation. 
+""" + +import pytest + +from efemel.pipeline import CompoundError +from efemel.pipeline import Pipeline + + +class TestPipelineConcurrency: + """Test Pipeline concurrency functionality.""" + + def test_concurrent(self): + # Create a simple pipeline + + def _double_pipeline(p: Pipeline[int]): + return p.map(lambda x: x * 2) + + result = Pipeline(range(0, 5), 1).concurrent(_double_pipeline, 5).to_list() + + # Check if all numbers are doubled + assert result == [0, 2, 4, 6, 8] + + def test_concurrent_consumer(self): + # Create a simple pipeline + + items = [] + + def dummy_consumer(p: Pipeline[int]): + return p.tap(lambda x: items.append(x)) + + Pipeline(range(0, 5), 1).concurrent(dummy_consumer, 5).noop() + + # Check if all numbers are consumed + assert items == [0, 1, 2, 3, 4] + + def test_concurrent_context_isolation(self): + """Test that concurrent processing properly isolates and merges contexts.""" + + # Create a pipeline with some data + data = list(range(100)) # Large enough to ensure multiple chunks + pipeline = Pipeline(data, chunk_size=10) # Force multiple chunks + + def error_prone_pipeline(p): + """A pipeline that will cause errors on certain values.""" + return p.map(lambda x: x if x % 7 != 0 else 1 / 0) # Error on multiples of 7 + + def safe_pipeline(p): + """A pipeline that should work without errors.""" + return p.map(lambda x: x * 2) + + # Test with error-prone pipeline + with pytest.raises(CompoundError): + pipeline.concurrent(error_prone_pipeline, max_workers=4).to_list() + + # Check that the context properly recorded errors + assert pipeline.context["has_errors"] + + # Test with safe pipeline + safe_data = list(range(50)) + safe_pipeline_obj = Pipeline(safe_data, chunk_size=5) + + result = safe_pipeline_obj.concurrent(safe_pipeline, max_workers=4).to_list() + expected = [x * 2 for x in safe_data] + + assert result == expected + assert not safe_pipeline_obj.context["has_errors"] + + def test_concurrent_vs_sequential(self): + """Test that concurrent and sequential processing produce the same results.""" + data = list(range(20)) + + def transform_pipeline(p): + return p.map(lambda x: x * 3).filter(lambda x: x % 2 == 0) + + # Sequential + sequential_result = Pipeline(data).apply(transform_pipeline).to_list() + + # Concurrent + concurrent_result = Pipeline(data, chunk_size=5).concurrent(transform_pipeline, max_workers=3).to_list() + + assert sequential_result == concurrent_result + + def test_concurrent_with_errors_maintains_order(self): + """Test that concurrent processing maintains order even with errors.""" + data = list(range(20)) + + def error_prone_pipeline(p): + """Creates errors on multiples of 5.""" + return p.map(lambda x: x if x % 5 != 0 else 1 / 0) + + # Test ordered processing + pipeline = Pipeline(data, chunk_size=3) + + with pytest.raises(CompoundError) as exc_info: + pipeline.concurrent(error_prone_pipeline, max_workers=2, ordered=True).to_list() + + # Should have errors for multiples of 5: 0, 5, 10, 15 + assert len(exc_info.value.errors) == 4 + + def test_concurrent_unordered(self): + """Test unordered concurrent processing.""" + data = list(range(20)) + + def simple_transform(p): + return p.map(lambda x: x * 2) + + # Unordered processing + pipeline = Pipeline(data, chunk_size=4) + result = pipeline.concurrent(simple_transform, max_workers=3, ordered=False).to_list() + + # Results should be correct but may be out of order + expected = [x * 2 for x in data] + assert sorted(result) == sorted(expected) + + def test_concurrent_context_thread_safety(self): + 
"""Test that context updates are thread-safe during concurrent processing.""" + data = list(range(100)) + + def context_aware_pipeline(p): + """Pipeline that interacts with context.""" + return p.map(lambda x, ctx: x * 2 if not ctx["has_errors"] else x) + + pipeline = Pipeline(data, chunk_size=10) + result = pipeline.concurrent(context_aware_pipeline, max_workers=4).to_list() + + # Should get consistent results despite concurrent access to context + expected = [x * 2 for x in data] + assert result == expected + assert not pipeline.context["has_errors"] diff --git a/tests/test_pipeline_context.py b/tests/test_pipeline_context.py new file mode 100644 index 0000000..89ba9e4 --- /dev/null +++ b/tests/test_pipeline_context.py @@ -0,0 +1,46 @@ +""" +Test Pipeline global context functionality. + +This module contains tests for Pipeline global context operations +including context propagation and sharing between operations. +""" + +from efemel.pipeline import Pipeline + + +class TestPipelineContext: + """Test Pipeline global context functionality.""" + + def test_map_sets_context_value_propagated_to_all_items(self): + """Test that when map function sets a value in context once, it gets passed to all items.""" + pipeline = Pipeline([1, 2, 3, 4, 5]) + + context_access_count = 0 + + def set_context_once(value, context): + nonlocal context_access_count + context_access_count += 1 + + # Set a value in context only on the first call + if context_access_count == 1: + context["shared_value"] = "set_by_first_item" + + # All items should be able to access the shared value + shared = context.get("shared_value", "not_found") + return f"{value}_{shared}" + + result = pipeline.map(set_context_once).to_list() + + # Verify that all items got access to the shared value + expected = [ + "1_set_by_first_item", + "2_set_by_first_item", + "3_set_by_first_item", + "4_set_by_first_item", + "5_set_by_first_item", + ] + + assert result == expected + + # Verify the function was called for each item + assert context_access_count == 5 diff --git a/tests/test_pipeline_each.py b/tests/test_pipeline_each.py new file mode 100644 index 0000000..ac473d6 --- /dev/null +++ b/tests/test_pipeline_each.py @@ -0,0 +1,49 @@ +""" +Test Pipeline each functionality. + +This module contains tests for Pipeline each operations +including side effects and terminal operation behavior. 
+""" + +from efemel.pipeline import Pipeline + + +class TestPipelineEach: + """Test Pipeline each functionality.""" + + def test_each_basic(self): + """Test basic each operations.""" + pipeline = Pipeline([1, 2, 3, 4, 5]) + side_effects = [] + + # Each should execute function for each item + pipeline.each(side_effects.append) + + assert side_effects == [1, 2, 3, 4, 5] + + def test_each_returns_none(self): + """Test that each returns None.""" + pipeline = Pipeline([1, 2, 3, 4, 5]) + + result = pipeline.each(lambda x: None) + assert result is None + + def test_each_with_empty_pipeline(self): + """Test each with empty pipeline.""" + empty_pipeline = Pipeline([]) + side_effects = [] + + # Should not execute function + empty_pipeline.each(side_effects.append) + + assert side_effects == [] + + def test_each_terminal_operation(self): + """Test that each is a terminal operation.""" + pipeline = Pipeline([1, 2, 3, 4, 5]) + side_effects = [] + + # After each, pipeline should be consumed + pipeline.each(side_effects.append) + + assert side_effects == [1, 2, 3, 4, 5] diff --git a/tests/test_pipeline_edge_cases.py b/tests/test_pipeline_edge_cases.py new file mode 100644 index 0000000..d4530aa --- /dev/null +++ b/tests/test_pipeline_edge_cases.py @@ -0,0 +1,77 @@ +""" +Test Pipeline edge cases and error conditions. + +This module contains tests for Pipeline edge cases including +None values, mixed types, reuse, and generator behavior. +""" + +from efemel.pipeline import Pipeline + + +class TestPipelineEdgeCases: + """Test Pipeline edge cases and error conditions.""" + + def test_pipeline_with_none_values(self): + """Test pipeline with None values.""" + # Should handle None values properly + pipeline1 = Pipeline([1, None, 3, None, 5]) + result = pipeline1.to_list() + assert result == [1, None, 3, None, 5] + + # Filter out None values - use fresh pipeline + pipeline2 = Pipeline([1, None, 3, None, 5]) + no_none = pipeline2.filter(lambda x: x is not None) + assert no_none.to_list() == [1, 3, 5] + + def test_pipeline_with_mixed_types(self): + """Test pipeline with mixed data types.""" + # Should handle mixed types + pipeline1 = Pipeline([1, "hello", 3.14, True, None]) + result = pipeline1.to_list() + assert result == [1, "hello", 3.14, True, None] + + # Filter by type (excluding boolean which is a subclass of int) - use fresh pipeline + pipeline2 = Pipeline([1, "hello", 3.14, True, None]) + numbers = pipeline2.filter(lambda x: isinstance(x, int | float) and not isinstance(x, bool) and x is not None) + assert numbers.to_list() == [1, 3.14] + + def test_pipeline_reuse(self): + """Test that pipelines can be reused.""" + original_data = [1, 2, 3, 4, 5] + pipeline = Pipeline(original_data) + + # First use + result1 = pipeline.map(lambda x: x * 2).to_list() + assert result1 == [2, 4, 6, 8, 10] + + # Create new pipeline from same data + pipeline2 = Pipeline(original_data) + result2 = pipeline2.filter(lambda x: x % 2 == 0).to_list() + assert result2 == [2, 4] + + def test_pipeline_method_chaining_returns_new_instances(self): + """Test that pipeline methods return new instances.""" + original = Pipeline([1, 2, 3, 4, 5]) + + # Methods should return new instances (except passthrough) + mapped = original.map(lambda x: x * 2) + filtered = original.filter(lambda x: x % 2 == 0) + + assert mapped is not original + assert filtered is not original + assert mapped is not filtered + + def test_pipeline_with_generator_exhaustion(self): + """Test pipeline behavior with generator exhaustion.""" + + def number_generator(): + yield 
from range(3) + + pipeline = Pipeline(number_generator()) + + # First consumption + result1 = pipeline.to_list() + assert result1 == [0, 1, 2] + + # Generator should be exhausted now + # This behavior depends on the implementation diff --git a/tests/test_pipeline_filtering.py b/tests/test_pipeline_filtering.py new file mode 100644 index 0000000..f87b985 --- /dev/null +++ b/tests/test_pipeline_filtering.py @@ -0,0 +1,60 @@ +""" +Test Pipeline filtering functionality. + +This module contains tests for Pipeline filtering operations +including basic filtering, chaining, and edge cases. +""" + +from efemel.pipeline import Pipeline + + +class TestPipelineFiltering: + """Test Pipeline filtering functionality.""" + + def test_filter_basic(self): + """Test basic filtering operations.""" + # Filter even numbers + pipeline1 = Pipeline([1, 2, 3, 4, 5]) + even_pipeline = pipeline1.filter(lambda x: x % 2 == 0) + assert even_pipeline.to_list() == [2, 4] + + # Filter numbers greater than 3 - use fresh pipeline + pipeline2 = Pipeline([1, 2, 3, 4, 5]) + gt3_pipeline = pipeline2.filter(lambda x: x > 3) + assert gt3_pipeline.to_list() == [4, 5] + + def test_filter_with_strings(self): + """Test filtering with string data.""" + # Filter strings longer than 4 characters + pipeline1 = Pipeline(["hello", "world", "python", "test"]) + long_strings = pipeline1.filter(lambda s: len(s) > 4) + assert long_strings.to_list() == ["hello", "world", "python"] + + # Filter strings starting with 'p' - use fresh pipeline + pipeline2 = Pipeline(["hello", "world", "python", "test"]) + p_strings = pipeline2.filter(lambda s: s.startswith("p")) + assert p_strings.to_list() == ["python"] + + def test_filter_empty_result(self): + """Test filter that results in empty pipeline.""" + pipeline = Pipeline([1, 2, 3, 4, 5]) + + # Filter that matches nothing + empty_result = pipeline.filter(lambda x: x > 10) + assert empty_result.to_list() == [] + + def test_filter_all_pass(self): + """Test filter where all items pass.""" + pipeline = Pipeline([1, 2, 3, 4, 5]) + + # Filter that passes everything + all_pass = pipeline.filter(lambda x: True) + assert all_pass.to_list() == [1, 2, 3, 4, 5] + + def test_filter_chaining(self): + """Test chaining multiple filters.""" + pipeline = Pipeline([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) + + # Chain filters: even numbers and greater than 5 + result = pipeline.filter(lambda x: x % 2 == 0).filter(lambda x: x > 5) + assert result.to_list() == [6, 8, 10] diff --git a/tests/test_pipeline_integration.py b/tests/test_pipeline_integration.py new file mode 100644 index 0000000..f4425db --- /dev/null +++ b/tests/test_pipeline_integration.py @@ -0,0 +1,79 @@ +""" +Integration tests for Pipeline class. + +This module contains end-to-end integration tests for complex +Pipeline operations and realistic data processing scenarios. 
+""" + +from efemel.pipeline import Pipeline + + +class TestPipelineIntegration: + """Integration tests for Pipeline class.""" + + def test_data_processing_pipeline(self): + """Test a realistic data processing pipeline.""" + # Simulate processing a list of user data + users = [ + {"name": "Alice", "age": 30, "active": True}, + {"name": "Bob", "age": 25, "active": False}, + {"name": "Charlie", "age": 35, "active": True}, + {"name": "Diana", "age": 28, "active": True}, + {"name": "Eve", "age": 22, "active": False}, + ] + + pipeline = Pipeline(users) + + # Process: filter active users, extract names, convert to uppercase + result = pipeline.filter(lambda user: user["active"]).map(lambda user: user["name"]).map(str.upper) + + assert result.to_list() == ["ALICE", "CHARLIE", "DIANA"] + + def test_number_processing_pipeline(self): + """Test a number processing pipeline.""" + numbers = range(1, 21) # 1 to 20 + + pipeline = Pipeline(numbers) + + # Process: filter even numbers, square them, filter > 50, sum + result = ( + pipeline.filter(lambda x: x % 2 == 0) # [2, 4, 6, 8, 10, 12, 14, 16, 18, 20] + .map(lambda x: x**2) # [4, 16, 36, 64, 100, 144, 196, 256, 324, 400] + .filter(lambda x: x > 50) # [64, 100, 144, 196, 256, 324, 400] + .reduce(lambda acc, x: acc + x, 0) + ) # 1484 + + assert result.first() == 1484 + + def test_text_processing_pipeline(self): + """Test a text processing pipeline.""" + text = "Hello world! This is a test. Python is amazing." + words = text.split() + + pipeline = Pipeline(words) + + # Process: filter words > 3 chars, remove punctuation, lowercase, get unique + result = ( + pipeline.filter(lambda word: len(word) > 3) + .map(lambda word: word.strip(".,!")) + .map(str.lower) + .filter(lambda word: word not in ["this"]) + ) # Simple "unique" filter + + expected = ["hello", "world", "test", "python", "amazing"] + assert result.to_list() == expected + + def test_nested_data_processing(self): + """Test processing nested data structures.""" + data = [[1, 2, 3], [4, 5, 6], [7, 8, 9]] + + pipeline = Pipeline(data) + + # Flatten, filter odd numbers, square them + result = ( + pipeline.flatten() # [1, 2, 3, 4, 5, 6, 7, 8, 9] + .filter(lambda x: x % 2 == 1) # [1, 3, 5, 7, 9] + .map(lambda x: x**2) + ) # [1, 9, 25, 49, 81] + + assert result.to_list() == [1, 9, 25, 49, 81] diff --git a/tests/test_pipeline_map_filter.py b/tests/test_pipeline_map_filter.py new file mode 100644 index 0000000..d09e171 --- /dev/null +++ b/tests/test_pipeline_map_filter.py @@ -0,0 +1,41 @@ +""" +Test Pipeline map and filter combinations. + +This module contains tests for combining mapping and filtering operations +in various orders and combinations. 
+""" + +from efemel.pipeline import Pipeline + + +class TestPipelineMapFilter: + """Test Pipeline map and filter combinations.""" + + def test_map_then_filter(self): + """Test mapping then filtering.""" + pipeline = Pipeline([1, 2, 3, 4, 5]) + + # Double numbers, then filter even results + result = pipeline.map(lambda x: x * 2).filter(lambda x: x % 2 == 0) + assert result.to_list() == [2, 4, 6, 8, 10] + + def test_filter_then_map(self): + """Test filtering then mapping.""" + pipeline = Pipeline([1, 2, 3, 4, 5]) + + # Filter even numbers, then double them + result = pipeline.filter(lambda x: x % 2 == 0).map(lambda x: x * 2) + assert result.to_list() == [4, 8] + + def test_complex_map_filter_chain(self): + """Test complex chaining of map and filter operations.""" + pipeline = Pipeline([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) + + # Complex chain: filter odd, multiply by 3, filter > 10 + result = ( + pipeline.filter(lambda x: x % 2 == 1) # [1, 3, 5, 7, 9] + .map(lambda x: x * 3) # [3, 9, 15, 21, 27] + .filter(lambda x: x > 10) + ) # [15, 21, 27] + + assert result.to_list() == [15, 21, 27] diff --git a/tests/test_pipeline_mapping.py b/tests/test_pipeline_mapping.py new file mode 100644 index 0000000..00a0814 --- /dev/null +++ b/tests/test_pipeline_mapping.py @@ -0,0 +1,64 @@ +""" +Test Pipeline mapping functionality. + +This module contains tests for Pipeline mapping operations +including basic mapping, type transformations, and chaining. +""" + +from efemel.pipeline import Pipeline + + +class TestPipelineMapping: + """Test Pipeline mapping functionality.""" + + def test_map_basic(self): + """Test basic mapping operations.""" + # Double each number + pipeline1 = Pipeline([1, 2, 3, 4, 5]) + doubled = pipeline1.map(lambda x: x * 2) + assert doubled.to_list() == [2, 4, 6, 8, 10] + + # Square each number - use fresh pipeline + pipeline2 = Pipeline([1, 2, 3, 4, 5]) + squared = pipeline2.map(lambda x: x**2) + assert squared.to_list() == [1, 4, 9, 16, 25] + + def test_map_type_transformation(self): + """Test mapping with type transformation.""" + # Convert numbers to strings + pipeline1 = Pipeline([1, 2, 3, 4, 5]) + str_pipeline = pipeline1.map(str) + assert str_pipeline.to_list() == ["1", "2", "3", "4", "5"] + + # Convert to boolean (non-zero is True) - use fresh pipeline + pipeline2 = Pipeline([1, 2, 3, 4, 5]) + bool_pipeline = pipeline2.map(bool) + assert bool_pipeline.to_list() == [True, True, True, True, True] + + def test_map_with_strings(self): + """Test mapping with string data.""" + # Convert to uppercase + pipeline1 = Pipeline(["hello", "world", "python"]) + upper_pipeline = pipeline1.map(str.upper) + assert upper_pipeline.to_list() == ["HELLO", "WORLD", "PYTHON"] + + # Get string lengths - use fresh pipeline + pipeline2 = Pipeline(["hello", "world", "python"]) + len_pipeline = pipeline2.map(len) + assert len_pipeline.to_list() == [5, 5, 6] + + def test_map_chaining(self): + """Test chaining multiple map operations.""" + pipeline = Pipeline([1, 2, 3, 4, 5]) + + # Chain maps: double, then add 1 + result = pipeline.map(lambda x: x * 2).map(lambda x: x + 1) + assert result.to_list() == [3, 5, 7, 9, 11] + + def test_map_empty_pipeline(self): + """Test mapping on empty pipeline.""" + empty_pipeline = Pipeline([]) + + # Map should return empty result + result = empty_pipeline.map(lambda x: x * 2) + assert result.to_list() == [] diff --git a/tests/test_pipeline_reduce.py b/tests/test_pipeline_reduce.py new file mode 100644 index 0000000..a1072f7 --- /dev/null +++ b/tests/test_pipeline_reduce.py @@ -0,0 
+1,52 @@ +""" +Test Pipeline reduce functionality. + +This module contains tests for Pipeline reduce operations +including basic reduction, string operations, and edge cases. +""" + +from efemel.pipeline import Pipeline + + +class TestPipelineReduce: + """Test Pipeline reduce functionality.""" + + def test_reduce_basic(self): + """Test basic reduce operations.""" + # Sum all numbers + pipeline1 = Pipeline([1, 2, 3, 4, 5]) + sum_result = pipeline1.reduce(lambda acc, x: acc + x, 0) + assert sum_result.first() == 15 + + # Multiply all numbers - use fresh pipeline + pipeline2 = Pipeline([1, 2, 3, 4, 5]) + product_result = pipeline2.reduce(lambda acc, x: acc * x, 1) + assert product_result.first() == 120 + + def test_reduce_with_strings(self): + """Test reduce with string data.""" + # Concatenate strings + pipeline1 = Pipeline(["hello", "world", "python"]) + concat_result = pipeline1.reduce(lambda acc, x: acc + " " + x, "") + assert concat_result.first() == " hello world python" + + # Join with commas - use fresh pipeline + pipeline2 = Pipeline(["hello", "world", "python"]) + join_result = pipeline2.reduce(lambda acc, x: acc + "," + x if acc else x, "") + assert join_result.first() == "hello,world,python" + + def test_reduce_empty_pipeline(self): + """Test reduce on empty pipeline.""" + empty_pipeline = Pipeline([]) + + # Reduce should return initial value + result = empty_pipeline.reduce(lambda acc, x: acc + x, 10) + assert result.first() == 10 + + def test_reduce_single_item(self): + """Test reduce with single item.""" + single_pipeline = Pipeline([42]) + + # Should combine initial value with single item + result = single_pipeline.reduce(lambda acc, x: acc + x, 10) + assert result.first() == 52 diff --git a/tests/test_pipeline_tap.py b/tests/test_pipeline_tap.py new file mode 100644 index 0000000..d835298 --- /dev/null +++ b/tests/test_pipeline_tap.py @@ -0,0 +1,55 @@ +""" +Test Pipeline tap functionality. + +This module contains tests for Pipeline tap operations +including side effects, chaining, and data integrity. 
+""" + +from efemel.pipeline import Pipeline + + +class TestPipelineTap: + """Test Pipeline tap functionality.""" + + def test_tap_basic(self): + """Test basic tap operations.""" + pipeline = Pipeline([1, 2, 3, 4, 5]) + side_effects = [] + + # Tap to collect side effects + result = pipeline.tap(side_effects.append) + result_list = result.to_list() + + assert result_list == [1, 2, 3, 4, 5] + assert side_effects == [1, 2, 3, 4, 5] + + def test_tap_with_print(self): + """Test tap with print (no assertion needed, just verify it works).""" + pipeline = Pipeline([1, 2, 3]) + + # This should not raise any exceptions + result = pipeline.tap(lambda x: None) # Mock print + assert result.to_list() == [1, 2, 3] + + def test_tap_chaining(self): + """Test tap in a chain of operations.""" + pipeline = Pipeline([1, 2, 3, 4, 5]) + side_effects = [] + + # Tap in middle of chain + result = pipeline.map(lambda x: x * 2).tap(side_effects.append).filter(lambda x: x > 5) + + result_list = result.to_list() + + assert result_list == [6, 8, 10] + assert side_effects == [2, 4, 6, 8, 10] + + def test_tap_doesnt_modify_pipeline(self): + """Test that tap doesn't modify the pipeline data.""" + pipeline = Pipeline([1, 2, 3, 4, 5]) + + # Tap with function that would modify if it could + result = pipeline.tap(lambda x: x * 1000) + + # Data should be unchanged + assert result.to_list() == [1, 2, 3, 4, 5] diff --git a/tests/test_pipeline_utility.py b/tests/test_pipeline_utility.py new file mode 100644 index 0000000..e76447f --- /dev/null +++ b/tests/test_pipeline_utility.py @@ -0,0 +1,134 @@ +""" +Test Pipeline utility methods. + +This module contains tests for Pipeline utility operations +including passthrough, apply, flatten, and chunked processing. +""" + +from efemel.pipeline import Pipeline +from efemel.pipeline import PipelineItem + + +class TestPipelineUtility: + """Test Pipeline utility methods.""" + + def test_passthrough(self): + """Test passthrough method.""" + pipeline = Pipeline([1, 2, 3, 4, 5]) + + # Passthrough should return the same pipeline + result = pipeline.passthrough() + assert result is pipeline + + # Data should be unchanged + assert result.to_list() == [1, 2, 3, 4, 5] + + def test_apply_function(self): + """Test apply with single function.""" + pipeline = Pipeline([1, 2, 3, 4, 5]) + + def double_pipeline(p): + return p.map(lambda x: x * 2) + + result = pipeline.apply(double_pipeline) + assert result.to_list() == [2, 4, 6, 8, 10] + + def test_flatten_basic(self): + """Test basic flatten operation.""" + pipeline = Pipeline([[1, 2], [3, 4], [5]]) + + result = pipeline.flatten() + assert result.to_list() == [1, 2, 3, 4, 5] + + def test_flatten_empty_lists(self): + """Test flatten with empty lists.""" + pipeline = Pipeline([[], [1, 2], [], [3]]) + + result = pipeline.flatten() + assert result.to_list() == [1, 2, 3] + + def test_flatten_nested_tuples(self): + """Test flatten with nested tuples.""" + pipeline = Pipeline([(1, 2), (3, 4, 5), (6,)]) + + result = pipeline.flatten() + assert result.to_list() == [1, 2, 3, 4, 5, 6] + + def test_flatten_chunked_processing(self): + """Test that flatten maintains chunked processing and doesn't consume entire pipeline.""" + + # Create a large dataset that would cause OOM if consumed all at once + def generate_nested_data(): + for i in range(1000): + yield [i * 2, i * 2 + 1] + + # Use small chunk size to verify chunked processing + pipeline = Pipeline(generate_nested_data(), chunk_size=10) + result = pipeline.flatten() + + # Verify first few elements without consuming 
the entire pipeline
+        first_10 = []
+        count = 0
+        for item in result:
+            first_10.append(item)
+            count += 1
+            if count >= 10:
+                break
+
+        assert first_10 == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
+
+    def test_flatten_preserves_chunk_structure(self):
+        """Test that flatten preserves chunked structure."""
+        # Create pipeline with known chunk structure
+        data = [[1, 2], [3, 4], [5, 6], [7, 8]]
+        pipeline = Pipeline(data, chunk_size=2)  # 2 sublists per chunk
+
+        result = pipeline.flatten()
+
+        # Verify the result is correct
+        assert result.to_list() == [1, 2, 3, 4, 5, 6, 7, 8]
+
+    def test_flatten_maintains_chunk_size(self):
+        """Test that flatten maintains the original chunk_size setting."""
+        # Create test data with varying sizes of sublists
+        data = [[1, 2], [3, 4, 5], [6], [7, 8, 9, 10]]
+        chunk_size = 3
+        pipeline = Pipeline(data, chunk_size=chunk_size)
+
+        # Flatten the pipeline
+        flattened = pipeline.flatten()
+
+        # Verify chunk_size is preserved
+        assert flattened.chunk_size == chunk_size
+
+        # Collect chunks to verify structure
+        chunks = []
+        for chunk in flattened.generator:
+            chunks.append(chunk)
+
+        # Verify all chunks except the last have the expected size
+        for i, chunk in enumerate(chunks[:-1]):
+            assert len(chunk) == chunk_size, f"Chunk {i} has size {len(chunk)}, expected {chunk_size}"
+
+        # Last chunk can be smaller than chunk_size
+        if chunks:
+            assert len(chunks[-1]) <= chunk_size, f"Last chunk has size {len(chunks[-1])}, should be <= {chunk_size}"
+
+        # Verify the final result is correct
+        result = []
+        for chunk in chunks:
+            result.extend(chunk)
+
+        # This shows the internal representation of the data, where the second value of the tuple is the exception
+        assert result == [
+            PipelineItem(1, None, {"has_errors": False, "errors": []}),
+            PipelineItem(2, None, {"has_errors": False, "errors": []}),
+            PipelineItem(3, None, {"has_errors": False, "errors": []}),
+            PipelineItem(4, None, {"has_errors": False, "errors": []}),
+            PipelineItem(5, None, {"has_errors": False, "errors": []}),
+            PipelineItem(6, None, {"has_errors": False, "errors": []}),
+            PipelineItem(7, None, {"has_errors": False, "errors": []}),
+            PipelineItem(8, None, {"has_errors": False, "errors": []}),
+            PipelineItem(9, None, {"has_errors": False, "errors": []}),
+            PipelineItem(10, None, {"has_errors": False, "errors": []}),
+        ]

From 3de3b120e3e1c6abfec2afddda6305b8c818e2ee Mon Sep 17 00:00:00 2001
From: ringoldsdev
Date: Fri, 11 Jul 2025 22:12:38 +0000
Subject: [PATCH 09/17] fix: context is properly merged back

---
 efemel/pipeline.py             | 38 ++++++++++++----
 tests/test_pipeline_context.py | 80 ++++++++++++++++++++++++++++++++++
 2 files changed, 110 insertions(+), 8 deletions(-)

diff --git a/efemel/pipeline.py b/efemel/pipeline.py
index 8da2c4a..d61e9a7 100644
--- a/efemel/pipeline.py
+++ b/efemel/pipeline.py
@@ -226,9 +226,6 @@ def first(self) -> T:
 
     def map[U](self, function: Callable[[T], U] | Callable[[T, PipelineContext], U]) -> "Pipeline[U]":
         """Transform elements, capturing any exceptions that occur."""
 
-        # Create a standardized function that always takes just value
-        std_function = _create_context_aware_function(function, self.context)
-
         def map_generator() -> Generator[list[PipelineItem[U]], None, None]:
             for chunk in self.generator:
                 new_chunk: list[PipelineItem[U]] = []
@@ -239,6 +236,8 @@ def map_generator() -> Generator[list[PipelineItem[U]], None, None]:
                         self.context["has_errors"] = True
                     else:
                         try:
+                            # Create a standardized function using the item's specific context
+                            std_function =
_create_context_aware_function(function, context) # Apply the standardized function result = std_function(value) new_chunk.append(PipelineItem(result, None, context)) @@ -432,21 +431,27 @@ def concurrent( # Thread-safe context merging context_lock = threading.Lock() merged_errors: list[dict[str, Any]] = [] + isolated_contexts: list[PipelineContext] = [] def apply_to_chunk(chunk: list[PipelineItem[T]]) -> list[PipelineItem[U]]: - # Create an isolated context for this thread to avoid race conditions + # Create an isolated context for this thread, copying the main context isolated_context = PipelineContext(has_errors=False, errors=[]) + # Copy any existing context fields from main context + isolated_context.update(self.context) + isolated_context["has_errors"] = False + isolated_context["errors"] = [] # Create a mini-pipeline from the single chunk with isolated context chunk_pipeline = Pipeline._from_chunks([chunk], len(chunk), isolated_context) processed_pipeline = pipeline_func(chunk_pipeline) - # Collect results and merge context thread-safely + # Collect results and store context for later merging result = [item for processed_chunk in processed_pipeline.generator for item in processed_chunk] - # Thread-safe context merging - if isolated_context["has_errors"] or isolated_context["errors"]: - with context_lock: + # Thread-safe context collection + with context_lock: + isolated_contexts.append(isolated_context) + if isolated_context["has_errors"] or isolated_context["errors"]: self.context["has_errors"] = True merged_errors.extend(isolated_context["errors"]) @@ -487,6 +492,23 @@ def gen(executor): with context_lock: self.context["errors"].extend(merged_errors) + # Merge all context fields from isolated contexts + for isolated_context in isolated_contexts: + for key, value in isolated_context.items(): + if key not in ["has_errors", "errors"]: # Skip standard fields already handled + if key in self.context: + # If the field already exists, merge it intelligently + if isinstance(value, list) and isinstance(self.context[key], list): + self.context[key].extend(value) + elif isinstance(value, dict) and isinstance(self.context[key], dict): + self.context[key].update(value) + else: + # For other types, use the latest value + self.context[key] = value + else: + # New field, just add it + self.context[key] = value + return result_pipeline # All other methods like tap, noop, etc. can be adapted similarly diff --git a/tests/test_pipeline_context.py b/tests/test_pipeline_context.py index 89ba9e4..2c85be8 100644 --- a/tests/test_pipeline_context.py +++ b/tests/test_pipeline_context.py @@ -5,6 +5,9 @@ including context propagation and sharing between operations. 
""" +import pytest + +from efemel.pipeline import CompoundError from efemel.pipeline import Pipeline @@ -44,3 +47,80 @@ def set_context_once(value, context): # Verify the function was called for each item assert context_access_count == 5 + + def test_concurrent_context_isolation(self): + """Test that concurrent processing properly isolates and merges contexts.""" + # Create a pipeline with some data + data = list(range(100)) # Large enough to ensure multiple chunks + pipeline = Pipeline(data, chunk_size=10) # Force multiple chunks + + def error_prone_pipeline(p): + """A pipeline that will cause errors on certain values.""" + return p.map(lambda x: x if x % 7 != 0 else 1 / 0) # Error on multiples of 7 + + def safe_pipeline(p): + """A pipeline that should work without errors.""" + return p.map(lambda x: x * 2) + + # Test with error-prone pipeline + with pytest.raises(CompoundError): + pipeline.concurrent(error_prone_pipeline, max_workers=4).to_list() + + # Check that the context properly recorded errors + assert pipeline.context["has_errors"] + + # Test with safe pipeline + safe_data = list(range(50)) + safe_pipeline_obj = Pipeline(safe_data, chunk_size=5) + + result = safe_pipeline_obj.concurrent(safe_pipeline, max_workers=4).to_list() + expected = [x * 2 for x in safe_data] + + assert result == expected + assert not safe_pipeline_obj.context["has_errors"] + + def test_concurrent_vs_sequential_context(self): + """Test that concurrent and sequential processing produce the same results.""" + data = list(range(20)) + + def transform_pipeline(p): + return p.map(lambda x: x * 3).filter(lambda x: x % 2 == 0) + + # Sequential + sequential_result = Pipeline(data).apply(transform_pipeline).to_list() + + # Concurrent + concurrent_result = Pipeline(data, chunk_size=5).concurrent(transform_pipeline, max_workers=3).to_list() + + assert sequential_result == concurrent_result + + def test_context_modifications_merged_in_concurrent_operations(self): + """Test that context modifications in concurrent processing are properly merged back.""" + data = list(range(20)) + + def context_modifying_pipeline(p): + """Pipeline that modifies context during processing.""" + + def modify_context(value, context): + # Modify context - these changes should be merged back + if "processed_values" not in context: + context["processed_values"] = [] + context["processed_values"].append(value) + return value * 2 + + return p.map(modify_context) + + pipeline = Pipeline(data, chunk_size=5) + result = pipeline.concurrent(context_modifying_pipeline, max_workers=2).to_list() + + # Verify results are correct + expected = [x * 2 for x in data] + assert result == expected + + # Context modifications should be merged back from all threads + # Each thread processes its chunk and adds values to context + assert "processed_values" in pipeline.context + assert len(pipeline.context["processed_values"]) > 0 + # Should have all processed values (order may vary due to concurrency) + assert set(pipeline.context["processed_values"]) == set(data) + assert not pipeline.context["has_errors"] From 1330235641af0ec7fee21cb8a5d38b2e11b5e390 Mon Sep 17 00:00:00 2001 From: ringoldsdev Date: Mon, 14 Jul 2025 14:37:44 +0300 Subject: [PATCH 10/17] chore: performance tests --- performance_test.py | 561 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 561 insertions(+) create mode 100644 performance_test.py diff --git a/performance_test.py b/performance_test.py new file mode 100644 index 0000000..0396d94 --- /dev/null +++ b/performance_test.py @@ 
-0,0 +1,561 @@ +#!/usr/bin/env python3 +""" +Performance test for Pipeline class comparing different data processing approaches. + +This test compares: +1. Pipeline class operations +2. Chained generator expressions with for/yield +3. Itertools map/filter chaining + +Tests are run on 1 million items, 100 times each to get statistical data. +""" + +import itertools +import statistics +import time +from itertools import islice + +def generate_test_data(size: int = 1_000_000) -> list[int]: + """Generate test data of specified size.""" + return range(size) + + +def generator_approach(data: list[int]) -> list[int]: + """Process data using chained generator expressions.""" + + def step1(items): + """Filter even numbers.""" + for item in items: + if item % 2 == 0: + yield item + + def step2(items): + """Double the numbers.""" + for item in items: + yield item * 2 + + def step3(items): + """Filter > 100.""" + for item in items: + if item > 100: + yield item + + def step4(items): + """Add 1.""" + for item in items: + yield item + 1 + + return list(step4(step3(step2(step1(data))))) + + +def builtin_map_filter_approach(data: list[int]) -> list[int]: + """Process data using built-in map/filter chaining.""" + result = filter(lambda x: x % 2 == 0, data) # Keep even numbers + result = map(lambda x: x * 2, result) # Double them + result = filter(lambda x: x > 100, result) # Keep only > 100 + result = map(lambda x: x + 1, result) # Add 1 + return list(result) + + +def generator_expression_approach(data: list[int]) -> list[int]: + """Process data using generator expressions.""" + result = (x for x in data if x % 2 == 0) # Keep even numbers + result = (x * 2 for x in result) # Double them + result = (x for x in result if x > 100) # Keep only > 100 + result = (x + 1 for x in result) # Add 1 + return list(result) + + +def list_comprehension_approach(data: list[int]) -> list[int]: + """Process data using separate list comprehensions with intermediate lists.""" + # Create intermediate lists at each step to match other approaches + step1 = [x for x in data if x % 2 == 0] # Filter even numbers + step2 = [x * 2 for x in step1] # Double them + step3 = [x for x in step2 if x > 100] # Filter > 100 + step4 = [x + 1 for x in step3] # Add 1 + return step4 + + +def chunked_generator_listcomp_approach(data: list[int]) -> list[int]: + """Process data using chunked generators with intermediate lists per chunk.""" + + def chunk_generator(data, chunk_size=1000): + """Generate chunks using a generator.""" + for i in range(0, len(data), chunk_size): + yield data[i : i + chunk_size] + + # Process each chunk with intermediate lists and combine + results = [] + for chunk in chunk_generator(data): + # Create intermediate lists for each chunk + step1 = [x for x in chunk if x % 2 == 0] # Filter even numbers + step2 = [x * 2 for x in step1] # Double them + step3 = [x for x in step2 if x > 100] # Filter > 100 + step4 = [x + 1 for x in step3] # Add 1 + results.extend(step4) + return results + + +def mutated_chunked_generator_listcomp_approach(data: list[int]) -> list[int]: + """Process data using chunked generators with intermediate lists per chunk.""" + + def chunk_generator(data, chunk_size=1000): + """Generate chunks using a generator.""" + while True: + chunk = list(islice(data, chunk_size)) + if not chunk: + return + yield chunk + + # Process each chunk with intermediate lists and combine + results = [] + for chunk in chunk_generator(data): + # Create intermediate lists for each chunk + step = [x for x in chunk if x % 2 == 0] # 
Filter even numbers + step = [x * 2 for x in step] # Double them + step = [x for x in step if x > 100] # Filter > 100 + step = [x + 1 for x in step] # Add 1 + results.extend(step) + return results + + +class ChunkedPipeline: + """ + A chunked pipeline that composes operations into a single function, + inspired by a callback-chaining pattern. + """ + + def __init__(self, chunk_size=1000): + """Initialize the pipeline with data and chunk size.""" + self.chunk_size = chunk_size + # The transformer starts as an identity function that does nothing. + self.transformer = lambda chunk: chunk + + def _chunk_generator(self, data): + """Generate chunks from the data.""" + data_iter = iter(data) + while True: + chunk = list(itertools.islice(data_iter, self.chunk_size)) + if not chunk: + break + yield chunk + + def pipe(self, operation): + """ + Composes the current transformer with a new chunk-wise operation. + The operation should be a function that takes a list and returns a list. + """ + # Capture the current transformer + prev_transformer = self.transformer + # Create a new transformer that first calls the old one, then the new operation + self.transformer = lambda chunk: operation(prev_transformer(chunk)) + return self + + def filter(self, predicate): + """Adds a filter operation by wrapping the current transformer in a list comprehension.""" + return self.pipe(lambda chunk: [x for x in chunk if predicate(x)]) + + def map(self, func): + """Adds a map operation by wrapping the current transformer in a list comprehension.""" + return self.pipe(lambda chunk: [func(x) for x in chunk]) + + def run(self, data): + """ + Execute the pipeline by calling the single, composed transformer on each chunk. + """ + for chunk in self._chunk_generator(data): + # The transformer is a single function that contains all the nested operations. + yield from self.transformer(chunk) + + +PIPELINE = ( + ChunkedPipeline() + .filter(lambda x: x % 2 == 0) # Filter even numbers + .map(lambda x: x * 2) # Double them + .filter(lambda x: x > 100) # Filter > 100 + .map(lambda x: x + 1) +) + + +class ChunkedPipelineGenerator: + """ + A chunked pipeline that composes operations into a single generator function, + using lazy evaluation throughout the pipeline. + """ + + def __init__(self, chunk_size=1000): + """Initialize the pipeline with chunk size.""" + self.chunk_size = chunk_size + # The transformer starts as an identity generator that yields items as-is + self.transformer = lambda chunk: (x for x in chunk) + + def _chunk_generator(self, data): + """Generate chunks from the data.""" + data_iter = iter(data) + while True: + chunk = list(itertools.islice(data_iter, self.chunk_size)) + if not chunk: + break + yield chunk + + def pipe(self, operation): + """ + Composes the current transformer with a new chunk-wise operation. + The operation should be a function that takes a generator and returns a generator. 
+ """ + # Capture the current transformer + prev_transformer = self.transformer + # Create a new transformer that first calls the old one, then the new operation + self.transformer = lambda chunk: operation(prev_transformer(chunk)) + return self + + def filter(self, predicate): + """Adds a filter operation using generator expression.""" + return self.pipe(lambda gen: (x for x in gen if predicate(x))) + + def map(self, func): + """Adds a map operation using generator expression.""" + return self.pipe(lambda gen: (func(x) for x in gen)) + + def run(self, data): + """ + Execute the pipeline by calling the single, composed transformer on each chunk. + Yields items lazily from the generator pipeline. + """ + for chunk in self._chunk_generator(data): + # The transformer is a single function that contains all the nested generator operations + yield from self.transformer(chunk) + + +# Create a generator-based pipeline instance +PIPELINE_GENERATOR = ( + ChunkedPipelineGenerator() + .filter(lambda x: x % 2 == 0) # Filter even numbers + .map(lambda x: x * 2) # Double them + .filter(lambda x: x > 100) # Filter > 100 + .map(lambda x: x + 1) +) + + +def chunked_pipeline_approach(data: list[int]) -> list[int]: + """Process data using the ChunkedPipeline class.""" + return list(PIPELINE.run(data)) + + +def chunked_pipeline_generator_approach(data: list[int]) -> list[int]: + """Process data using the ChunkedPipelineGenerator class.""" + return list(PIPELINE_GENERATOR.run(data)) + + +class ChunkedPipelineSimple: + """ + A simple chunked pipeline that stores operations in a list and applies them sequentially + using list comprehensions when executed. + """ + + def __init__(self, chunk_size=1000): + """Initialize the pipeline with chunk size.""" + self.chunk_size = chunk_size + self.operations = [] + + def _chunk_generator(self, data): + """Generate chunks from the data.""" + data_iter = iter(data) + while True: + chunk = list(itertools.islice(data_iter, self.chunk_size)) + if not chunk: + break + yield chunk + + def pipe(self, operation): + """ + Add an operation to the pipeline. + The operation should be a function that takes a list and returns a list. + """ + self.operations.append(operation) + return self + + def filter(self, predicate): + """Add a filter operation to the pipeline.""" + return self.pipe(lambda chunk: [x for x in chunk if predicate(x)]) + + def map(self, func): + """Add a map operation to the pipeline.""" + return self.pipe(lambda chunk: [func(x) for x in chunk]) + + def run(self, data): + """ + Execute the pipeline by applying all operations sequentially to each chunk. + """ + for chunk in self._chunk_generator(data): + # Apply all operations sequentially to the chunk + current_chunk = chunk + for operation in self.operations: + current_chunk = operation(current_chunk) + + # Yield each item from the processed chunk + yield from current_chunk + + +# Create a simple pipeline instance +PIPELINE_SIMPLE = ( + ChunkedPipelineSimple() + .filter(lambda x: x % 2 == 0) # Filter even numbers + .map(lambda x: x * 2) # Double them + .filter(lambda x: x > 100) # Filter > 100 + .map(lambda x: x + 1) +) + + +def chunked_pipeline_simple_approach(data: list[int]) -> list[int]: + """Process data using the ChunkedPipelineSimple class.""" + return list(PIPELINE_SIMPLE.run(data)) + +# Sentinel value to indicate that an item has been filtered out. 
+_SKIPPED = object() + +class ChunkedPipelinePerItem: + """ + A chunked pipeline that composes operations into a single function + operating on individual items, inspired by a callback-chaining pattern + to reduce the creation of intermediate lists. + """ + + def __init__(self, chunk_size=1000): + """Initialize the pipeline with a specified chunk size.""" + self.chunk_size = chunk_size + # The transformer starts as an identity function. + self.transformer = lambda item: item + + def _chunk_generator(self, data): + """A generator that yields chunks of data.""" + data_iter = iter(data) + while True: + chunk = list(itertools.islice(data_iter, self.chunk_size)) + if not chunk: + break + yield chunk + + def pipe(self, operation): + """ + Composes the current transformer with a new item-wise operation. + This internal method is the core of the pipeline's composition logic. + """ + prev_transformer = self.transformer + + def new_transformer(item): + # Apply the existing chain of transformations. + processed_item = prev_transformer(item) + + # If a previous operation (like a filter) already skipped this item, + # we bypass the new operation entirely. + if processed_item is _SKIPPED: + return _SKIPPED + + # Apply the new operation to the result of the previous ones. + return operation(processed_item) + + self.transformer = new_transformer + return self + + def filter(self, predicate): + """ + Adds a filter operation to the pipeline. + + If the predicate returns `False`, the item is marked as skipped, + and no further operations in the chain will be executed on it. + """ + def filter_operation(item): + return item if predicate(item) else _SKIPPED + + return self.pipe(filter_operation) + + def map(self, func): + """Adds a map operation to transform an item.""" + return self.pipe(func) + + def run(self, data): + """ + Executes the pipeline. + + The composed transformer function is applied to each item individually. + Results are yielded only if they haven't been marked as skipped. 
+ """ + for chunk in self._chunk_generator(data): + yield from [result for item in chunk if (result := self.transformer(item)) is not _SKIPPED] + +PIPELINE_PER_ITEM = ( + ChunkedPipelinePerItem() + .filter(lambda x: x % 2 == 0) # Filter even numbers + .map(lambda x: x * 2) # Double them + .filter(lambda x: x > 100) # Filter > 100 + .map(lambda x: x + 1) +) + +def chunked_pipeline_per_item_approach(data) -> list[int]: + """Process data using the ChunkedPipelinePerItem class.""" + return list(PIPELINE_PER_ITEM.run(data)) + +def time_function(func, *args, **kwargs) -> float: + """Time a function execution and return duration in seconds.""" + start_time = time.perf_counter() + func(*args, **kwargs) + end_time = time.perf_counter() + return end_time - start_time + + +def run_performance_test(): + """Run comprehensive performance test.""" + print("🚀 Starting Performance Test") + print("=" * 60) + + # Test configurations + approaches = { + # "Generators": generator_approach, + # "Map/Filter": builtin_map_filter_approach, + # "GeneratorExpression": generator_expression_approach, + "ChunkedPipeline": chunked_pipeline_approach, + "ChunkedPipelinePerItem": chunked_pipeline_per_item_approach, + "ChunkedPipelineGenerator": chunked_pipeline_generator_approach, + "ChunkedPipelineSimple": chunked_pipeline_simple_approach, + "ListComprehension": list_comprehension_approach, + "ChunkedGeneratorListComp": chunked_generator_listcomp_approach, + # "MutatedChunkedGeneratorListComp": mutated_chunked_generator_listcomp_approach, + } + + num_runs = 50 + results = {} + + for name, approach_func in approaches.items(): + print(f"\n🔄 Testing {name} approach ({num_runs} runs)...") + times = [] + + # Warm up + approach_func(generate_test_data(1_000_000)) + + # Actual timing runs + for run in range(num_runs): + print(f" Run {run + 1}/{num_runs}") + + duration = time_function(approach_func, generate_test_data(1_000_000)) + times.append(duration) + + # Calculate statistics + results[name] = { + "times": times, + "min": min(times), + "max": max(times), + "avg": statistics.mean(times), + "p95": statistics.quantiles(times, n=20)[18], # 95th percentile + "p99": statistics.quantiles(times, n=100)[98], # 99th percentile + "median": statistics.median(times), + "stdev": statistics.stdev(times) if len(times) > 1 else 0, + } # Print results + print("\n" + "=" * 60) + print("📈 PERFORMANCE RESULTS") + print("=" * 60) + + # Header + header = ( + f"{'Approach':<12} {'Min (s)':<10} {'Max (s)':<10} {'Avg (s)':<10} " + f"{'P95 (s)':<10} {'P99 (s)':<10} {'Median (s)':<12} {'StdDev':<10}" + ) + print(header) + print("-" * 104) + + # Sort by average time + sorted_results = sorted(results.items(), key=lambda x: x[1]["avg"]) + + for name, stats in sorted_results: + print( + f"{name:<12} {stats['min']:<10.4f} {stats['max']:<10.4f} {stats['avg']:<10.4f} " + f"{stats['p95']:<10.4f} {stats['p99']:<10.4f} {stats['median']:<12.4f} {stats['stdev']:<10.4f}" + ) + + # Relative performance + print("\n" + "=" * 60) + print("🏆 RELATIVE PERFORMANCE") + print("=" * 60) + + baseline = sorted_results[0][1]["avg"] # Fastest approach as baseline + + for name, stats in sorted_results: + ratio = stats["avg"] / baseline + percentage = (ratio - 1) * 100 + if ratio == 1.0: + print(f"{name:<12} 1.00x (baseline)") + else: + print(f"{name:<12} {ratio:.2f}x ({percentage:+.1f}% slower)") + + # Verify correctness + print("\n" + "=" * 60) + print("✅ CORRECTNESS VERIFICATION") + print("=" * 60) + + # Use smaller dataset for verification + results_correctness = {} + + for 
name, approach_func in approaches.items(): + result = approach_func(generate_test_data(1_000)) + results_correctness[name] = result + print(f"{name:<12} Result length: {len(result)}") + + # Check if all approaches produce the same result + first_result = next(iter(results_correctness.values())) + all_same = all(result == first_result for result in results_correctness.values()) + + if all_same: + print("✅ All approaches produce identical results") + print(f" Sample result (first 10): {first_result[:10]}") + else: + print("❌ Approaches produce different results!") + for name, result in results_correctness.items(): + print(f" {name}: {result[:10]} (length: {len(result)})") + + +def run_memory_test(): + """Run a quick memory efficiency comparison.""" + print("\n" + "=" * 60) + print("💾 MEMORY EFFICIENCY TEST") + print("=" * 60) + + import tracemalloc + + approaches = { + # "Generators": generator_approach, + # "MapFilter": builtin_map_filter_approach, + # "GeneratorExpression": generator_expression_approach, + "ChunkedPipeline": chunked_pipeline_approach, + "ChunkedPipelinePerItem": chunked_pipeline_per_item_approach, + "ChunkedPipelineGenerator": chunked_pipeline_generator_approach, + "ChunkedPipelineSimple": chunked_pipeline_simple_approach, + "ListComprehension": list_comprehension_approach, + "ChunkedGeneratorListComp": chunked_generator_listcomp_approach, + # "MutatedChunkedGeneratorListComp": mutated_chunked_generator_listcomp_approach, + } + + for name, approach_func in approaches.items(): + tracemalloc.start() + + result = approach_func(generate_test_data(100_000)) + + current, peak = tracemalloc.get_traced_memory() + tracemalloc.stop() + + print( + f"{name:<12} Peak memory: {peak / 1024 / 1024:.2f} MB, " + f"Current: {current / 1024 / 1024:.2f} MB, " + f"Result length: {len(result)}" + ) + + +if __name__ == "__main__": + try: + run_performance_test() + run_memory_test() + print("\n🎉 ChunkedPipeline test completed successfully!") + except KeyboardInterrupt: + print("\n⚠️ Test interrupted by user") + except Exception as e: + print(f"\n❌ Test failed with error: {e}") + raise From 467ae3ab4dc0757b372f3ae34843a8ee9da178c0 Mon Sep 17 00:00:00 2001 From: ringoldsdev Date: Mon, 14 Jul 2025 20:31:27 +0000 Subject: [PATCH 11/17] feat: refactored pipeline to be list comprehension based --- efemel/pipeline.py | 707 +++++++++-------------------- performance_test.py | 191 ++++---- pipeline.py | 263 +++++++++++ tests/test_pipeline.py | 0 tests/test_pipeline_basics.py | 76 ++-- tests/test_pipeline_catch.py | 214 --------- tests/test_pipeline_composition.py | 145 +++--- tests/test_pipeline_each.py | 16 +- tests/test_pipeline_edge_cases.py | 50 +- tests/test_pipeline_filtering.py | 32 +- tests/test_pipeline_integration.py | 18 +- tests/test_pipeline_map_filter.py | 12 +- tests/test_pipeline_mapping.py | 40 +- tests/test_pipeline_reduce.py | 38 +- tests/test_pipeline_tap.py | 16 +- tests/test_pipeline_utility.py | 125 ++--- 16 files changed, 839 insertions(+), 1104 deletions(-) create mode 100644 pipeline.py create mode 100644 tests/test_pipeline.py delete mode 100644 tests/test_pipeline_catch.py diff --git a/efemel/pipeline.py b/efemel/pipeline.py index d61e9a7..6aa910b 100644 --- a/efemel/pipeline.py +++ b/efemel/pipeline.py @@ -1,566 +1,263 @@ -""" -Pipeline module for functional data processing with chunked processing and robust error handling. 
- -This module provides a Pipeline class that enables functional programming patterns -for data transformation and processing, using internal chunking and optional -concurrent execution for performance. -""" - from collections import deque from collections.abc import Callable -from collections.abc import Generator from collections.abc import Iterable +from collections.abc import Iterator from concurrent.futures import FIRST_COMPLETED -from concurrent.futures import ThreadPoolExecutor +from concurrent.futures import ProcessPoolExecutor from concurrent.futures import wait +from functools import reduce import inspect -import threading +import itertools from typing import Any -from typing import NamedTuple -from typing import Self -from typing import TypedDict -from typing import TypeVar from typing import Union from typing import overload -# Type variable for the value part of the internal tuple -T = TypeVar("T") -# Type variable for the transformed value part -U = TypeVar("U") +# Type aliases for pipeline functions +type PipelineFunction[Out, T] = Callable[[Out], T] | Callable[[Out, PipelineContext], T] +type PipelineReduceFunction[U, Out] = Callable[[U, Out], U] | Callable[[U, Out, PipelineContext], U] -def _create_context_aware_function(func: Callable, context: "PipelineContext") -> Callable[[Any], Any]: - """ - Create a standardized function that always takes just a value parameter. +class PipelineContext(dict): + """Global context available to all pipeline operations.""" - This builder analyzes the user function once and returns a wrapper that: - - Calls func(value) if the original function takes 1 parameter - - Calls func(value, context) if the original function takes 2+ parameters + pass - Args: - func: The user-provided function - context: The pipeline context to pass if needed - Returns: - A function that takes only a value parameter +class Pipeline[In, Out]: """ - try: - sig = inspect.signature(func) - param_count = len([p for p in sig.parameters.values() if p.kind in (p.POSITIONAL_ONLY, p.POSITIONAL_OR_KEYWORD)]) - - if param_count >= 2: - # Function accepts context - return lambda value: func(value, context) - else: - # Function takes only value - return lambda value: func(value) - except (ValueError, TypeError): - # Fall back to single parameter for lambda or built-in functions - return lambda value: func(value) - - -def _create_reduce_function(func: Callable, context: "PipelineContext") -> Callable[[Any, Any], Any]: + A functional pipeline for data processing with internal chunked processing. + + The Pipeline class provides a fluent interface for applying transformations + and filters. It processes data in chunks for performance and maintains a + global context for shared state. The pipeline is callable and iterable. """ - Create a standardized reduce function that always takes (accumulator, value) parameters. 
- This builder analyzes the user function once and returns a wrapper that: - - Calls func(acc, value) if the original function takes 2 parameters - - Calls func(acc, value, context) if the original function takes 3+ parameters + def __init__( + self, + chunk_size: int = 1000, + transformer: Callable[[list[In]], list[Out]] | None = None, + context: PipelineContext | None = None, + ) -> None: + self.chunk_size = chunk_size + self.context = context or PipelineContext() + self.transformer: Callable[[list[In]], list[Out]] = transformer or (lambda chunk: chunk) - Args: - func: The user-provided reduce function - context: The pipeline context to pass if needed + @classmethod + def init[T](cls, _type_hint: type[T], chunk_size: int = 1000) -> "Pipeline[T, T]": + """Create a new identity pipeline with explicit type hint.""" + return cls(chunk_size) - Returns: - A function that takes (accumulator, value) parameters - """ - try: - sig = inspect.signature(func) - param_count = len([p for p in sig.parameters.values() if p.kind in (p.POSITIONAL_ONLY, p.POSITIONAL_OR_KEYWORD)]) - - if param_count >= 3: - # Function accepts context as third parameter - return lambda acc, value: func(acc, value, context) - else: - # Function takes only (acc, value) - return lambda acc, value: func(acc, value) - except (ValueError, TypeError): - # Fall back to 2-parameter call for lambda or built-in functions - return lambda acc, value: func(acc, value) - - -class PipelineContext(TypedDict): - """Global context available to all pipeline operations.""" + def __call__(self, *data: Iterable[In]) -> Iterator[Out]: + """Execute the pipeline on data source (makes pipeline callable).""" + assert data, "Data source must be provided to execute the pipeline" - has_errors: bool - errors: list[dict[str, Any]] + for selected_data in data: + for chunk in self._chunk_generator(selected_data): + yield from self.transformer(chunk) + def pipe[U](self, operation: Callable[[list[Out]], list[U]]) -> "Pipeline[In, U]": + """ + Composes the current transformer with a new chunk-wise operation. + The operation should be a function that takes a list and returns a list. 
+ """ + prev_transformer = self.transformer + self.transformer = lambda chunk: operation(prev_transformer(chunk)) + return self -# Define a type for the internal data structure: a tuple of (value, error) -class PipelineItem(NamedTuple): - """Represents an item in the pipeline.""" + def _create_context_aware_function(self, func: Callable): + """Create function that properly handles context parameter""" + try: + sig = inspect.signature(func) + params = [p for p in sig.parameters.values() if p.kind in (p.POSITIONAL_ONLY, p.POSITIONAL_OR_KEYWORD)] + param_count = len(params) + + if param_count >= 2: + return lambda value: func(value, self.context) + return func + except (ValueError, TypeError): + return func + + def _create_reduce_function(self, func: Callable): + """Create function that properly handles context parameter for reduce operations""" + try: + sig = inspect.signature(func) + params = [p for p in sig.parameters.values() if p.kind in (p.POSITIONAL_ONLY, p.POSITIONAL_OR_KEYWORD)] + param_count = len(params) + + if param_count >= 3: + return lambda acc, value: func(acc, value, self.context) + return func + except (ValueError, TypeError): + return func + + def map[U](self, function: PipelineFunction[Out, U]) -> "Pipeline[In, U]": + """Transform elements in the pipeline.""" + std_function = self._create_context_aware_function(function) + return self.pipe(lambda chunk: [std_function(x) for x in chunk]) + + def filter(self, predicate: PipelineFunction[Out, bool]) -> "Pipeline[In, Out]": + """Filter elements in the pipeline.""" + std_predicate = self._create_context_aware_function(predicate) + return self.pipe(lambda chunk: [x for x in chunk if std_predicate(x)]) + + def _chunk_generator(self, data: Iterable[In]) -> Iterator[list[In]]: + """Generate chunks from the data source""" + data_iter = iter(data) + while chunk := list(itertools.islice(data_iter, self.chunk_size)): + yield chunk - value: T - error: Exception | None - context: PipelineContext + def to_list(self, *data: Iterable[In]) -> list[Out]: + """Execute pipeline and return results as list.""" + return [item for d in data for chunk in self._chunk_generator(d) for item in self.transformer(chunk)] + def each(self, *data: Iterable[In], function: PipelineFunction[Out, Any]) -> None: + """Apply function to each element (terminal operation).""" + std_function = self._create_context_aware_function(function) -class CompoundError(Exception): - """An exception that aggregates multiple underlying errors from a pipeline.""" + for chunk in self._chunk_generator(itertools.chain.from_iterable(data)): + [std_function(item) for item in self.transformer(chunk)] - def __init__(self, errors: list[Exception]): - self.errors = errors - super().__init__(self._format_message()) + def tap(self, function: PipelineFunction[Out, Any]) -> "Pipeline[In, Out]": + """Apply side-effect without modifying data.""" + std_function = self._create_context_aware_function(function) - def _format_message(self) -> str: - """Create a summary message for all the contained errors.""" - message = f"{len(self.errors)} error(s) occurred in the pipeline:\n" - for i, error in enumerate(self.errors, 1): - message += f" {i}. 
{type(error).__name__}: {error}\n" - return message + def tap_operation(chunk: list[Out]) -> list[Out]: + for item in chunk: + std_function(item) + return chunk - def __repr__(self) -> str: - return f"CompoundError(errors={self.errors!r})" + return self.pipe(tap_operation) + def consume(self, *data: Iterable[In]) -> None: + """Consume the pipeline without returning results (terminal operation).""" + for _ in self(*data): + pass -class Pipeline[T]: - """ - A functional pipeline for data processing with internal chunked processing and error handling. + @overload + def flatten(self: "Pipeline[list[In]]") -> "Pipeline[In]": ... - The Pipeline class wraps an iterable and provides a fluent interface for - applying transformations and filters. It processes data in chunks for performance, - captures exceptions without halting execution, and reports all errors at the end. + @overload + def flatten(self: "Pipeline[tuple[In, ...]]") -> "Pipeline[In]": ... - Type Parameters: - T: The type of elements being processed in the pipeline. + @overload + def flatten(self: "Pipeline[set[In]]") -> "Pipeline[In]": ... - Attributes: - generator: A generator yielding chunks of pipeline items. - chunk_size: The number of elements to process in each chunk. - context: A dictionary for global pipeline state. - """ + def flatten( + self: Union["Pipeline[list[In], In]", "Pipeline[tuple[In, ...]]", "Pipeline[set[In]]"], + ) -> "Pipeline[In, In]": + """Flatten nested lists into a single list.""" + return self.pipe(lambda chunk: [item for sublist in chunk for item in sublist]) - generator: Generator[list[PipelineItem[T]], None, None] - context: PipelineContext + def apply[T](self, pipeline: "Pipeline[Out, T]") -> "Pipeline[In, T]": + """Apply another pipeline to the current one.""" + return self.pipe(lambda chunk: pipeline.transformer(chunk)) - def __init__(self, source: Union[Iterable[T], "Pipeline[T]"], chunk_size: int = 1000) -> None: - """ - Initialize a new Pipeline with the given data source. + def first(self, *data: Iterable[In], n: int = 1) -> list[Out]: + """Get the first n elements of the pipeline (terminal operation).""" + assert n >= 1, "n must be at least 1" + data_iter = itertools.chain.from_iterable(data) + result = [] + remaining = n - Args: - source: An iterable or another Pipeline. - chunk_size: The number of elements per chunk (default: 1000). 
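An illustrative usage sketch of the chunk-wise pipe() operation introduced by this patch; it is not part of the patch itself. The drop_duplicates helper is a hypothetical operation, and the import assumes the module-level pipeline.py that this series adds (performance_test.py imports it the same way).

# Sketch only: compose a custom chunk-wise operation with pipe().
from pipeline import Pipeline


def drop_duplicates(chunk: list[int]) -> list[int]:
    # Hypothetical chunk-wise operation: drop duplicates within one chunk.
    return list(dict.fromkeys(chunk))


p = Pipeline.init(int, chunk_size=4).map(lambda x: x % 3).pipe(drop_duplicates)
print(p.to_list([1, 2, 3, 4, 5, 6, 7, 8]))
# [1, 2, 0, 2, 0, 1] because duplicates are removed per chunk, not globally.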
- """ - if isinstance(source, Pipeline): - self.generator = source.generator - self.chunk_size = source.chunk_size - self.context = source.context - else: - self.context = PipelineContext(has_errors=False, errors=[]) - self.generator = self._chunked_and_wrap(source, chunk_size) - self.chunk_size = chunk_size - - def _chunked_and_wrap(self, iterable: Iterable[T], size: int) -> Generator[list[PipelineItem[T]], None, None]: - """Break an iterable into chunks of specified size and wrap items.""" - chunk: list[PipelineItem[T]] = [] - for item in iterable: - chunk.append(PipelineItem(item, None, self.context)) # Wrap each item - if len(chunk) == size: - yield chunk - chunk = [] - if chunk: - yield chunk + while remaining > 0: + selected_item_count = min(self.chunk_size, remaining) - @classmethod - def _from_chunks( - cls, chunks: Iterable[list[PipelineItem[T]]], chunk_size: int, context: PipelineContext - ) -> "Pipeline[T]": - """Create a pipeline directly from an iterable of chunks.""" - p = cls([], chunk_size=chunk_size) - p.generator = (chunk for chunk in chunks) - p.context = context - return p - - def __iter__(self) -> Generator[T, None, None]: - """ - Iterate over elements, raising a CompoundError at the end if any exceptions were caught. - """ - errors: list[Exception] = [] - for chunk in self.generator: - for value, error, _ in chunk: - if error: - errors.append(error) - self.context["has_errors"] = True - else: - yield value - if errors: - raise CompoundError(errors) - - def to_list(self) -> list[T]: - """ - Convert the pipeline to a list, raising a CompoundError if any exceptions were caught. - """ - results: list[T] = [] - errors: list[Exception] = [] - for chunk in self.generator: - for value, error, _ in chunk: - if error: - errors.append(error) - self.context["has_errors"] = True - else: - results.append(value) - if errors: - raise CompoundError(errors) - return results - - def first(self) -> T: - """ - Get the first element, raising a CompoundError if any exceptions are found anywhere in the pipeline. - """ - # Note: This is a terminal operation and must consume the generator to find all errors. 
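A minimal sketch, not part of the patch, of the context-aware callable convention used by map() and filter(): a one-argument callable receives only the value, while a two-argument callable also receives the shared PipelineContext. The "seen" counter key is hypothetical.

# Sketch only: a callable may optionally receive the PipelineContext as a second argument.
from pipeline import Pipeline, PipelineContext


def count_and_double(value: int, ctx: PipelineContext) -> int:
    # Two positional parameters, so the pipeline passes the shared context.
    ctx["seen"] = ctx.get("seen", 0) + 1
    return value * 2


p = Pipeline.init(int).map(count_and_double).filter(lambda x: x > 4)
print(p.to_list([1, 2, 3]))  # [6]   (values doubled, then only > 4 kept)
print(p.context["seen"])     # 3     (the map callable saw every element)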
- items = self.to_list() - if not items: - raise StopIteration("Pipeline is empty") - return items[0] - - def map[U](self, function: Callable[[T], U] | Callable[[T, PipelineContext], U]) -> "Pipeline[U]": - """Transform elements, capturing any exceptions that occur.""" - - def map_generator() -> Generator[list[PipelineItem[U]], None, None]: - for chunk in self.generator: - new_chunk: list[PipelineItem[U]] = [] - for value, error, context in chunk: - if error: - # Pass through existing errors - new_chunk.append(PipelineItem(value, error, context)) - self.context["has_errors"] = True - else: - try: - # Create a standardized function using the item's specific context - std_function = _create_context_aware_function(function, context) - # Apply the standardized function - result = std_function(value) - new_chunk.append(PipelineItem(result, None, context)) - except Exception as e: - # On failure, capture the exception - new_chunk.append(PipelineItem(value, e, context)) - self.context["has_errors"] = True - yield new_chunk - - return Pipeline._from_chunks(map_generator(), self.chunk_size, self.context) - - def filter(self, predicate: Callable[[T], bool] | Callable[[T, PipelineContext], bool]) -> "Pipeline[T]": - """Filter elements, capturing any exceptions from the predicate.""" - - # Create a standardized predicate that always takes just value - std_predicate = _create_context_aware_function(predicate, self.context) - - def filter_generator() -> Generator[list[PipelineItem[T]], None, None]: - for chunk in self.generator: - new_chunk: list[PipelineItem[T]] = [] - for item in chunk: - if item.error: - # Pass through existing errors - new_chunk.append(item) - self.context["has_errors"] = True - continue - try: - # Keep item if predicate is true - result = std_predicate(item.value) - if result: - new_chunk.append(item) - except Exception as e: - # If predicate fails, capture the error - new_chunk.append(PipelineItem(item.value, e, item.context)) - self.context["has_errors"] = True - yield new_chunk - - return Pipeline._from_chunks(filter_generator(), self.chunk_size, self.context) - - def reduce[U]( - self, function: Callable[[U, T], U] | Callable[[U, T, PipelineContext], U], initial: U - ) -> "Pipeline[U]": - """ - Reduce elements to a single value (intermediate operation). - Errored items are skipped in the reduction and passed through. - """ + # Read a chunk of size min(chunk_size, remaining) + chunk = list(itertools.islice(data_iter, selected_item_count)) - def reduce_generator() -> Generator[list[PipelineItem[U]], None, None]: - acc = initial - error_items: list[PipelineItem[U]] = [] - - # Create a standardized reduce function that always takes (acc, value) - std_function = _create_reduce_function(function, self.context) - - for chunk in self.generator: - for value, error, context in chunk: - if error: - error_items.append(PipelineItem(value, error, context)) - self.context["has_errors"] = True - else: - try: - acc = std_function(acc, value) - except Exception as e: - # If reduce function fails, we still need to handle it - error_items.append(PipelineItem(value, e, context)) - self.context["has_errors"] = True - # Yield all collected errors first, then the final result. - if error_items: - yield error_items - yield [PipelineItem(acc, None, self.context)] - - return Pipeline._from_chunks(reduce_generator(), self.chunk_size, self.context) - - def each(self, function: Callable[[T], Any] | Callable[[T, PipelineContext], Any]) -> None: - """ - Apply a function to each element (terminal operation). 
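For reference, a standalone sketch (not taken from the patch) of the islice-based chunking pattern that _chunk_generator relies on; the chunked() name is hypothetical.

# Sketch only: yield fixed-size chunks from any iterable with itertools.islice.
import itertools
from collections.abc import Iterable, Iterator


def chunked(data: Iterable[int], size: int) -> Iterator[list[int]]:
    it = iter(data)
    # list(islice(...)) becomes empty once the iterator is exhausted, ending the loop.
    while chunk := list(itertools.islice(it, size)):
        yield chunk


print(list(chunked(range(7), 3)))  # [[0, 1, 2], [3, 4, 5], [6]]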
+ if not chunk: + break # No more data + + result.extend(self.transformer(chunk)) + + remaining -= selected_item_count - Raises a CompoundError at the end if any exceptions were caught. + return result + + def reduce[U](self, *data: Iterable[In], function: PipelineReduceFunction[U, Out], initial: U) -> U: """ + Reduce elements to a single value (terminal operation). - # Create a standardized function that always takes just value - std_function = _create_context_aware_function(function, self.context) + Args: + function: Reduce function (acc, value) -> new_acc + initial: Initial accumulator value + """ + reducer = self._create_reduce_function(function) + return reduce(reducer, self(*data), initial) - errors: list[Exception] = [] - for chunk in self.generator: - for value, error, _ in chunk: - if error: - errors.append(error) - self.context["has_errors"] = True - else: - try: - std_function(value) - except Exception as e: - errors.append(e) - self.context["has_errors"] = True - if errors: - raise CompoundError(errors) - - def apply[U](self, function: Callable[[Self], "Pipeline[U]"]) -> "Pipeline[U]": - """Apply a sequence of transformations to the pipeline.""" - return function(self) - - def catch[U]( + def concurrent[T]( self, - pipeline_func: Callable[["Pipeline[T]"], "Pipeline[U]"], - on_error: Callable[[Exception], Any], - ) -> "Pipeline[U]": + *data: Iterable[In], + pipeline: "Pipeline[Out,T]", + max_workers: int, + ordered: bool = True, + ) -> Iterator[T]: """ - Apply a pipeline function and handle any captured errors with a callback. + Applies a sub-pipeline to each chunk in parallel (Terminal Operation). + + This method consumes the pipeline and processes chunks of data concurrently, + returning an iterator for the results. It is a terminal operation. - This method is for inspecting or logging errors mid-stream. It does not - recover from them; the errors remain in the pipeline. + To manage memory effectively, it uses a just-in-time approach, only + pulling chunks from the source iterator when a worker thread is available. Args: - pipeline_func: A function that takes a pipeline and returns a new one. - on_error: A function to call with any exception captured in the `pipeline_func`. + *data: The input data iterable(s) to process. + pipeline: A `Pipeline` instance to apply to each chunk. A deep + copy is used for each thread to ensure context isolation. + max_workers: The maximum number of worker threads. + ordered: If True, results are yielded in the original order. + If False, results are yielded as they complete. Returns: - A new pipeline that continues with the (potentially errored) items. + An iterator that yields the processed items. """ - # Initialize errors list in context if not present - if "errors" not in self.context: - self.context["errors"] = [] - - processed_pipeline = pipeline_func(self) - - def error_tapping_generator() -> Generator[list[PipelineItem[U]], None, None]: - for chunk in processed_pipeline.generator: - for value, error, _ in chunk: - if error: - self.context["has_errors"] = True - self.context["errors"].append({"value": value, "error": error}) - try: - on_error(error) - except Exception as e: - # To prevent the handler itself from stopping the pipeline, - # we could log this as a meta-error, but for now, we'll print it. 
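An illustrative sketch, not part of the patch, of the two terminal operations defined above, first() and reduce(); the data and the summing reducer are hypothetical, and the import assumes the module-level pipeline.py added in this series.

# Sketch only: terminal operations receive the data source(s) as arguments.
from pipeline import Pipeline

squares = Pipeline.init(int).map(lambda x: x * x)

# first() pulls input lazily, so a large iterable is not fully consumed.
print(squares.first(range(1, 1_000_000), n=3))  # [1, 4, 9]

# reduce() folds the transformed stream into a single value.
total = squares.reduce([1, 2, 3, 4], function=lambda acc, x: acc + x, initial=0)
print(total)  # 30, i.e. 1 + 4 + 9 + 16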
- print(f"Error in `on_error` handler: {e}") - yield chunk # Pass the original chunk through - - return Pipeline._from_chunks(error_tapping_generator(), self.chunk_size, self.context) - @overload - def flatten(self: "Pipeline[list[U]]") -> "Pipeline[U]": ... + def process_chunk(chunk: list[Out]) -> list[T]: + return pipeline.to_list(chunk) - @overload - def flatten(self: "Pipeline[tuple[U, ...]]") -> "Pipeline[U]": ... + def _ordered_generator(chunks_iter: Iterator[list[Out]], executor: ProcessPoolExecutor) -> Iterator[list[T]]: + futures = deque() + for _ in range(max_workers + 1): + try: + chunk = next(chunks_iter) + futures.append(executor.submit(process_chunk, chunk)) + except StopIteration: + break - @overload - def flatten(self: "Pipeline[set[U]]") -> "Pipeline[U]": ... - - def flatten(self: Union["Pipeline[list[U]]", "Pipeline[tuple[U, ...]]", "Pipeline[set[U]]"]) -> "Pipeline[Any]": - """Flatten nested iterables within the pipeline.""" - - def flatten_generator() -> Generator[list[PipelineItem[Any]], None, None]: - current_chunk: list[PipelineItem[Any]] = [] - for chunk in self.generator: - for value, error, context in chunk: - if error: - # Pass through errored items directly - current_chunk.append(PipelineItem(value, error, context)) - self.context["has_errors"] = True - elif isinstance(value, list | tuple | set): - # Flatten the value from a successful item - for item in value: - current_chunk.append(PipelineItem(item, None, context)) - if len(current_chunk) == self.chunk_size: - yield current_chunk - current_chunk = [] - # Non-iterable, non-errored items are passed through as-is - else: - current_chunk.append(PipelineItem(value, None, context)) - - if len(current_chunk) == self.chunk_size: - yield current_chunk - current_chunk = [] - - if current_chunk: - yield current_chunk - - return Pipeline._from_chunks(flatten_generator(), self.chunk_size, self.context) - - def concurrent( - self, - pipeline_func: Callable[["Pipeline[T]"], "Pipeline[U]"], - max_workers: int, - ordered: bool = True, - ) -> "Pipeline[U]": - """Applies a pipeline function to each chunk in parallel.""" - - # Thread-safe context merging - context_lock = threading.Lock() - merged_errors: list[dict[str, Any]] = [] - isolated_contexts: list[PipelineContext] = [] - - def apply_to_chunk(chunk: list[PipelineItem[T]]) -> list[PipelineItem[U]]: - # Create an isolated context for this thread, copying the main context - isolated_context = PipelineContext(has_errors=False, errors=[]) - # Copy any existing context fields from main context - isolated_context.update(self.context) - isolated_context["has_errors"] = False - isolated_context["errors"] = [] - - # Create a mini-pipeline from the single chunk with isolated context - chunk_pipeline = Pipeline._from_chunks([chunk], len(chunk), isolated_context) - processed_pipeline = pipeline_func(chunk_pipeline) - - # Collect results and store context for later merging - result = [item for processed_chunk in processed_pipeline.generator for item in processed_chunk] - - # Thread-safe context collection - with context_lock: - isolated_contexts.append(isolated_context) - if isolated_context["has_errors"] or isolated_context["errors"]: - self.context["has_errors"] = True - merged_errors.extend(isolated_context["errors"]) - - return result - - def process_in_pool( - gen_func: Callable[[], Generator[list[PipelineItem[U]], None, None]], - ) -> Generator[list[PipelineItem[U]], None, None]: - with ThreadPoolExecutor(max_workers=max_workers) as executor: - yield from gen_func(executor) - - 
def ordered_generator( - executor: ThreadPoolExecutor, - ) -> Generator[list[PipelineItem[U]], None, None]: - futures = deque(executor.submit(apply_to_chunk, chunk) for chunk in self.generator) while futures: yield futures.popleft().result() + try: + chunk = next(chunks_iter) + futures.append(executor.submit(process_chunk, chunk)) + except StopIteration: + continue + + def _unordered_generator(chunks_iter: Iterator[list[Out]], executor: ProcessPoolExecutor) -> Iterator[list[T]]: + futures = {executor.submit(process_chunk, chunk) for chunk in itertools.islice(chunks_iter, max_workers + 1)} - def unordered_generator( - executor: ThreadPoolExecutor, - ) -> Generator[list[PipelineItem[U]], None, None]: - futures = {executor.submit(apply_to_chunk, chunk) for chunk in self.generator} while futures: done, futures = wait(futures, return_when=FIRST_COMPLETED) - for f in done: - yield f.result() - - def gen(executor): - if ordered: - return ordered_generator(executor) - else: - return unordered_generator(executor) - - # Process all chunks and collect results - result_pipeline = Pipeline._from_chunks(process_in_pool(gen), self.chunk_size, self.context) - - # Final context merge after all threads complete - with context_lock: - self.context["errors"].extend(merged_errors) - - # Merge all context fields from isolated contexts - for isolated_context in isolated_contexts: - for key, value in isolated_context.items(): - if key not in ["has_errors", "errors"]: # Skip standard fields already handled - if key in self.context: - # If the field already exists, merge it intelligently - if isinstance(value, list) and isinstance(self.context[key], list): - self.context[key].extend(value) - elif isinstance(value, dict) and isinstance(self.context[key], dict): - self.context[key].update(value) - else: - # For other types, use the latest value - self.context[key] = value - else: - # New field, just add it - self.context[key] = value - - return result_pipeline - - # All other methods like tap, noop, etc. can be adapted similarly - # to handle the (value, error) tuple structure. 
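A standalone sketch (not taken from the patch) of the bounded-submission pattern that the new _ordered_generator uses: keep at most max_workers + 1 futures in flight, yield the oldest result, then top the window back up. A ThreadPoolExecutor and a toy work() function are used here purely for illustration, since the real method runs a sub-pipeline in a ProcessPoolExecutor.

# Sketch only: ordered, bounded-concurrency mapping over a stream of chunks.
from collections import deque
from collections.abc import Iterable, Iterator
from concurrent.futures import ThreadPoolExecutor
import itertools


def work(chunk: list[int]) -> list[int]:
    return [x * 10 for x in chunk]


def ordered_map(chunks: Iterable[list[int]], max_workers: int = 2) -> Iterator[list[int]]:
    chunks_iter = iter(chunks)
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Prime the window with at most max_workers + 1 submissions.
        futures = deque(executor.submit(work, c) for c in itertools.islice(chunks_iter, max_workers + 1))
        while futures:
            # Yield the oldest result, then refill the window if data remains.
            yield futures.popleft().result()
            for c in itertools.islice(chunks_iter, 1):
                futures.append(executor.submit(work, c))


print(list(ordered_map([[1, 2], [3, 4], [5, 6], [7, 8]])))
# [[10, 20], [30, 40], [50, 60], [70, 80]] with at most 3 chunks in flight.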
- def noop(self) -> None: - """Consume the pipeline, raising a CompoundError if any exceptions were caught.""" - errors = [error for chunk in self.generator for _, error, _ in chunk if error] - if errors: - self.context["has_errors"] = True - raise CompoundError(errors) + for future in done: + yield future.result() + try: + chunk = next(chunks_iter) + futures.add(executor.submit(process_chunk, chunk)) + except StopIteration: + continue - @classmethod - def chain(cls, *pipelines: "Pipeline[T]") -> "Pipeline[T]": - """Chain multiple pipelines together sequentially.""" - - def chain_generator(): - for pipeline in pipelines: - yield from pipeline.generator - - chunk_size = pipelines[0].chunk_size if pipelines else 1000 - context = pipelines[0].context if pipelines else PipelineContext(has_errors=False, errors=[]) - return cls._from_chunks(chain_generator(), chunk_size, context) - - def tap(self, function: Callable[[T], Any] | Callable[[T, PipelineContext], Any]) -> Self: - """Apply a side-effect function to each element without modifying the data.""" - - # Create a standardized function that always takes just value - std_function = _create_context_aware_function(function, self.context) - - def tap_generator() -> Generator[list[PipelineItem[T]], None, None]: - for chunk in self.generator: - for value, error, _ in chunk: - # Apply tap only to non-errored items - if not error: - try: - std_function(value) - except Exception as e: - # If the tap function itself fails, we should not halt the pipeline. - # For now, we print a warning. A more advanced logger could be used here. - print(f"Warning: Exception in tap function ignored: {e}") - yield chunk # Pass the original chunk through unchanged - - return Pipeline._from_chunks(tap_generator(), self.chunk_size, self.context) + # This wrapper generator ensures the ProcessPoolExecutor is properly + # managed and closed only after the result iterator is exhausted. + def result_iterator_manager() -> Iterator[T]: + with ProcessPoolExecutor(max_workers=max_workers) as executor: + # Create the stream of chunks to be processed concurrently. + chunks_to_process = (self.transformer(chunk) for d in data for chunk in self._chunk_generator(d)) - @classmethod - def from_pipeline(cls, pipeline: "Pipeline[T]") -> "Pipeline[T]": - """Create a new Pipeline from another Pipeline, preserving its state.""" - new_pipeline = cls([], chunk_size=pipeline.chunk_size) - new_pipeline.generator = pipeline.generator - new_pipeline.context = pipeline.context - return new_pipeline - - def passthrough(self) -> Self: - """Return the pipeline unchanged (identity operation).""" - return self + gen_func = _ordered_generator if ordered else _unordered_generator + processed_chunks_iterator = gen_func(chunks_to_process, executor) + + # Flatten the iterator of chunks into an iterator of items. 
+ for result_chunk in processed_chunks_iterator: + yield from result_chunk + + return result_iterator_manager() diff --git a/performance_test.py b/performance_test.py index 0396d94..adb3c0d 100644 --- a/performance_test.py +++ b/performance_test.py @@ -11,9 +11,12 @@ """ import itertools +from itertools import islice import statistics import time -from itertools import islice + +from pipeline import Pipeline + def generate_test_data(size: int = 1_000_000) -> list[int]: """Generate test data of specified size.""" @@ -102,10 +105,10 @@ def mutated_chunked_generator_listcomp_approach(data: list[int]) -> list[int]: def chunk_generator(data, chunk_size=1000): """Generate chunks using a generator.""" while True: - chunk = list(islice(data, chunk_size)) - if not chunk: - return - yield chunk + chunk = list(islice(data, chunk_size)) + if not chunk: + return + yield chunk # Process each chunk with intermediate lists and combine results = [] @@ -168,7 +171,7 @@ def run(self, data): yield from self.transformer(chunk) -PIPELINE = ( +CHUNKED_PIPELINE = ( ChunkedPipeline() .filter(lambda x: x % 2 == 0) # Filter even numbers .map(lambda x: x * 2) # Double them @@ -239,7 +242,7 @@ def run(self, data): def chunked_pipeline_approach(data: list[int]) -> list[int]: """Process data using the ChunkedPipeline class.""" - return list(PIPELINE.run(data)) + return list(CHUNKED_PIPELINE.run(data)) def chunked_pipeline_generator_approach(data: list[int]) -> list[int]: @@ -311,78 +314,82 @@ def chunked_pipeline_simple_approach(data: list[int]) -> list[int]: """Process data using the ChunkedPipelineSimple class.""" return list(PIPELINE_SIMPLE.run(data)) + # Sentinel value to indicate that an item has been filtered out. _SKIPPED = object() + class ChunkedPipelinePerItem: + """ + A chunked pipeline that composes operations into a single function + operating on individual items, inspired by a callback-chaining pattern + to reduce the creation of intermediate lists. + """ + + def __init__(self, chunk_size=1000): + """Initialize the pipeline with a specified chunk size.""" + self.chunk_size = chunk_size + # The transformer starts as an identity function. + self.transformer = lambda item: item + + def _chunk_generator(self, data): + """A generator that yields chunks of data.""" + data_iter = iter(data) + while True: + chunk = list(itertools.islice(data_iter, self.chunk_size)) + if not chunk: + break + yield chunk + + def pipe(self, operation): """ - A chunked pipeline that composes operations into a single function - operating on individual items, inspired by a callback-chaining pattern - to reduce the creation of intermediate lists. + Composes the current transformer with a new item-wise operation. + This internal method is the core of the pipeline's composition logic. """ + prev_transformer = self.transformer + + def new_transformer(item): + # Apply the existing chain of transformations. + processed_item = prev_transformer(item) + + # If a previous operation (like a filter) already skipped this item, + # we bypass the new operation entirely. + if processed_item is _SKIPPED: + return _SKIPPED + + # Apply the new operation to the result of the previous ones. + return operation(processed_item) + + self.transformer = new_transformer + return self + + def filter(self, predicate): + """ + Adds a filter operation to the pipeline. + + If the predicate returns `False`, the item is marked as skipped, + and no further operations in the chain will be executed on it. 
+ """ + + def filter_operation(item): + return item if predicate(item) else _SKIPPED + + return self.pipe(filter_operation) + + def map(self, func): + """Adds a map operation to transform an item.""" + return self.pipe(func) + + def run(self, data): + """ + Executes the pipeline. + + The composed transformer function is applied to each item individually. + Results are yielded only if they haven't been marked as skipped. + """ + for chunk in self._chunk_generator(data): + yield from [result for item in chunk if (result := self.transformer(item)) is not _SKIPPED] - def __init__(self, chunk_size=1000): - """Initialize the pipeline with a specified chunk size.""" - self.chunk_size = chunk_size - # The transformer starts as an identity function. - self.transformer = lambda item: item - - def _chunk_generator(self, data): - """A generator that yields chunks of data.""" - data_iter = iter(data) - while True: - chunk = list(itertools.islice(data_iter, self.chunk_size)) - if not chunk: - break - yield chunk - - def pipe(self, operation): - """ - Composes the current transformer with a new item-wise operation. - This internal method is the core of the pipeline's composition logic. - """ - prev_transformer = self.transformer - - def new_transformer(item): - # Apply the existing chain of transformations. - processed_item = prev_transformer(item) - - # If a previous operation (like a filter) already skipped this item, - # we bypass the new operation entirely. - if processed_item is _SKIPPED: - return _SKIPPED - - # Apply the new operation to the result of the previous ones. - return operation(processed_item) - - self.transformer = new_transformer - return self - - def filter(self, predicate): - """ - Adds a filter operation to the pipeline. - - If the predicate returns `False`, the item is marked as skipped, - and no further operations in the chain will be executed on it. - """ - def filter_operation(item): - return item if predicate(item) else _SKIPPED - - return self.pipe(filter_operation) - - def map(self, func): - """Adds a map operation to transform an item.""" - return self.pipe(func) - - def run(self, data): - """ - Executes the pipeline. - - The composed transformer function is applied to each item individually. - Results are yielded only if they haven't been marked as skipped. 
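A condensed, standalone sketch (not taken from the patch) of the sentinel-based per-item composition described in the docstrings above: every operation wraps the previous one, and a unique sentinel object short-circuits the chain for filtered-out items.

# Sketch only: per-item operation chaining with a skip sentinel.
_SKIPPED = object()


def compose(prev, op):
    # Run op only if the item survived every earlier operation.
    return lambda item: _SKIPPED if (r := prev(item)) is _SKIPPED else op(r)


def identity(item):
    return item


transform = compose(identity, lambda x: x if x % 2 == 0 else _SKIPPED)  # keep evens
transform = compose(transform, lambda x: x * 10)                        # then scale

print([r for x in [1, 2, 3, 4] if (r := transform(x)) is not _SKIPPED])  # [20, 40]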
- """ - for chunk in self._chunk_generator(data): - yield from [result for item in chunk if (result := self.transformer(item)) is not _SKIPPED] PIPELINE_PER_ITEM = ( ChunkedPipelinePerItem() @@ -392,10 +399,26 @@ def run(self, data): .map(lambda x: x + 1) ) + def chunked_pipeline_per_item_approach(data) -> list[int]: """Process data using the ChunkedPipelinePerItem class.""" return list(PIPELINE_PER_ITEM.run(data)) + +PIPELINE = ( + Pipeline.init(int) + .filter(lambda x: x % 2 == 0) # Filter even numbers + .map(lambda x: x * 2) # Double them + .filter(lambda x: x > 100) # Filter > 100 + .map(lambda x: x + 1) +) + + +def pipeline_approach(data: list[int]) -> list[int]: + """Process data using the Pipeline class.""" + return PIPELINE.to_list(data) + + def time_function(func, *args, **kwargs) -> float: """Time a function execution and return duration in seconds.""" start_time = time.perf_counter() @@ -414,16 +437,17 @@ def run_performance_test(): # "Generators": generator_approach, # "Map/Filter": builtin_map_filter_approach, # "GeneratorExpression": generator_expression_approach, - "ChunkedPipeline": chunked_pipeline_approach, - "ChunkedPipelinePerItem": chunked_pipeline_per_item_approach, - "ChunkedPipelineGenerator": chunked_pipeline_generator_approach, - "ChunkedPipelineSimple": chunked_pipeline_simple_approach, - "ListComprehension": list_comprehension_approach, "ChunkedGeneratorListComp": chunked_generator_listcomp_approach, + "ChunkedPipeline": chunked_pipeline_approach, + "Pipeline": pipeline_approach, + # "ChunkedPipelinePerItem": chunked_pipeline_per_item_approach, + # "ChunkedPipelineGenerator": chunked_pipeline_generator_approach, + # "ChunkedPipelineSimple": chunked_pipeline_simple_approach, + # "ListComprehension": list_comprehension_approach, # "MutatedChunkedGeneratorListComp": mutated_chunked_generator_listcomp_approach, } - num_runs = 50 + num_runs = 20 results = {} for name, approach_func in approaches.items(): @@ -525,12 +549,13 @@ def run_memory_test(): # "Generators": generator_approach, # "MapFilter": builtin_map_filter_approach, # "GeneratorExpression": generator_expression_approach, - "ChunkedPipeline": chunked_pipeline_approach, - "ChunkedPipelinePerItem": chunked_pipeline_per_item_approach, - "ChunkedPipelineGenerator": chunked_pipeline_generator_approach, - "ChunkedPipelineSimple": chunked_pipeline_simple_approach, - "ListComprehension": list_comprehension_approach, "ChunkedGeneratorListComp": chunked_generator_listcomp_approach, + "ChunkedPipeline": chunked_pipeline_approach, + "Pipeline": pipeline_approach, + # "ChunkedPipelinePerItem": chunked_pipeline_per_item_approach, + # "ChunkedPipelineGenerator": chunked_pipeline_generator_approach, + # "ChunkedPipelineSimple": chunked_pipeline_simple_approach, + # "ListComprehension": list_comprehension_approach, # "MutatedChunkedGeneratorListComp": mutated_chunked_generator_listcomp_approach, } diff --git a/pipeline.py b/pipeline.py new file mode 100644 index 0000000..2178b7f --- /dev/null +++ b/pipeline.py @@ -0,0 +1,263 @@ +from collections import deque +from collections.abc import Callable +from collections.abc import Iterable +from collections.abc import Iterator +from concurrent.futures import FIRST_COMPLETED +from concurrent.futures import ProcessPoolExecutor +from concurrent.futures import wait +from functools import reduce +import inspect +import itertools +from typing import Any +from typing import Union +from typing import overload + +# Type aliases for pipeline functions +type PipelineFunction[Out, T] = 
Callable[[Out], T] | Callable[[Out, PipelineContext], T] +type PipelineReduceFunction[U, Out] = Callable[[U, Out], U] | Callable[[U, Out, PipelineContext], U] + + +class PipelineContext(dict): + """Global context available to all pipeline operations.""" + + pass + + +class Pipeline[In, Out]: + """ + A functional pipeline for data processing with internal chunked processing. + + The Pipeline class provides a fluent interface for applying transformations + and filters. It processes data in chunks for performance and maintains a + global context for shared state. The pipeline is callable and iterable. + """ + + def __init__( + self, + chunk_size: int = 1000, + transformer: Callable[[list[In]], list[Out]] | None = None, + context: PipelineContext | None = None, + ) -> None: + self.chunk_size = chunk_size + self.context = context or PipelineContext() + self.transformer: Callable[[list[In]], list[Out]] = transformer or (lambda chunk: chunk) + + @classmethod + def init[T](cls, _type_hint: type[T], chunk_size: int = 1000) -> "Pipeline[T, T]": + """Create a new identity pipeline with explicit type hint.""" + return cls(chunk_size) + + def __call__(self, *data: Iterable[In]) -> Iterator[Out]: + """Execute the pipeline on data source (makes pipeline callable).""" + assert data, "Data source must be provided to execute the pipeline" + + for selected_data in data: + for chunk in self._chunk_generator(selected_data): + yield from self.transformer(chunk) + + def pipe[U](self, operation: Callable[[list[Out]], list[U]]) -> "Pipeline[In, U]": + """ + Composes the current transformer with a new chunk-wise operation. + The operation should be a function that takes a list and returns a list. + """ + prev_transformer = self.transformer + self.transformer = lambda chunk: operation(prev_transformer(chunk)) + return self + + def _create_context_aware_function(self, func: Callable): + """Create function that properly handles context parameter""" + try: + sig = inspect.signature(func) + params = [p for p in sig.parameters.values() if p.kind in (p.POSITIONAL_ONLY, p.POSITIONAL_OR_KEYWORD)] + param_count = len(params) + + if param_count >= 2: + return lambda value: func(value, self.context) + return func + except (ValueError, TypeError): + return func + + def _create_reduce_function(self, func: Callable): + """Create function that properly handles context parameter for reduce operations""" + try: + sig = inspect.signature(func) + params = [p for p in sig.parameters.values() if p.kind in (p.POSITIONAL_ONLY, p.POSITIONAL_OR_KEYWORD)] + param_count = len(params) + + if param_count >= 3: + return lambda acc, value: func(acc, value, self.context) + return func + except (ValueError, TypeError): + return func + + def map[U](self, function: PipelineFunction[Out, U]) -> "Pipeline[In, U]": + """Transform elements in the pipeline.""" + std_function = self._create_context_aware_function(function) + return self.pipe(lambda chunk: [std_function(x) for x in chunk]) + + def filter(self, predicate: PipelineFunction[Out, bool]) -> "Pipeline[In, Out]": + """Filter elements in the pipeline.""" + std_predicate = self._create_context_aware_function(predicate) + return self.pipe(lambda chunk: [x for x in chunk if std_predicate(x)]) + + def _chunk_generator(self, data: Iterable[In]) -> Iterator[list[In]]: + """Generate chunks from the data source""" + data_iter = iter(data) + while chunk := list(itertools.islice(data_iter, self.chunk_size)): + yield chunk + + def to_list(self, *data: Iterable[In]) -> list[Out]: + """Execute pipeline and 
return results as list.""" + return [item for d in data for chunk in self._chunk_generator(d) for item in self.transformer(chunk)] + + def each(self, *data: Iterable[In], function: PipelineFunction[Out, Any]) -> None: + """Apply function to each element (terminal operation).""" + std_function = self._create_context_aware_function(function) + + for chunk in self._chunk_generator(itertools.chain.from_iterable(data)): + [std_function(item) for item in self.transformer(chunk)] + + def tap(self, function: PipelineFunction[Out, Any]) -> "Pipeline[In, Out]": + """Apply side-effect without modifying data.""" + std_function = self._create_context_aware_function(function) + + def tap_operation(chunk: list[Out]) -> list[Out]: + for item in chunk: + std_function(item) + return chunk + + return self.pipe(tap_operation) + + def consume(self, *data: Iterable[In]) -> None: + """Consume the pipeline without returning results (terminal operation).""" + for _ in self(*data): + pass + + @overload + def flatten(self: "Pipeline[list[In]]") -> "Pipeline[In]": ... + + @overload + def flatten(self: "Pipeline[tuple[In, ...]]") -> "Pipeline[In]": ... + + @overload + def flatten(self: "Pipeline[set[In]]") -> "Pipeline[In]": ... + + def flatten( + self: Union["Pipeline[list[In], In]", "Pipeline[tuple[In, ...]]", "Pipeline[set[In]]"], + ) -> "Pipeline[In, In]": + """Flatten nested lists into a single list.""" + return self.pipe(lambda chunk: [item for sublist in chunk for item in sublist]) + + def apply[T](self, pipeline: "Pipeline[Out, T]") -> "Pipeline[In, T]": + """Apply another pipeline to the current one.""" + return self.pipe(lambda chunk: pipeline.transformer(chunk)) + + def first(self, *data: Iterable[In], n: int = 1) -> list[Out]: + """Get the first n elements of the pipeline (terminal operation).""" + assert n >= 1, "n must be at least 1" + data_iter = itertools.chain.from_iterable(data) + result = [] + remaining = n + + while remaining > 0: + selected_item_count = min(self.chunk_size, remaining) + + # Read a chunk of size min(chunk_size, remaining) + chunk = list(itertools.islice(data_iter, selected_item_count)) + + if not chunk: + break # No more data + + result.extend(self.transformer(chunk)) + + remaining -= selected_item_count + + return result + + def reduce[U](self, *data: Iterable[In], function: PipelineReduceFunction[U, Out], initial: U) -> U: + """ + Reduce elements to a single value (terminal operation). + + Args: + function: Reduce function (acc, value) -> new_acc + initial: Initial accumulator value + """ + reducer = self._create_reduce_function(function) + return reduce(reducer, self(*data), initial) + + def concurrent[T]( + self, + *data: Iterable[In], + pipeline: "Pipeline[Out,T]", + max_workers: int, + ordered: bool = True, + ) -> Iterator[T]: + """ + Applies a sub-pipeline to each chunk in parallel (Terminal Operation). + + This method consumes the pipeline and processes chunks of data concurrently, + returning an iterator for the results. It is a terminal operation. + + To manage memory effectively, it uses a just-in-time approach, only + pulling chunks from the source iterator when a worker thread is available. + + Args: + *data: The input data iterable(s) to process. + pipeline: A `Pipeline` instance to apply to each chunk. A deep + copy is used for each thread to ensure context isolation. + max_workers: The maximum number of worker threads. + ordered: If True, results are yielded in the original order. + If False, results are yielded as they complete. 
+ + Returns: + An iterator that yields the processed items. + """ + + def process_chunk(chunk: list[Out]) -> list[T]: + return pipeline.to_list(chunk) + + def _ordered_generator(chunks_iter: Iterator[list[Out]], executor: ProcessPoolExecutor) -> Iterator[list[T]]: + futures = deque() + for _ in range(max_workers + 1): + try: + chunk = next(chunks_iter) + futures.append(executor.submit(process_chunk, chunk)) + except StopIteration: + break + + while futures: + yield futures.popleft().result() + try: + chunk = next(chunks_iter) + futures.append(executor.submit(process_chunk, chunk)) + except StopIteration: + continue + + def _unordered_generator(chunks_iter: Iterator[list[Out]], executor: ProcessPoolExecutor) -> Iterator[list[T]]: + futures = {executor.submit(process_chunk, chunk) for chunk in itertools.islice(chunks_iter, max_workers + 1)} + + while futures: + done, futures = wait(futures, return_when=FIRST_COMPLETED) + for future in done: + yield future.result() + try: + chunk = next(chunks_iter) + futures.add(executor.submit(process_chunk, chunk)) + except StopIteration: + continue + + # This wrapper generator ensures the ProcessPoolExecutor is properly + # managed and closed only after the result iterator is exhausted. + def result_iterator_manager() -> Iterator[T]: + with ProcessPoolExecutor(max_workers=max_workers) as executor: + # Create the stream of chunks to be processed concurrently. + chunks_to_process = (self.transformer(chunk) for d in data for chunk in self._chunk_generator(d)) + + gen_func = _ordered_generator if ordered else _unordered_generator + processed_chunks_iterator = gen_func(chunks_to_process, executor) + + # Flatten the iterator of chunks into an iterator of items. + for result_chunk in processed_chunks_iterator: + yield from result_chunk + + return result_iterator_manager() diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_pipeline_basics.py b/tests/test_pipeline_basics.py index d89101a..be60b7b 100644 --- a/tests/test_pipeline_basics.py +++ b/tests/test_pipeline_basics.py @@ -5,8 +5,6 @@ including initialization, iteration, and core methods. 
""" -import pytest - from efemel.pipeline import Pipeline @@ -15,63 +13,71 @@ class TestPipelineBasics: def test_pipeline_initialization(self): """Test Pipeline initialization with various iterables.""" - # Test with list - pipeline = Pipeline([1, 2, 3, 4, 5]) + # Test with default parameters + pipeline = Pipeline() assert isinstance(pipeline, Pipeline) - # Generator should be a generator object, not the original list - from types import GeneratorType - - assert isinstance(pipeline.generator, GeneratorType) - - # Test with tuple - pipeline = Pipeline((1, 2, 3)) - assert list(pipeline) == [1, 2, 3] + assert pipeline.chunk_size == 1000 + assert isinstance(pipeline.context, dict) - # Test with generator - def gen(): - yield from range(1, 4) + # Test with custom chunk size + pipeline = Pipeline(chunk_size=500) + assert pipeline.chunk_size == 500 - pipeline = Pipeline(gen()) - assert list(pipeline) == [1, 2, 3] + # Test with custom context + from efemel.pipeline import PipelineContext - # Test with empty list - pipeline = Pipeline([]) - assert list(pipeline) == [] + custom_context = PipelineContext() + custom_context["key"] = "value" + pipeline = Pipeline(context=custom_context) + assert pipeline.context["key"] == "value" def test_pipeline_iteration(self): - """Test Pipeline iteration behavior.""" - pipeline = Pipeline([1, 2, 3, 4, 5]) + """Test Pipeline iteration behavior through callable interface.""" + pipeline = Pipeline() - # Test iteration + # Test iteration via __call__ + data = [1, 2, 3, 4, 5] result = [] - for item in pipeline: + for item in pipeline(data): result.append(item) assert result == [1, 2, 3, 4, 5] + # Test with multiple data sources + result2 = list(pipeline([1, 2], [3, 4, 5])) + assert result2 == [1, 2, 3, 4, 5] + def test_to_list(self): """Test Pipeline.to_list() method.""" - pipeline = Pipeline([1, 2, 3, 4, 5]) - result = pipeline.to_list() + pipeline = Pipeline() + result = pipeline.to_list([1, 2, 3, 4, 5]) assert result == [1, 2, 3, 4, 5] assert isinstance(result, list) # Test with empty pipeline - empty_pipeline = Pipeline([]) - assert empty_pipeline.to_list() == [] + assert pipeline.to_list([]) == [] + + # Test with multiple data sources + result2 = pipeline.to_list([1, 2], [3, 4, 5]) + assert result2 == [1, 2, 3, 4, 5] def test_first(self): """Test Pipeline.first() method.""" - pipeline = Pipeline([1, 2, 3, 4, 5]) - assert pipeline.first() == 1 + pipeline = Pipeline() + result = pipeline.first([1, 2, 3, 4, 5]) + assert result == [1] # Test with string - str_pipeline = Pipeline(["hello", "world"]) - assert str_pipeline.first() == "hello" + result = pipeline.first(["hello", "world"]) + assert result == ["hello"] + + # Test first n elements + result = pipeline.first([1, 2, 3, 4, 5], n=3) + assert result == [1, 2, 3] def test_first_empty_pipeline(self): """Test Pipeline.first() with empty pipeline.""" - empty_pipeline = Pipeline([]) - with pytest.raises(StopIteration): - empty_pipeline.first() + pipeline = Pipeline() + result = pipeline.first([]) + assert result == [] diff --git a/tests/test_pipeline_catch.py b/tests/test_pipeline_catch.py deleted file mode 100644 index d805789..0000000 --- a/tests/test_pipeline_catch.py +++ /dev/null @@ -1,214 +0,0 @@ -""" -Test Pipeline catch functionality and CompoundError handling. - -This module contains tests for Pipeline error handling including -catch operations, CompoundError formatting, and error propagation. 
-""" - -import pytest - -from efemel.pipeline import CompoundError -from efemel.pipeline import Pipeline - - -class TestPipelineCatch: - """Test Pipeline catch functionality and CompoundError handling.""" - - def test_catch_basic_error_handling(self): - """Test basic catch functionality with error callback.""" - captured_errors = [] - - def error_prone_pipeline(p): - return p.map(lambda x: 10 / x if x != 0 else 1 / 0) - - def error_handler(error): - captured_errors.append(error) - - pipeline = Pipeline([2, 0, 4, 0, 6]) - - with pytest.raises(CompoundError) as exc_info: - pipeline.catch(error_prone_pipeline, error_handler).to_list() - - # Should have captured 2 ZeroDivisionErrors - assert len(captured_errors) == 2 - assert all(isinstance(e, ZeroDivisionError) for e in captured_errors) - - # CompoundError should also contain the same errors - assert len(exc_info.value.errors) == 2 - assert all(isinstance(e, ZeroDivisionError) for e in exc_info.value.errors) - - def test_catch_with_successful_items(self): - """Test catch with mix of successful and failed items.""" - error_logs = [] - - def selective_error_pipeline(p): - return p.map(lambda x: x * 2 if x != 3 else 1 / 0) - - def log_error(error): - error_logs.append(f"Caught: {type(error).__name__}") - - pipeline = Pipeline([1, 2, 3, 4, 5]) - - with pytest.raises(CompoundError) as exc_info: - pipeline.catch(selective_error_pipeline, log_error).to_list() - - # Should have logged one error - assert len(error_logs) == 1 - assert "ZeroDivisionError" in error_logs[0] - - # CompoundError should contain one error - assert len(exc_info.value.errors) == 1 - - def test_catch_with_reduce_operation(self): - """Test catch with reduce operation that has errors.""" - errors_seen = [] - - def error_prone_reduce_pipeline(p: Pipeline[int]): - # First map some values to cause errors, then reduce - return p.map(lambda x: x if x != 2 else 1 / 0).reduce(lambda acc, x: acc + x, 0) - - def collect_errors(error): - errors_seen.append(error) - - pipeline = Pipeline([1, 2, 3, 4, 5]) - - with pytest.raises(CompoundError) as exc_info: - pipeline.catch(error_prone_reduce_pipeline, collect_errors).first() - - # Should have seen the error during catch - assert len(errors_seen) == 1 - assert isinstance(errors_seen[0], ZeroDivisionError) - - # CompoundError should also have the error - assert len(exc_info.value.errors) == 1 - - def test_catch_multiple_operation_errors(self): - """Test catch with multiple operations that can error.""" - all_errors = [] - - def multi_error_pipeline(p: Pipeline[int]): - return ( - p.map(lambda x: x / 2 if x != 4 else 1 / 0) # Error on 4 - .filter( - lambda x: x > 0 if x != 1 else 1 / 0 - ) # Error on 1 (after division = 0.5, but let's say error on result 1) - .map(lambda x: x * 3 if x != 3 else 1 / 0) - ) # Error on 3 (after division = 1.5, won't hit) - - def track_all_errors(error): - all_errors.append(type(error).__name__) - - pipeline = Pipeline([2, 4, 6, 8]) - - with pytest.raises(CompoundError) as exc_info: - pipeline.catch(multi_error_pipeline, track_all_errors).to_list() - - # Should have tracked at least one error - assert len(all_errors) >= 1 - assert len(exc_info.value.errors) >= 1 - - def test_catch_with_no_errors(self): - """Test catch when no errors occur.""" - error_handler_called = [] - - def safe_pipeline(p): - return p.map(lambda x: x * 2).filter(lambda x: x > 5) - - def should_not_be_called(error): - error_handler_called.append(error) - - pipeline = Pipeline([1, 2, 3, 4, 5]) - result = pipeline.catch(safe_pipeline, 
should_not_be_called).to_list() - - # No errors, so handler should not be called - assert len(error_handler_called) == 0 - # Should get successful results - assert result == [6, 8, 10] - - def test_compound_error_formatting(self): - """Test CompoundError message formatting.""" - - def always_error(x): - if x == 1: - raise ValueError("First error") - elif x == 2: - raise TypeError("Second error") - else: - raise RuntimeError("Third error") - - pipeline = Pipeline([1, 2, 3]) - - with pytest.raises(CompoundError) as exc_info: - pipeline.map(always_error).to_list() - - error_message = str(exc_info.value) - - # Check message contains error count - assert "3 error(s) occurred in the pipeline" in error_message - - # Check individual error messages are included - assert "ValueError: First error" in error_message - assert "TypeError: Second error" in error_message - assert "RuntimeError: Third error" in error_message - - def test_compound_error_with_single_error(self): - """Test CompoundError with just one error.""" - - def single_error(x): - if x == 3: - raise ValueError("Only error") - return x * 2 - - pipeline = Pipeline([1, 2, 3, 4]) - - with pytest.raises(CompoundError) as exc_info: - pipeline.map(single_error).to_list() - - # Should have exactly one error - assert len(exc_info.value.errors) == 1 - assert isinstance(exc_info.value.errors[0], ValueError) - - # Message should reflect single error - error_message = str(exc_info.value) - assert "1 error(s) occurred in the pipeline" in error_message - assert "ValueError: Only error" in error_message - - def test_catch_with_chained_operations(self): - """Test catch with complex chained operations.""" - logged_errors = [] - - def complex_pipeline(p): - return ( - p.filter(lambda x: x > 0 if x != 2 else 1 / 0) # Error on 2 - .map(lambda x: x * 2 if x != 4 else 1 / 0) # Error on 4 - .reduce(lambda acc, x: acc + x, 0) - ) - - def error_logger(error): - logged_errors.append(f"Error: {error}") - - pipeline = Pipeline([1, 2, 3, 4, 5]) - - with pytest.raises(CompoundError) as exc_info: - pipeline.catch(complex_pipeline, error_logger).first() - - # Should have logged errors from the operations - assert len(logged_errors) >= 1 - # CompoundError should also contain the errors - assert len(exc_info.value.errors) >= 1 - - def test_catch_error_handler_exception(self): - """Test catch when error handler itself raises exception.""" - - def error_pipeline(p): - return p.map(lambda x: 1 / 0 if x == 2 else x) - - def failing_handler(error): - raise RuntimeError("Handler failed!") - - pipeline = Pipeline([1, 2, 3]) - - # Should still work despite handler failure - with pytest.raises(CompoundError): - # The failing handler should not stop the pipeline - pipeline.catch(error_pipeline, failing_handler).to_list() diff --git a/tests/test_pipeline_composition.py b/tests/test_pipeline_composition.py index 0ad0a44..d6a724b 100644 --- a/tests/test_pipeline_composition.py +++ b/tests/test_pipeline_composition.py @@ -11,90 +11,103 @@ class TestPipelineComposition: """Test Pipeline composition functionality.""" - def test_pipeline_from_pipeline_init(self): - """Test creating a Pipeline from another Pipeline using __init__.""" - # Create source pipeline - source_pipeline = Pipeline([1, 2, 3, 4, 5]) + def test_pipeline_apply_method(self): + """Test applying one pipeline to another using apply method.""" + # Create source pipeline that doubles numbers + doubler_pipeline = Pipeline().map(lambda x: x * 2) - # Create new pipeline from the source - new_pipeline = 
Pipeline(source_pipeline) + # Create main pipeline that adds 1 + main_pipeline = Pipeline().map(lambda x: x + 1) - # Test that new pipeline works (source gets exhausted, so test new one) - result = new_pipeline.to_list() - assert result == [1, 2, 3, 4, 5] + # Apply the doubler to the main pipeline + composed = main_pipeline.apply(doubler_pipeline) - def test_from_pipeline_class_method(self): - """Test creating a Pipeline using from_pipeline class method.""" - # Create source pipeline - source_pipeline = Pipeline([1, 2, 3, 4, 5]) + # Test the composition: first add 1, then double + result = composed.to_list([1, 2, 3, 4, 5]) + assert result == [4, 6, 8, 10, 12] # (1+1)*2, (2+1)*2, (3+1)*2, (4+1)*2, (5+1)*2 - # Create new pipeline using class method - new_pipeline = Pipeline.from_pipeline(source_pipeline) + def test_pipeline_chaining_with_multiple_data_sources(self): + """Test chaining operations with multiple data sources.""" + pipeline = Pipeline() - # Test that they produce the same results - assert new_pipeline.to_list() == [1, 2, 3, 4, 5] + # Chain operations: double, then filter even, then add 10 + result = pipeline.map(lambda x: x * 2).filter(lambda x: x % 2 == 0).map(lambda x: x + 10) - def test_chain_method(self): - """Test chaining multiple pipelines.""" - pipeline1 = Pipeline([1, 2, 3]) - pipeline2 = Pipeline([4, 5, 6]) - pipeline3 = Pipeline([7, 8, 9]) + # Test with multiple data sources + data1 = [1, 2, 3] + data2 = [4, 5, 6] + result_list = result.to_list(data1, data2) - # Chain them together - chained = Pipeline.chain(pipeline1, pipeline2, pipeline3) + # Expected: double [1,2,3,4,5,6] -> [2,4,6,8,10,12], filter even -> [2,4,6,8,10,12], add 10 -> [12,14,16,18,20,22] + assert result_list == [12, 14, 16, 18, 20, 22] - # Should contain all elements in sequence - assert chained.to_list() == [1, 2, 3, 4, 5, 6, 7, 8, 9] + def test_complex_composition(self): + """Test complex pipeline composition.""" + # Create a pipeline that processes data in multiple steps + pipeline = Pipeline() - def test_chain_empty_pipelines(self): - """Test chaining with empty pipelines.""" - pipeline1 = Pipeline([1, 2]) - empty_pipeline = Pipeline([]) - pipeline2 = Pipeline([3, 4]) + # Complex chain: map to square, filter > 10, map to string, filter length > 1 + result = pipeline.map(lambda x: x**2).filter(lambda x: x > 10).map(str).filter(lambda s: len(s) > 1) - chained = Pipeline.chain(pipeline1, empty_pipeline, pipeline2) - assert chained.to_list() == [1, 2, 3, 4] + # Test with data [1, 2, 3, 4, 5, 6] + # Square: [1, 4, 9, 16, 25, 36] + # Filter > 10: [16, 25, 36] + # To string: ["16", "25", "36"] + # Filter length > 1: ["16", "25", "36"] (all have length 2) + result_list = result.to_list([1, 2, 3, 4, 5, 6]) + assert result_list == ["16", "25", "36"] - def test_chain_single_pipeline(self): - """Test chaining with a single pipeline.""" - pipeline = Pipeline([1, 2, 3]) - chained = Pipeline.chain(pipeline) + def test_pipeline_apply_with_empty_data(self): + """Test apply method with empty data.""" + doubler = Pipeline().map(lambda x: x * 2) + main = Pipeline().map(lambda x: x + 1) - assert chained.to_list() == [1, 2, 3] + composed = main.apply(doubler) + result = composed.to_list([]) + assert result == [] - def test_pipeline_composition_with_operations(self): - """Test that pipeline composition works with transformations.""" - # Create source pipeline with transformations - source = Pipeline([1, 2, 3, 4, 5]).map(lambda x: x * 2) + def test_pipeline_apply_with_filters(self): + """Test apply method combining 
filters and maps.""" + # Create a pipeline that filters even numbers + even_filter = Pipeline().filter(lambda x: x % 2 == 0) - # Create new pipeline from transformed source - new_pipeline = Pipeline.from_pipeline(source) - filtered = new_pipeline.filter(lambda x: x > 5) + # Create a pipeline that doubles numbers + doubler = Pipeline().map(lambda x: x * 2) - assert filtered.to_list() == [6, 8, 10] + # Apply even filter first, then doubler + composed = even_filter.apply(doubler) - def test_chain_with_different_types(self): - """Test chaining pipelines with different but compatible types.""" - numbers = Pipeline([1, 2, 3]) - strings = Pipeline(["4", "5", "6"]) + result = composed.to_list([1, 2, 3, 4, 5, 6]) + assert result == [4, 8, 12] # Even numbers [2, 4, 6] doubled - # Chain and then transform to make types consistent - chained = Pipeline.chain(numbers, strings) - all_strings = chained.map(str) + def test_multiple_pipeline_applications(self): + """Test applying multiple pipelines in sequence.""" + # Create individual transformation pipelines + add_one = Pipeline().map(lambda x: x + 1) + double = Pipeline().map(lambda x: x * 2) + to_string = Pipeline().map(str) - assert all_strings.to_list() == ["1", "2", "3", "4", "5", "6"] + # Chain them using apply + result_pipeline = add_one.apply(double).apply(to_string) + + # Test: (1+1)*2=4, (2+1)*2=6, (3+1)*2=8 -> ["4", "6", "8"] + result = result_pipeline.to_list([1, 2, 3]) + assert result == ["4", "6", "8"] def test_complex_pipeline_composition(self): """Test complex pipeline composition scenario.""" - # Create multiple source pipelines - evens = Pipeline([2, 4, 6, 8]) - odds = Pipeline([1, 3, 5, 7]) - - # Chain them - all_numbers = Pipeline.chain(evens, odds) - - # Apply transformations - result = all_numbers.filter(lambda x: x > 3).map(lambda x: x**2).reduce(lambda acc, x: acc + x, 0) - - # Expected: [4, 6, 8, 5, 7] -> [16, 36, 64, 25, 49] -> 190 - assert result.first() == 190 + # Use a single pipeline with multiple data sources instead of chaining + pipeline = Pipeline() + + # Apply transformations: filter > 3, square, then reduce to sum + result = ( + pipeline.filter(lambda x: x > 3) + .map(lambda x: x**2) + .reduce([2, 4, 6, 8], [1, 3, 5, 7], function=lambda acc, x: acc + x, initial=0) + ) + + # Data: [2, 4, 6, 8] + [1, 3, 5, 7] = [2, 4, 6, 8, 1, 3, 5, 7] + # Filter > 3: [4, 6, 8, 5, 7] + # Square: [16, 36, 64, 25, 49] + # Sum: 190 + assert result == 190 diff --git a/tests/test_pipeline_each.py b/tests/test_pipeline_each.py index ac473d6..4dbfe4d 100644 --- a/tests/test_pipeline_each.py +++ b/tests/test_pipeline_each.py @@ -13,37 +13,37 @@ class TestPipelineEach: def test_each_basic(self): """Test basic each operations.""" - pipeline = Pipeline([1, 2, 3, 4, 5]) + pipeline = Pipeline() side_effects = [] # Each should execute function for each item - pipeline.each(side_effects.append) + pipeline.each([1, 2, 3, 4, 5], function=side_effects.append) assert side_effects == [1, 2, 3, 4, 5] def test_each_returns_none(self): """Test that each returns None.""" - pipeline = Pipeline([1, 2, 3, 4, 5]) + pipeline = Pipeline() - result = pipeline.each(lambda x: None) + result = pipeline.each([1, 2, 3, 4, 5], function=lambda x: None) assert result is None def test_each_with_empty_pipeline(self): """Test each with empty pipeline.""" - empty_pipeline = Pipeline([]) + pipeline = Pipeline() side_effects = [] # Should not execute function - empty_pipeline.each(side_effects.append) + pipeline.each([], function=side_effects.append) assert side_effects == [] def 
test_each_terminal_operation(self): """Test that each is a terminal operation.""" - pipeline = Pipeline([1, 2, 3, 4, 5]) + pipeline = Pipeline() side_effects = [] # After each, pipeline should be consumed - pipeline.each(side_effects.append) + pipeline.each([1, 2, 3, 4, 5], function=side_effects.append) assert side_effects == [1, 2, 3, 4, 5] diff --git a/tests/test_pipeline_edge_cases.py b/tests/test_pipeline_edge_cases.py index d4530aa..b119e15 100644 --- a/tests/test_pipeline_edge_cases.py +++ b/tests/test_pipeline_edge_cases.py @@ -14,51 +14,50 @@ class TestPipelineEdgeCases: def test_pipeline_with_none_values(self): """Test pipeline with None values.""" # Should handle None values properly - pipeline1 = Pipeline([1, None, 3, None, 5]) - result = pipeline1.to_list() + pipeline = Pipeline() + result = pipeline.to_list([1, None, 3, None, 5]) assert result == [1, None, 3, None, 5] # Filter out None values - use fresh pipeline - pipeline2 = Pipeline([1, None, 3, None, 5]) + pipeline2 = Pipeline() no_none = pipeline2.filter(lambda x: x is not None) - assert no_none.to_list() == [1, 3, 5] + assert no_none.to_list([1, None, 3, None, 5]) == [1, 3, 5] def test_pipeline_with_mixed_types(self): """Test pipeline with mixed data types.""" # Should handle mixed types - pipeline1 = Pipeline([1, "hello", 3.14, True, None]) - result = pipeline1.to_list() + pipeline = Pipeline() + result = pipeline.to_list([1, "hello", 3.14, True, None]) assert result == [1, "hello", 3.14, True, None] # Filter by type (excluding boolean which is a subclass of int) - use fresh pipeline - pipeline2 = Pipeline([1, "hello", 3.14, True, None]) + pipeline2 = Pipeline() numbers = pipeline2.filter(lambda x: isinstance(x, int | float) and not isinstance(x, bool) and x is not None) - assert numbers.to_list() == [1, 3.14] + assert numbers.to_list([1, "hello", 3.14, True, None]) == [1, 3.14] def test_pipeline_reuse(self): """Test that pipelines can be reused.""" original_data = [1, 2, 3, 4, 5] - pipeline = Pipeline(original_data) + pipeline = Pipeline() # First use - result1 = pipeline.map(lambda x: x * 2).to_list() + result1 = pipeline.map(lambda x: x * 2).to_list(original_data) assert result1 == [2, 4, 6, 8, 10] - # Create new pipeline from same data - pipeline2 = Pipeline(original_data) - result2 = pipeline2.filter(lambda x: x % 2 == 0).to_list() - assert result2 == [2, 4] + # Pipeline should be reusable with different data + new_data = [10, 20, 30] + result2 = pipeline.to_list(new_data) # Same pipeline, different data + assert result2 == [20, 40, 60] def test_pipeline_method_chaining_returns_new_instances(self): """Test that pipeline methods return new instances.""" - original = Pipeline([1, 2, 3, 4, 5]) + # Create separate pipelines for different operations + mapped = Pipeline().map(lambda x: x * 2) + filtered = Pipeline().filter(lambda x: x % 2 == 0) - # Methods should return new instances (except passthrough) - mapped = original.map(lambda x: x * 2) - filtered = original.filter(lambda x: x % 2 == 0) - - assert mapped is not original - assert filtered is not original + # Test that they work correctly with different transformations + assert mapped.to_list([1, 2, 3, 4, 5]) == [2, 4, 6, 8, 10] + assert filtered.to_list([1, 2, 3, 4, 5]) == [2, 4] assert mapped is not filtered def test_pipeline_with_generator_exhaustion(self): @@ -67,11 +66,12 @@ def test_pipeline_with_generator_exhaustion(self): def number_generator(): yield from range(3) - pipeline = Pipeline(number_generator()) + pipeline = Pipeline() # First consumption - 
result1 = pipeline.to_list() + result1 = pipeline.to_list(number_generator()) assert result1 == [0, 1, 2] - # Generator should be exhausted now - # This behavior depends on the implementation + # New generator for second consumption (generators are single-use) + result2 = pipeline.to_list(number_generator()) + assert result2 == [0, 1, 2] diff --git a/tests/test_pipeline_filtering.py b/tests/test_pipeline_filtering.py index f87b985..a6c161c 100644 --- a/tests/test_pipeline_filtering.py +++ b/tests/test_pipeline_filtering.py @@ -14,47 +14,43 @@ class TestPipelineFiltering: def test_filter_basic(self): """Test basic filtering operations.""" # Filter even numbers - pipeline1 = Pipeline([1, 2, 3, 4, 5]) - even_pipeline = pipeline1.filter(lambda x: x % 2 == 0) - assert even_pipeline.to_list() == [2, 4] + pipeline1 = Pipeline().filter(lambda x: x % 2 == 0) + assert pipeline1.to_list([1, 2, 3, 4, 5]) == [2, 4] # Filter numbers greater than 3 - use fresh pipeline - pipeline2 = Pipeline([1, 2, 3, 4, 5]) - gt3_pipeline = pipeline2.filter(lambda x: x > 3) - assert gt3_pipeline.to_list() == [4, 5] + pipeline2 = Pipeline().filter(lambda x: x > 3) + assert pipeline2.to_list([1, 2, 3, 4, 5]) == [4, 5] def test_filter_with_strings(self): """Test filtering with string data.""" # Filter strings longer than 4 characters - pipeline1 = Pipeline(["hello", "world", "python", "test"]) - long_strings = pipeline1.filter(lambda s: len(s) > 4) - assert long_strings.to_list() == ["hello", "world", "python"] + pipeline1 = Pipeline().filter(lambda s: len(s) > 4) + assert pipeline1.to_list(["hello", "world", "python", "test"]) == ["hello", "world", "python"] # Filter strings starting with 'p' - use fresh pipeline - pipeline2 = Pipeline(["hello", "world", "python", "test"]) - p_strings = pipeline2.filter(lambda s: s.startswith("p")) - assert p_strings.to_list() == ["python"] + pipeline2 = Pipeline().filter(lambda s: s.startswith("p")) + assert pipeline2.to_list(["hello", "world", "python", "test"]) == ["python"] def test_filter_empty_result(self): """Test filter that results in empty pipeline.""" - pipeline = Pipeline([1, 2, 3, 4, 5]) + pipeline = Pipeline() # Filter that matches nothing empty_result = pipeline.filter(lambda x: x > 10) - assert empty_result.to_list() == [] + assert empty_result.to_list([1, 2, 3, 4, 5]) == [] def test_filter_all_pass(self): """Test filter where all items pass.""" - pipeline = Pipeline([1, 2, 3, 4, 5]) + pipeline = Pipeline() # Filter that passes everything all_pass = pipeline.filter(lambda x: True) - assert all_pass.to_list() == [1, 2, 3, 4, 5] + assert all_pass.to_list([1, 2, 3, 4, 5]) == [1, 2, 3, 4, 5] def test_filter_chaining(self): """Test chaining multiple filters.""" - pipeline = Pipeline([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) + pipeline = Pipeline() # Chain filters: even numbers and greater than 5 result = pipeline.filter(lambda x: x % 2 == 0).filter(lambda x: x > 5) - assert result.to_list() == [6, 8, 10] + assert result.to_list([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) == [6, 8, 10] diff --git a/tests/test_pipeline_integration.py b/tests/test_pipeline_integration.py index f4425db..d1d0569 100644 --- a/tests/test_pipeline_integration.py +++ b/tests/test_pipeline_integration.py @@ -22,35 +22,35 @@ def test_data_processing_pipeline(self): {"name": "Eve", "age": 22, "active": False}, ] - pipeline = Pipeline(users) + pipeline = Pipeline() # Process: filter active users, extract names, convert to uppercase result = pipeline.filter(lambda user: user["active"]).map(lambda user: 
user["name"]).map(str.upper) - assert result.to_list() == ["ALICE", "CHARLIE", "DIANA"] + assert result.to_list(users) == ["ALICE", "CHARLIE", "DIANA"] def test_number_processing_pipeline(self): """Test a number processing pipeline.""" numbers = range(1, 21) # 1 to 20 - pipeline = Pipeline(numbers) + pipeline = Pipeline() # Process: filter even numbers, square them, filter > 50, sum result = ( pipeline.filter(lambda x: x % 2 == 0) # [2, 4, 6, 8, 10, 12, 14, 16, 18, 20] .map(lambda x: x**2) # [4, 16, 36, 64, 100, 144, 196, 256, 324, 400] .filter(lambda x: x > 50) # [64, 100, 144, 196, 256, 324, 400] - .reduce(lambda acc, x: acc + x, 0) + .reduce(numbers, function=lambda acc, x: acc + x, initial=0) ) # 1484 - assert result.first() == 1484 + assert result == 1484 def test_text_processing_pipeline(self): """Test a text processing pipeline.""" text = "Hello world! This is a test. Python is amazing." words = text.split() - pipeline = Pipeline(words) + pipeline = Pipeline() # Process: filter words > 3 chars, remove punctuation, lowercase, get unique result = ( @@ -61,13 +61,13 @@ def test_text_processing_pipeline(self): ) # Simple "unique" filter expected = ["hello", "world", "test", "python", "amazing"] - assert result.to_list() == expected + assert result.to_list(words) == expected def test_nested_data_processing(self): """Test processing nested data structures.""" data = [[1, 2, 3], [4, 5, 6], [7, 8, 9]] - pipeline = Pipeline(data) + pipeline = Pipeline() # Flatten, filter odd numbers, square them result = ( @@ -76,4 +76,4 @@ def test_nested_data_processing(self): .map(lambda x: x**2) ) # [1, 9, 25, 49, 81] - assert result.to_list() == [1, 9, 25, 49, 81] + assert result.to_list(data) == [1, 9, 25, 49, 81] diff --git a/tests/test_pipeline_map_filter.py b/tests/test_pipeline_map_filter.py index d09e171..1b85c5a 100644 --- a/tests/test_pipeline_map_filter.py +++ b/tests/test_pipeline_map_filter.py @@ -13,23 +13,23 @@ class TestPipelineMapFilter: def test_map_then_filter(self): """Test mapping then filtering.""" - pipeline = Pipeline([1, 2, 3, 4, 5]) + pipeline = Pipeline() # Double numbers, then filter even results result = pipeline.map(lambda x: x * 2).filter(lambda x: x % 2 == 0) - assert result.to_list() == [2, 4, 6, 8, 10] + assert result.to_list([1, 2, 3, 4, 5]) == [2, 4, 6, 8, 10] def test_filter_then_map(self): """Test filtering then mapping.""" - pipeline = Pipeline([1, 2, 3, 4, 5]) + pipeline = Pipeline() # Filter even numbers, then double them result = pipeline.filter(lambda x: x % 2 == 0).map(lambda x: x * 2) - assert result.to_list() == [4, 8] + assert result.to_list([1, 2, 3, 4, 5]) == [4, 8] def test_complex_map_filter_chain(self): """Test complex chaining of map and filter operations.""" - pipeline = Pipeline([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) + pipeline = Pipeline() # Complex chain: filter odd, multiply by 3, filter > 10 result = ( @@ -38,4 +38,4 @@ def test_complex_map_filter_chain(self): .filter(lambda x: x > 10) ) # [15, 21, 27] - assert result.to_list() == [15, 21, 27] + assert result.to_list([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) == [15, 21, 27] diff --git a/tests/test_pipeline_mapping.py b/tests/test_pipeline_mapping.py index 00a0814..5950a57 100644 --- a/tests/test_pipeline_mapping.py +++ b/tests/test_pipeline_mapping.py @@ -14,51 +14,45 @@ class TestPipelineMapping: def test_map_basic(self): """Test basic mapping operations.""" # Double each number - pipeline1 = Pipeline([1, 2, 3, 4, 5]) - doubled = pipeline1.map(lambda x: x * 2) - assert doubled.to_list() == [2, 4, 6, 8, 10] + 
pipeline1 = Pipeline().map(lambda x: x * 2) + assert pipeline1.to_list([1, 2, 3, 4, 5]) == [2, 4, 6, 8, 10] # Square each number - use fresh pipeline - pipeline2 = Pipeline([1, 2, 3, 4, 5]) - squared = pipeline2.map(lambda x: x**2) - assert squared.to_list() == [1, 4, 9, 16, 25] + pipeline2 = Pipeline().map(lambda x: x**2) + assert pipeline2.to_list([1, 2, 3, 4, 5]) == [1, 4, 9, 16, 25] def test_map_type_transformation(self): """Test mapping with type transformation.""" # Convert numbers to strings - pipeline1 = Pipeline([1, 2, 3, 4, 5]) - str_pipeline = pipeline1.map(str) - assert str_pipeline.to_list() == ["1", "2", "3", "4", "5"] + pipeline1 = Pipeline().map(str) + assert pipeline1.to_list([1, 2, 3, 4, 5]) == ["1", "2", "3", "4", "5"] # Convert to boolean (non-zero is True) - use fresh pipeline - pipeline2 = Pipeline([1, 2, 3, 4, 5]) - bool_pipeline = pipeline2.map(bool) - assert bool_pipeline.to_list() == [True, True, True, True, True] + pipeline2 = Pipeline().map(bool) + assert pipeline2.to_list([1, 2, 3, 4, 5]) == [True, True, True, True, True] def test_map_with_strings(self): """Test mapping with string data.""" # Convert to uppercase - pipeline1 = Pipeline(["hello", "world", "python"]) - upper_pipeline = pipeline1.map(str.upper) - assert upper_pipeline.to_list() == ["HELLO", "WORLD", "PYTHON"] + pipeline1 = Pipeline().map(str.upper) + assert pipeline1.to_list(["hello", "world", "python"]) == ["HELLO", "WORLD", "PYTHON"] # Get string lengths - use fresh pipeline - pipeline2 = Pipeline(["hello", "world", "python"]) - len_pipeline = pipeline2.map(len) - assert len_pipeline.to_list() == [5, 5, 6] + pipeline2 = Pipeline().map(len) + assert pipeline2.to_list(["hello", "world", "python"]) == [5, 5, 6] def test_map_chaining(self): """Test chaining multiple map operations.""" - pipeline = Pipeline([1, 2, 3, 4, 5]) + pipeline = Pipeline() # Chain maps: double, then add 1 result = pipeline.map(lambda x: x * 2).map(lambda x: x + 1) - assert result.to_list() == [3, 5, 7, 9, 11] + assert result.to_list([1, 2, 3, 4, 5]) == [3, 5, 7, 9, 11] def test_map_empty_pipeline(self): """Test mapping on empty pipeline.""" - empty_pipeline = Pipeline([]) + pipeline = Pipeline() # Map should return empty result - result = empty_pipeline.map(lambda x: x * 2) - assert result.to_list() == [] + result = pipeline.map(lambda x: x * 2) + assert result.to_list([]) == [] diff --git a/tests/test_pipeline_reduce.py b/tests/test_pipeline_reduce.py index a1072f7..0d2a579 100644 --- a/tests/test_pipeline_reduce.py +++ b/tests/test_pipeline_reduce.py @@ -14,39 +14,41 @@ class TestPipelineReduce: def test_reduce_basic(self): """Test basic reduce operations.""" # Sum all numbers - pipeline1 = Pipeline([1, 2, 3, 4, 5]) - sum_result = pipeline1.reduce(lambda acc, x: acc + x, 0) - assert sum_result.first() == 15 + pipeline1 = Pipeline() + sum_result = pipeline1.reduce([1, 2, 3, 4, 5], function=lambda acc, x: acc + x, initial=0) + assert sum_result == 15 # Multiply all numbers - use fresh pipeline - pipeline2 = Pipeline([1, 2, 3, 4, 5]) - product_result = pipeline2.reduce(lambda acc, x: acc * x, 1) - assert product_result.first() == 120 + pipeline2 = Pipeline() + product_result = pipeline2.reduce([1, 2, 3, 4, 5], function=lambda acc, x: acc * x, initial=1) + assert product_result == 120 def test_reduce_with_strings(self): """Test reduce with string data.""" # Concatenate strings - pipeline1 = Pipeline(["hello", "world", "python"]) - concat_result = pipeline1.reduce(lambda acc, x: acc + " " + x, "") - assert concat_result.first() 
== " hello world python" + pipeline1 = Pipeline() + concat_result = pipeline1.reduce(["hello", "world", "python"], function=lambda acc, x: acc + " " + x, initial="") + assert concat_result == " hello world python" # Join with commas - use fresh pipeline - pipeline2 = Pipeline(["hello", "world", "python"]) - join_result = pipeline2.reduce(lambda acc, x: acc + "," + x if acc else x, "") - assert join_result.first() == "hello,world,python" + pipeline2 = Pipeline() + join_result = pipeline2.reduce( + ["hello", "world", "python"], function=lambda acc, x: acc + "," + x if acc else x, initial="" + ) + assert join_result == "hello,world,python" def test_reduce_empty_pipeline(self): """Test reduce on empty pipeline.""" - empty_pipeline = Pipeline([]) + pipeline = Pipeline() # Reduce should return initial value - result = empty_pipeline.reduce(lambda acc, x: acc + x, 10) - assert result.first() == 10 + result = pipeline.reduce([], function=lambda acc, x: acc + x, initial=10) + assert result == 10 def test_reduce_single_item(self): """Test reduce with single item.""" - single_pipeline = Pipeline([42]) + pipeline = Pipeline() # Should combine initial value with single item - result = single_pipeline.reduce(lambda acc, x: acc + x, 10) - assert result.first() == 52 + result = pipeline.reduce([42], function=lambda acc, x: acc + x, initial=10) + assert result == 52 diff --git a/tests/test_pipeline_tap.py b/tests/test_pipeline_tap.py index d835298..f22661b 100644 --- a/tests/test_pipeline_tap.py +++ b/tests/test_pipeline_tap.py @@ -13,43 +13,43 @@ class TestPipelineTap: def test_tap_basic(self): """Test basic tap operations.""" - pipeline = Pipeline([1, 2, 3, 4, 5]) + pipeline = Pipeline() side_effects = [] # Tap to collect side effects result = pipeline.tap(side_effects.append) - result_list = result.to_list() + result_list = result.to_list([1, 2, 3, 4, 5]) assert result_list == [1, 2, 3, 4, 5] assert side_effects == [1, 2, 3, 4, 5] def test_tap_with_print(self): """Test tap with print (no assertion needed, just verify it works).""" - pipeline = Pipeline([1, 2, 3]) + pipeline = Pipeline() # This should not raise any exceptions result = pipeline.tap(lambda x: None) # Mock print - assert result.to_list() == [1, 2, 3] + assert result.to_list([1, 2, 3]) == [1, 2, 3] def test_tap_chaining(self): """Test tap in a chain of operations.""" - pipeline = Pipeline([1, 2, 3, 4, 5]) + pipeline = Pipeline() side_effects = [] # Tap in middle of chain result = pipeline.map(lambda x: x * 2).tap(side_effects.append).filter(lambda x: x > 5) - result_list = result.to_list() + result_list = result.to_list([1, 2, 3, 4, 5]) assert result_list == [6, 8, 10] assert side_effects == [2, 4, 6, 8, 10] def test_tap_doesnt_modify_pipeline(self): """Test that tap doesn't modify the pipeline data.""" - pipeline = Pipeline([1, 2, 3, 4, 5]) + pipeline = Pipeline() # Tap with function that would modify if it could result = pipeline.tap(lambda x: x * 1000) # Data should be unchanged - assert result.to_list() == [1, 2, 3, 4, 5] + assert result.to_list([1, 2, 3, 4, 5]) == [1, 2, 3, 4, 5] diff --git a/tests/test_pipeline_utility.py b/tests/test_pipeline_utility.py index e76447f..96df4b6 100644 --- a/tests/test_pipeline_utility.py +++ b/tests/test_pipeline_utility.py @@ -2,133 +2,86 @@ Test Pipeline utility methods. This module contains tests for Pipeline utility operations -including passthrough, apply, flatten, and chunked processing. +including apply, flatten, and chunked processing. 
""" from efemel.pipeline import Pipeline -from efemel.pipeline import PipelineItem class TestPipelineUtility: """Test Pipeline utility methods.""" - def test_passthrough(self): - """Test passthrough method.""" - pipeline = Pipeline([1, 2, 3, 4, 5]) - - # Passthrough should return the same pipeline - result = pipeline.passthrough() - assert result is pipeline - - # Data should be unchanged - assert result.to_list() == [1, 2, 3, 4, 5] - def test_apply_function(self): - """Test apply with single function.""" - pipeline = Pipeline([1, 2, 3, 4, 5]) - - def double_pipeline(p): - return p.map(lambda x: x * 2) + """Test apply with pipeline function.""" + pipeline = Pipeline() + # Create another pipeline to apply + double_pipeline = Pipeline().map(lambda x: x * 2) + result = pipeline.apply(double_pipeline) - assert result.to_list() == [2, 4, 6, 8, 10] + assert result.to_list([1, 2, 3, 4, 5]) == [2, 4, 6, 8, 10] def test_flatten_basic(self): """Test basic flatten operation.""" - pipeline = Pipeline([[1, 2], [3, 4], [5]]) + pipeline = Pipeline() result = pipeline.flatten() - assert result.to_list() == [1, 2, 3, 4, 5] + assert result.to_list([[1, 2], [3, 4], [5]]) == [1, 2, 3, 4, 5] def test_flatten_empty_lists(self): """Test flatten with empty lists.""" - pipeline = Pipeline([[], [1, 2], [], [3]]) + pipeline = Pipeline() result = pipeline.flatten() - assert result.to_list() == [1, 2, 3] + assert result.to_list([[], [1, 2], [], [3]]) == [1, 2, 3] def test_flatten_nested_tuples(self): """Test flatten with nested tuples.""" - pipeline = Pipeline([(1, 2), (3, 4, 5), (6,)]) + pipeline = Pipeline() result = pipeline.flatten() - assert result.to_list() == [1, 2, 3, 4, 5, 6] + assert result.to_list([(1, 2), (3, 4, 5), (6,)]) == [1, 2, 3, 4, 5, 6] def test_flatten_chunked_processing(self): - """Test that flatten maintains chunked processing and doesn't consume entire pipeline.""" - - # Create a large dataset that would cause OOM if consumed all at once + """Test that flatten works with iterators and doesn't consume entire pipeline at once.""" + # Create a generator that yields nested data def generate_nested_data(): - for i in range(1000): + for i in range(10): # Smaller range for easier testing yield [i * 2, i * 2 + 1] - # Use small chunk size to verify chunked processing - pipeline = Pipeline(generate_nested_data(), chunk_size=10) + pipeline = Pipeline() result = pipeline.flatten() - # Verify first few elements without consuming the entire pipeline - first_10 = [] - count = 0 - for item in result: - first_10.append(item) - count += 1 - if count >= 10: - break + # Test the flattened result + actual = result.to_list(generate_nested_data()) + expected = [] + for i in range(10): + expected.extend([i * 2, i * 2 + 1]) - assert first_10 == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] + assert actual == expected - def test_flatten_preserves_chunk_structure(self): - """Test that flatten preserves chunked structure.""" - # Create pipeline with known chunk structure - data = [[1, 2], [3, 4], [5, 6], [7, 8]] - pipeline = Pipeline(data, chunk_size=2) # 2 sublists per chunk + def test_flatten_with_chunk_size(self): + """Test that flatten works correctly with different chunk sizes.""" + # Create pipeline with custom chunk size + pipeline = Pipeline(chunk_size=3) + + # Test data with varying sizes of sublists + data = [[1, 2], [3, 4, 5], [6], [7, 8, 9, 10]] result = pipeline.flatten() + assert result.to_list(data) == [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] - # Verify the result is correct - assert result.to_list() == [1, 2, 3, 4, 5, 6, 7, 8] + 
def test_flatten_preserves_chunk_size(self): + """Test that flatten preserves the original chunk_size setting.""" + chunk_size = 5 + pipeline = Pipeline(chunk_size=chunk_size) - def test_flatten_maintains_chunk_size(self): - """Test that flatten maintains the original chunk_size setting.""" - # Create test data with varying sizes of sublists - data = [[1, 2], [3, 4, 5], [6], [7, 8, 9, 10]] - chunk_size = 3 - pipeline = Pipeline(data, chunk_size=chunk_size) - - # Flatten the pipeline flattened = pipeline.flatten() # Verify chunk_size is preserved assert flattened.chunk_size == chunk_size - # Collect chunks to verify structure - chunks = [] - for chunk in flattened.generator: - chunks.append(chunk) - - # Verify all chunks except the last have the expected size - for i, chunk in enumerate(chunks[:-1]): - assert len(chunk) == chunk_size, f"Chunk {i} has size {len(chunk)}, expected {chunk_size}" - - # Last chunk can be smaller than chunk_size - if chunks: - assert len(chunks[-1]) <= chunk_size, f"Last chunk has size {len(chunks[-1])}, should be <= {chunk_size}" - - # Verify the final result is correct - result = [] - for chunk in chunks: - result.extend(chunk) - - # This is showing the internal representation of the data wheresecond value of the Tuple is the exception - assert result == [ - PipelineItem(1, None, {"has_errors": False, "errors": []}), - PipelineItem(2, None, {"has_errors": False, "errors": []}), - PipelineItem(3, None, {"has_errors": False, "errors": []}), - PipelineItem(4, None, {"has_errors": False, "errors": []}), - PipelineItem(5, None, {"has_errors": False, "errors": []}), - PipelineItem(6, None, {"has_errors": False, "errors": []}), - PipelineItem(7, None, {"has_errors": False, "errors": []}), - PipelineItem(8, None, {"has_errors": False, "errors": []}), - PipelineItem(9, None, {"has_errors": False, "errors": []}), - PipelineItem(10, None, {"has_errors": False, "errors": []}), - ] + # Test with actual data + data = [[1, 2], [3, 4], [5, 6], [7, 8]] + result = flattened.to_list(data) + assert result == [1, 2, 3, 4, 5, 6, 7, 8] From ce80b87d9f48deff3980e854657253d8b6f39d6d Mon Sep 17 00:00:00 2001 From: ringoldsdev Date: Mon, 14 Jul 2025 20:33:44 +0000 Subject: [PATCH 12/17] chore: cleanup --- performance_test.py | 6 +- pipeline.py | 263 --------------------------------- tests/test_pipeline_utility.py | 3 +- 3 files changed, 5 insertions(+), 267 deletions(-) delete mode 100644 pipeline.py diff --git a/performance_test.py b/performance_test.py index adb3c0d..5dd0374 100644 --- a/performance_test.py +++ b/performance_test.py @@ -15,7 +15,7 @@ import statistics import time -from pipeline import Pipeline +from efemel.pipeline import Pipeline def generate_test_data(size: int = 1_000_000) -> list[int]: @@ -54,9 +54,9 @@ def step4(items): def builtin_map_filter_approach(data: list[int]) -> list[int]: """Process data using built-in map/filter chaining.""" result = filter(lambda x: x % 2 == 0, data) # Keep even numbers - result = map(lambda x: x * 2, result) # Double them + result = (x * 2 for x in result) # Double them result = filter(lambda x: x > 100, result) # Keep only > 100 - result = map(lambda x: x + 1, result) # Add 1 + result = (x + 1 for x in result) # Add 1 return list(result) diff --git a/pipeline.py b/pipeline.py deleted file mode 100644 index 2178b7f..0000000 --- a/pipeline.py +++ /dev/null @@ -1,263 +0,0 @@ -from collections import deque -from collections.abc import Callable -from collections.abc import Iterable -from collections.abc import Iterator -from 
concurrent.futures import FIRST_COMPLETED -from concurrent.futures import ProcessPoolExecutor -from concurrent.futures import wait -from functools import reduce -import inspect -import itertools -from typing import Any -from typing import Union -from typing import overload - -# Type aliases for pipeline functions -type PipelineFunction[Out, T] = Callable[[Out], T] | Callable[[Out, PipelineContext], T] -type PipelineReduceFunction[U, Out] = Callable[[U, Out], U] | Callable[[U, Out, PipelineContext], U] - - -class PipelineContext(dict): - """Global context available to all pipeline operations.""" - - pass - - -class Pipeline[In, Out]: - """ - A functional pipeline for data processing with internal chunked processing. - - The Pipeline class provides a fluent interface for applying transformations - and filters. It processes data in chunks for performance and maintains a - global context for shared state. The pipeline is callable and iterable. - """ - - def __init__( - self, - chunk_size: int = 1000, - transformer: Callable[[list[In]], list[Out]] | None = None, - context: PipelineContext | None = None, - ) -> None: - self.chunk_size = chunk_size - self.context = context or PipelineContext() - self.transformer: Callable[[list[In]], list[Out]] = transformer or (lambda chunk: chunk) - - @classmethod - def init[T](cls, _type_hint: type[T], chunk_size: int = 1000) -> "Pipeline[T, T]": - """Create a new identity pipeline with explicit type hint.""" - return cls(chunk_size) - - def __call__(self, *data: Iterable[In]) -> Iterator[Out]: - """Execute the pipeline on data source (makes pipeline callable).""" - assert data, "Data source must be provided to execute the pipeline" - - for selected_data in data: - for chunk in self._chunk_generator(selected_data): - yield from self.transformer(chunk) - - def pipe[U](self, operation: Callable[[list[Out]], list[U]]) -> "Pipeline[In, U]": - """ - Composes the current transformer with a new chunk-wise operation. - The operation should be a function that takes a list and returns a list. 
- """ - prev_transformer = self.transformer - self.transformer = lambda chunk: operation(prev_transformer(chunk)) - return self - - def _create_context_aware_function(self, func: Callable): - """Create function that properly handles context parameter""" - try: - sig = inspect.signature(func) - params = [p for p in sig.parameters.values() if p.kind in (p.POSITIONAL_ONLY, p.POSITIONAL_OR_KEYWORD)] - param_count = len(params) - - if param_count >= 2: - return lambda value: func(value, self.context) - return func - except (ValueError, TypeError): - return func - - def _create_reduce_function(self, func: Callable): - """Create function that properly handles context parameter for reduce operations""" - try: - sig = inspect.signature(func) - params = [p for p in sig.parameters.values() if p.kind in (p.POSITIONAL_ONLY, p.POSITIONAL_OR_KEYWORD)] - param_count = len(params) - - if param_count >= 3: - return lambda acc, value: func(acc, value, self.context) - return func - except (ValueError, TypeError): - return func - - def map[U](self, function: PipelineFunction[Out, U]) -> "Pipeline[In, U]": - """Transform elements in the pipeline.""" - std_function = self._create_context_aware_function(function) - return self.pipe(lambda chunk: [std_function(x) for x in chunk]) - - def filter(self, predicate: PipelineFunction[Out, bool]) -> "Pipeline[In, Out]": - """Filter elements in the pipeline.""" - std_predicate = self._create_context_aware_function(predicate) - return self.pipe(lambda chunk: [x for x in chunk if std_predicate(x)]) - - def _chunk_generator(self, data: Iterable[In]) -> Iterator[list[In]]: - """Generate chunks from the data source""" - data_iter = iter(data) - while chunk := list(itertools.islice(data_iter, self.chunk_size)): - yield chunk - - def to_list(self, *data: Iterable[In]) -> list[Out]: - """Execute pipeline and return results as list.""" - return [item for d in data for chunk in self._chunk_generator(d) for item in self.transformer(chunk)] - - def each(self, *data: Iterable[In], function: PipelineFunction[Out, Any]) -> None: - """Apply function to each element (terminal operation).""" - std_function = self._create_context_aware_function(function) - - for chunk in self._chunk_generator(itertools.chain.from_iterable(data)): - [std_function(item) for item in self.transformer(chunk)] - - def tap(self, function: PipelineFunction[Out, Any]) -> "Pipeline[In, Out]": - """Apply side-effect without modifying data.""" - std_function = self._create_context_aware_function(function) - - def tap_operation(chunk: list[Out]) -> list[Out]: - for item in chunk: - std_function(item) - return chunk - - return self.pipe(tap_operation) - - def consume(self, *data: Iterable[In]) -> None: - """Consume the pipeline without returning results (terminal operation).""" - for _ in self(*data): - pass - - @overload - def flatten(self: "Pipeline[list[In]]") -> "Pipeline[In]": ... - - @overload - def flatten(self: "Pipeline[tuple[In, ...]]") -> "Pipeline[In]": ... - - @overload - def flatten(self: "Pipeline[set[In]]") -> "Pipeline[In]": ... 
- - def flatten( - self: Union["Pipeline[list[In], In]", "Pipeline[tuple[In, ...]]", "Pipeline[set[In]]"], - ) -> "Pipeline[In, In]": - """Flatten nested lists into a single list.""" - return self.pipe(lambda chunk: [item for sublist in chunk for item in sublist]) - - def apply[T](self, pipeline: "Pipeline[Out, T]") -> "Pipeline[In, T]": - """Apply another pipeline to the current one.""" - return self.pipe(lambda chunk: pipeline.transformer(chunk)) - - def first(self, *data: Iterable[In], n: int = 1) -> list[Out]: - """Get the first n elements of the pipeline (terminal operation).""" - assert n >= 1, "n must be at least 1" - data_iter = itertools.chain.from_iterable(data) - result = [] - remaining = n - - while remaining > 0: - selected_item_count = min(self.chunk_size, remaining) - - # Read a chunk of size min(chunk_size, remaining) - chunk = list(itertools.islice(data_iter, selected_item_count)) - - if not chunk: - break # No more data - - result.extend(self.transformer(chunk)) - - remaining -= selected_item_count - - return result - - def reduce[U](self, *data: Iterable[In], function: PipelineReduceFunction[U, Out], initial: U) -> U: - """ - Reduce elements to a single value (terminal operation). - - Args: - function: Reduce function (acc, value) -> new_acc - initial: Initial accumulator value - """ - reducer = self._create_reduce_function(function) - return reduce(reducer, self(*data), initial) - - def concurrent[T]( - self, - *data: Iterable[In], - pipeline: "Pipeline[Out,T]", - max_workers: int, - ordered: bool = True, - ) -> Iterator[T]: - """ - Applies a sub-pipeline to each chunk in parallel (Terminal Operation). - - This method consumes the pipeline and processes chunks of data concurrently, - returning an iterator for the results. It is a terminal operation. - - To manage memory effectively, it uses a just-in-time approach, only - pulling chunks from the source iterator when a worker thread is available. - - Args: - *data: The input data iterable(s) to process. - pipeline: A `Pipeline` instance to apply to each chunk. A deep - copy is used for each thread to ensure context isolation. - max_workers: The maximum number of worker threads. - ordered: If True, results are yielded in the original order. - If False, results are yielded as they complete. - - Returns: - An iterator that yields the processed items. - """ - - def process_chunk(chunk: list[Out]) -> list[T]: - return pipeline.to_list(chunk) - - def _ordered_generator(chunks_iter: Iterator[list[Out]], executor: ProcessPoolExecutor) -> Iterator[list[T]]: - futures = deque() - for _ in range(max_workers + 1): - try: - chunk = next(chunks_iter) - futures.append(executor.submit(process_chunk, chunk)) - except StopIteration: - break - - while futures: - yield futures.popleft().result() - try: - chunk = next(chunks_iter) - futures.append(executor.submit(process_chunk, chunk)) - except StopIteration: - continue - - def _unordered_generator(chunks_iter: Iterator[list[Out]], executor: ProcessPoolExecutor) -> Iterator[list[T]]: - futures = {executor.submit(process_chunk, chunk) for chunk in itertools.islice(chunks_iter, max_workers + 1)} - - while futures: - done, futures = wait(futures, return_when=FIRST_COMPLETED) - for future in done: - yield future.result() - try: - chunk = next(chunks_iter) - futures.add(executor.submit(process_chunk, chunk)) - except StopIteration: - continue - - # This wrapper generator ensures the ProcessPoolExecutor is properly - # managed and closed only after the result iterator is exhausted. 
- def result_iterator_manager() -> Iterator[T]: - with ProcessPoolExecutor(max_workers=max_workers) as executor: - # Create the stream of chunks to be processed concurrently. - chunks_to_process = (self.transformer(chunk) for d in data for chunk in self._chunk_generator(d)) - - gen_func = _ordered_generator if ordered else _unordered_generator - processed_chunks_iterator = gen_func(chunks_to_process, executor) - - # Flatten the iterator of chunks into an iterator of items. - for result_chunk in processed_chunks_iterator: - yield from result_chunk - - return result_iterator_manager() diff --git a/tests/test_pipeline_utility.py b/tests/test_pipeline_utility.py index 96df4b6..f5839b8 100644 --- a/tests/test_pipeline_utility.py +++ b/tests/test_pipeline_utility.py @@ -17,7 +17,7 @@ def test_apply_function(self): # Create another pipeline to apply double_pipeline = Pipeline().map(lambda x: x * 2) - + result = pipeline.apply(double_pipeline) assert result.to_list([1, 2, 3, 4, 5]) == [2, 4, 6, 8, 10] @@ -44,6 +44,7 @@ def test_flatten_nested_tuples(self): def test_flatten_chunked_processing(self): """Test that flatten works with iterators and doesn't consume entire pipeline at once.""" + # Create a generator that yields nested data def generate_nested_data(): for i in range(10): # Smaller range for easier testing From 88024b2e9e0b6e02b4cb06929d8c6e32c29c6a82 Mon Sep 17 00:00:00 2001 From: ringoldsdev Date: Mon, 14 Jul 2025 22:04:56 +0000 Subject: [PATCH 13/17] chore: broke up pipeline into pipeline and transformer --- efemel/pipeline/pipeline.py | 72 ++++++++++ efemel/pipeline/transformers/transformer.py | 132 +++++++++++++++++ efemel/{pipeline.py => pipeline_old.py} | 0 tests/test_pipeline_concurrency.py | 134 ------------------ {tests => tests_old}/test_pipeline.py | 0 {tests => tests_old}/test_pipeline_basics.py | 0 .../test_pipeline_composition.py | 0 tests_old/test_pipeline_concurrency.py | 87 ++++++++++++ {tests => tests_old}/test_pipeline_context.py | 42 +----- {tests => tests_old}/test_pipeline_each.py | 0 .../test_pipeline_edge_cases.py | 0 .../test_pipeline_filtering.py | 0 .../test_pipeline_integration.py | 0 .../test_pipeline_map_filter.py | 0 {tests => tests_old}/test_pipeline_mapping.py | 0 {tests => tests_old}/test_pipeline_reduce.py | 0 {tests => tests_old}/test_pipeline_tap.py | 0 {tests => tests_old}/test_pipeline_utility.py | 0 18 files changed, 295 insertions(+), 172 deletions(-) create mode 100644 efemel/pipeline/pipeline.py create mode 100644 efemel/pipeline/transformers/transformer.py rename efemel/{pipeline.py => pipeline_old.py} (100%) delete mode 100644 tests/test_pipeline_concurrency.py rename {tests => tests_old}/test_pipeline.py (100%) rename {tests => tests_old}/test_pipeline_basics.py (100%) rename {tests => tests_old}/test_pipeline_composition.py (100%) create mode 100644 tests_old/test_pipeline_concurrency.py rename {tests => tests_old}/test_pipeline_context.py (64%) rename {tests => tests_old}/test_pipeline_each.py (100%) rename {tests => tests_old}/test_pipeline_edge_cases.py (100%) rename {tests => tests_old}/test_pipeline_filtering.py (100%) rename {tests => tests_old}/test_pipeline_integration.py (100%) rename {tests => tests_old}/test_pipeline_map_filter.py (100%) rename {tests => tests_old}/test_pipeline_mapping.py (100%) rename {tests => tests_old}/test_pipeline_reduce.py (100%) rename {tests => tests_old}/test_pipeline_tap.py (100%) rename {tests => tests_old}/test_pipeline_utility.py (100%) diff --git a/efemel/pipeline/pipeline.py 
b/efemel/pipeline/pipeline.py new file mode 100644 index 0000000..641042c --- /dev/null +++ b/efemel/pipeline/pipeline.py @@ -0,0 +1,72 @@ +from collections.abc import Callable +from collections.abc import Iterable +from collections.abc import Iterator +import itertools +from typing import Any +from typing import TypedDict +from typing import TypeVar + +from .transformers.transformer import Transformer + +# --- Type Aliases --- +T = TypeVar("T") +PipelineFunction = Callable[[T], Any] + + +class PipelineContext(TypedDict): + """Global context available to all pipeline operations.""" + + pass + + +class Pipeline[T]: + """ + Manages a data source and applies transformers to it. + Provides terminal operations to consume the resulting data. + """ + + def __init__(self, *data: Iterable[T]): + self.data_source: Iterable[T] = itertools.chain.from_iterable(data) + self.processed_data: Iterator = iter(self.data_source) + + def apply[U](self, transformer: Transformer[T, U] | Callable[[Iterable[T]], Iterator[U]]) -> "Pipeline[U]": + """ + Applies a transformer to the current data source. + """ + # The transformer is called with the current data, producing a new iterator + self.processed_data = transformer(self.data_source) + # We must cast self to the new type. + return self # type: ignore + + def __iter__(self) -> Iterator[T]: + """Allows the pipeline to be iterated over.""" + yield from self.processed_data + + def to_list(self) -> list[T]: + """Executes the pipeline and returns the results as a list.""" + return list(self.processed_data) + + def each(self, function: PipelineFunction[T]) -> None: + """Applies a function to each element (terminal operation).""" + # Context needs to be accessed from the function if it's context-aware, + # but the pipeline itself doesn't own a context. This is a design choice. + # For simplicity, we assume the function is not context-aware here + # or that context is handled within the Transformers. + for item in self.processed_data: + function(item) + + def first(self, n: int = 1) -> list[T]: + """Gets the first n elements of the pipeline (terminal operation).""" + assert n >= 1, "n must be at least 1" + return list(itertools.islice(self.processed_data, n)) + + def consume(self) -> None: + """Consumes the pipeline without returning results.""" + for _ in self.processed_data: + pass + + +t = Transformer.init(int).map(lambda x: x * 2).reduce(lambda acc, x: acc + x, initial=0) +# t = 1 + +Pipeline([1, 2, 3, 4]).apply(t).each(lambda x: print(x)) diff --git a/efemel/pipeline/transformers/transformer.py b/efemel/pipeline/transformers/transformer.py new file mode 100644 index 0000000..f2abdf3 --- /dev/null +++ b/efemel/pipeline/transformers/transformer.py @@ -0,0 +1,132 @@ +from collections.abc import Callable +from collections.abc import Iterable +from collections.abc import Iterator +from functools import reduce +import inspect +import itertools +from typing import Any +from typing import Union +from typing import overload + +# --- Type Aliases --- +type PipelineFunction[Out, T] = Callable[[Out], T] | Callable[[Out, PipelineContext], T] +type PipelineReduceFunction[U, Out] = Callable[[U, Out], U] | Callable[[U, Out, PipelineContext], U] + + +class PipelineContext(dict): + """Global context available to all pipeline operations.""" + + pass + + +class Transformer[In, Out]: + """ + Defines and composes data transformations (e.g., map, filter). + + Transformers are callable and return a generator, applying the composed + transformation logic to an iterable data source. 
+ """ + + def __init__( + self, + chunk_size: int = 1000, + context: PipelineContext | None = None, + ): + self.chunk_size = chunk_size + self.context = context or PipelineContext() + self.transformer = lambda chunk: chunk + + @classmethod + def init[T](cls, _type_hint: type[T], chunk_size: int = 1000) -> "Transformer[T, T]": + """Create a new identity pipeline with explicit type hint.""" + return cls(chunk_size) # type: ignore + + def __call__(self, data: Iterable[In]) -> Iterator[Out]: + """ + Executes the transformer on a data source. + """ + for chunk in self._chunk_generator(data): + yield from self.transformer(chunk) + + def _chunk_generator(self, data: Iterable[In]) -> Iterator[list[In]]: + """Breaks an iterable into chunks of a specified size.""" + data_iter = iter(data) + while chunk := list(itertools.islice(data_iter, self.chunk_size)): + yield chunk + + def _pipe[U](self, operation: Callable[[list[Out]], list[U]]) -> "Transformer[In, U]": + """Composes the current transformer with a new chunk-wise operation.""" + prev_transformer = self.transformer + self.transformer = lambda chunk: operation(prev_transformer(chunk)) + return self # type: ignore + + def _create_context_aware_function(self, func: Callable) -> Callable: + """Creates a function that correctly handles the context parameter.""" + try: + sig = inspect.signature(func) + params = [p for p in sig.parameters.values() if p.kind in (p.POSITIONAL_ONLY, p.POSITIONAL_OR_KEYWORD)] + if len(params) >= 2: + return lambda value: func(value, self.context) + except (ValueError, TypeError): + pass + return func + + def _create_reduce_function(self, func: Callable) -> Callable: + """Creates a reduce function that correctly handles the context parameter.""" + try: + sig = inspect.signature(func) + params = [p for p in sig.parameters.values() if p.kind in (p.POSITIONAL_ONLY, p.POSITIONAL_OR_KEYWORD)] + if len(params) >= 3: + return lambda acc, value: func(acc, value, self.context) + except (ValueError, TypeError): + pass + return func + + def map[U](self, function: PipelineFunction[Out, U]) -> "Transformer[In, U]": + """Transforms elements in the pipeline.""" + std_function = self._create_context_aware_function(function) + return self._pipe(lambda chunk: [std_function(x) for x in chunk]) + + def filter(self, predicate: PipelineFunction[Out, bool]) -> "Transformer[In, Out]": + """Filters elements in the pipeline.""" + std_predicate = self._create_context_aware_function(predicate) + return self._pipe(lambda chunk: [x for x in chunk if std_predicate(x)]) + + @overload + def flatten[T](self: "Transformer[In, list[T]]") -> "Transformer[In, T]": ... + + @overload + def flatten[T](self: "Transformer[In, tuple[T, ...]]") -> "Transformer[In, T]": ... + + @overload + def flatten[T](self: "Transformer[In, set[T]]") -> "Transformer[In, T]": ... 
+ + def flatten[T]( + self: Union["Transformer[In, list[T]]", "Transformer[In, tuple[T, ...]]", "Transformer[In, set[T]]"], + ) -> "Transformer[In, T]": + """Flatten nested lists into a single list.""" + return self._pipe(lambda chunk: [item for sublist in chunk for item in sublist]) # type: ignore + + def tap(self, function: PipelineFunction[Out, Any]) -> "Transformer[In, Out]": + """Applies a side-effect function without modifying the data.""" + std_function = self._create_context_aware_function(function) + + def tap_operation(chunk: list[Out]) -> list[Out]: + for item in chunk: + std_function(item) + return chunk + + return self._pipe(tap_operation) + + def reduce[U](self, function: PipelineReduceFunction, initial: U): + """Reduces elements to a single value (terminal operation).""" + reducer = self._create_reduce_function(function) + + def _reduce(data: Iterable[Out]) -> Iterator[U]: + yield reduce(reducer, data, initial) + + return _reduce + + def apply[T](self, t: "Transformer[Out, T]") -> "Transformer[In, T]": + """Apply another pipeline to the current one.""" + return self._pipe(lambda chunk: t.transformer(chunk)) diff --git a/efemel/pipeline.py b/efemel/pipeline_old.py similarity index 100% rename from efemel/pipeline.py rename to efemel/pipeline_old.py diff --git a/tests/test_pipeline_concurrency.py b/tests/test_pipeline_concurrency.py deleted file mode 100644 index 39002d6..0000000 --- a/tests/test_pipeline_concurrency.py +++ /dev/null @@ -1,134 +0,0 @@ -""" -Test Pipeline concurrency functionality. - -This module contains tests for Pipeline concurrent operations -including basic concurrency and context isolation. -""" - -import pytest - -from efemel.pipeline import CompoundError -from efemel.pipeline import Pipeline - - -class TestPipelineConcurrency: - """Test Pipeline concurrency functionality.""" - - def test_concurrent(self): - # Create a simple pipeline - - def _double_pipeline(p: Pipeline[int]): - return p.map(lambda x: x * 2) - - result = Pipeline(range(0, 5), 1).concurrent(_double_pipeline, 5).to_list() - - # Check if all numbers are doubled - assert result == [0, 2, 4, 6, 8] - - def test_concurrent_consumer(self): - # Create a simple pipeline - - items = [] - - def dummy_consumer(p: Pipeline[int]): - return p.tap(lambda x: items.append(x)) - - Pipeline(range(0, 5), 1).concurrent(dummy_consumer, 5).noop() - - # Check if all numbers are consumed - assert items == [0, 1, 2, 3, 4] - - def test_concurrent_context_isolation(self): - """Test that concurrent processing properly isolates and merges contexts.""" - - # Create a pipeline with some data - data = list(range(100)) # Large enough to ensure multiple chunks - pipeline = Pipeline(data, chunk_size=10) # Force multiple chunks - - def error_prone_pipeline(p): - """A pipeline that will cause errors on certain values.""" - return p.map(lambda x: x if x % 7 != 0 else 1 / 0) # Error on multiples of 7 - - def safe_pipeline(p): - """A pipeline that should work without errors.""" - return p.map(lambda x: x * 2) - - # Test with error-prone pipeline - with pytest.raises(CompoundError): - pipeline.concurrent(error_prone_pipeline, max_workers=4).to_list() - - # Check that the context properly recorded errors - assert pipeline.context["has_errors"] - - # Test with safe pipeline - safe_data = list(range(50)) - safe_pipeline_obj = Pipeline(safe_data, chunk_size=5) - - result = safe_pipeline_obj.concurrent(safe_pipeline, max_workers=4).to_list() - expected = [x * 2 for x in safe_data] - - assert result == expected - assert not 
safe_pipeline_obj.context["has_errors"] - - def test_concurrent_vs_sequential(self): - """Test that concurrent and sequential processing produce the same results.""" - data = list(range(20)) - - def transform_pipeline(p): - return p.map(lambda x: x * 3).filter(lambda x: x % 2 == 0) - - # Sequential - sequential_result = Pipeline(data).apply(transform_pipeline).to_list() - - # Concurrent - concurrent_result = Pipeline(data, chunk_size=5).concurrent(transform_pipeline, max_workers=3).to_list() - - assert sequential_result == concurrent_result - - def test_concurrent_with_errors_maintains_order(self): - """Test that concurrent processing maintains order even with errors.""" - data = list(range(20)) - - def error_prone_pipeline(p): - """Creates errors on multiples of 5.""" - return p.map(lambda x: x if x % 5 != 0 else 1 / 0) - - # Test ordered processing - pipeline = Pipeline(data, chunk_size=3) - - with pytest.raises(CompoundError) as exc_info: - pipeline.concurrent(error_prone_pipeline, max_workers=2, ordered=True).to_list() - - # Should have errors for multiples of 5: 0, 5, 10, 15 - assert len(exc_info.value.errors) == 4 - - def test_concurrent_unordered(self): - """Test unordered concurrent processing.""" - data = list(range(20)) - - def simple_transform(p): - return p.map(lambda x: x * 2) - - # Unordered processing - pipeline = Pipeline(data, chunk_size=4) - result = pipeline.concurrent(simple_transform, max_workers=3, ordered=False).to_list() - - # Results should be correct but may be out of order - expected = [x * 2 for x in data] - assert sorted(result) == sorted(expected) - - def test_concurrent_context_thread_safety(self): - """Test that context updates are thread-safe during concurrent processing.""" - data = list(range(100)) - - def context_aware_pipeline(p): - """Pipeline that interacts with context.""" - return p.map(lambda x, ctx: x * 2 if not ctx["has_errors"] else x) - - pipeline = Pipeline(data, chunk_size=10) - result = pipeline.concurrent(context_aware_pipeline, max_workers=4).to_list() - - # Should get consistent results despite concurrent access to context - expected = [x * 2 for x in data] - assert result == expected - assert not pipeline.context["has_errors"] diff --git a/tests/test_pipeline.py b/tests_old/test_pipeline.py similarity index 100% rename from tests/test_pipeline.py rename to tests_old/test_pipeline.py diff --git a/tests/test_pipeline_basics.py b/tests_old/test_pipeline_basics.py similarity index 100% rename from tests/test_pipeline_basics.py rename to tests_old/test_pipeline_basics.py diff --git a/tests/test_pipeline_composition.py b/tests_old/test_pipeline_composition.py similarity index 100% rename from tests/test_pipeline_composition.py rename to tests_old/test_pipeline_composition.py diff --git a/tests_old/test_pipeline_concurrency.py b/tests_old/test_pipeline_concurrency.py new file mode 100644 index 0000000..6a6d525 --- /dev/null +++ b/tests_old/test_pipeline_concurrency.py @@ -0,0 +1,87 @@ +""" +Test Pipeline concurrency functionality. + +This module contains tests for Pipeline concurrent operations +including basic concurrency and context isolation. 
+""" + +from efemel.pipeline import Pipeline + + +class TestPipelineConcurrency: + """Test Pipeline concurrency functionality.""" + + def test_concurrent(self): + # Create a simple pipeline + + def _double_pipeline(p): + return p.map(lambda x: x * 2) + + pipeline = Pipeline(chunk_size=1) + result = list(pipeline.concurrent(range(0, 5), pipeline=_double_pipeline, max_workers=5)) + + # Check if all numbers are doubled + assert result == [0, 2, 4, 6, 8] + + def test_concurrent_consumer(self): + # Create a simple pipeline + + items = [] + + def dummy_consumer(p): + return p.tap(lambda x: items.append(x)) + + pipeline = Pipeline(chunk_size=1) + list(pipeline.concurrent(range(0, 5), pipeline=dummy_consumer, max_workers=5)) + + # Check if all numbers are consumed + assert items == [0, 1, 2, 3, 4] + + def test_concurrent_vs_sequential(self): + """Test that concurrent and sequential processing produce the same results.""" + data = list(range(20)) + + def transform_pipeline(p): + return p.map(lambda x: x * 3).filter(lambda x: x % 2 == 0) + + # Sequential + sequential_pipeline = Pipeline().apply(transform_pipeline) + sequential_result = sequential_pipeline.to_list(data) + + # Concurrent + concurrent_pipeline = Pipeline(chunk_size=5) + concurrent_result = list(concurrent_pipeline.concurrent(data, pipeline=transform_pipeline, max_workers=3)) + + assert sequential_result == concurrent_result + + def test_concurrent_unordered(self): + """Test unordered concurrent processing.""" + data = list(range(20)) + + def simple_transform(p): + return p.map(lambda x: x * 2) + + # Unordered processing + pipeline = Pipeline(chunk_size=4) + result = list(pipeline.concurrent(data, pipeline=simple_transform, max_workers=3, ordered=False)) + + # Results should be correct but may be out of order + expected = [x * 2 for x in data] + assert sorted(result) == sorted(expected) + + def test_concurrent_context_thread_safety(self): + """Test that context updates are thread-safe during concurrent processing.""" + data = list(range(100)) + + def context_aware_pipeline(p): + """Pipeline that interacts with context.""" + return p.map(lambda x, ctx: x * 2 if not ctx.get("has_errors", False) else x) + + pipeline = Pipeline(chunk_size=10) + pipeline.context["has_errors"] = False + result = list(pipeline.concurrent(data, pipeline=context_aware_pipeline, max_workers=4)) + + # Should get consistent results despite concurrent access to context + expected = [x * 2 for x in data] + assert result == expected + assert not pipeline.context.get("has_errors", False) diff --git a/tests/test_pipeline_context.py b/tests_old/test_pipeline_context.py similarity index 64% rename from tests/test_pipeline_context.py rename to tests_old/test_pipeline_context.py index 2c85be8..077b39f 100644 --- a/tests/test_pipeline_context.py +++ b/tests_old/test_pipeline_context.py @@ -5,9 +5,6 @@ including context propagation and sharing between operations. 
""" -import pytest - -from efemel.pipeline import CompoundError from efemel.pipeline import Pipeline @@ -48,37 +45,6 @@ def set_context_once(value, context): # Verify the function was called for each item assert context_access_count == 5 - def test_concurrent_context_isolation(self): - """Test that concurrent processing properly isolates and merges contexts.""" - # Create a pipeline with some data - data = list(range(100)) # Large enough to ensure multiple chunks - pipeline = Pipeline(data, chunk_size=10) # Force multiple chunks - - def error_prone_pipeline(p): - """A pipeline that will cause errors on certain values.""" - return p.map(lambda x: x if x % 7 != 0 else 1 / 0) # Error on multiples of 7 - - def safe_pipeline(p): - """A pipeline that should work without errors.""" - return p.map(lambda x: x * 2) - - # Test with error-prone pipeline - with pytest.raises(CompoundError): - pipeline.concurrent(error_prone_pipeline, max_workers=4).to_list() - - # Check that the context properly recorded errors - assert pipeline.context["has_errors"] - - # Test with safe pipeline - safe_data = list(range(50)) - safe_pipeline_obj = Pipeline(safe_data, chunk_size=5) - - result = safe_pipeline_obj.concurrent(safe_pipeline, max_workers=4).to_list() - expected = [x * 2 for x in safe_data] - - assert result == expected - assert not safe_pipeline_obj.context["has_errors"] - def test_concurrent_vs_sequential_context(self): """Test that concurrent and sequential processing produce the same results.""" data = list(range(20)) @@ -87,10 +53,10 @@ def transform_pipeline(p): return p.map(lambda x: x * 3).filter(lambda x: x % 2 == 0) # Sequential - sequential_result = Pipeline(data).apply(transform_pipeline).to_list() + sequential_result = Pipeline().apply(transform_pipeline).to_list(data) # Concurrent - concurrent_result = Pipeline(data, chunk_size=5).concurrent(transform_pipeline, max_workers=3).to_list() + concurrent_result = list(Pipeline(5).concurrent(data, pipeline=transform_pipeline, max_workers=3)) assert sequential_result == concurrent_result @@ -110,8 +76,8 @@ def modify_context(value, context): return p.map(modify_context) - pipeline = Pipeline(data, chunk_size=5) - result = pipeline.concurrent(context_modifying_pipeline, max_workers=2).to_list() + pipeline = Pipeline(5) + result = list(pipeline.concurrent(data, pipeline=context_modifying_pipeline, max_workers=2)) # Verify results are correct expected = [x * 2 for x in data] diff --git a/tests/test_pipeline_each.py b/tests_old/test_pipeline_each.py similarity index 100% rename from tests/test_pipeline_each.py rename to tests_old/test_pipeline_each.py diff --git a/tests/test_pipeline_edge_cases.py b/tests_old/test_pipeline_edge_cases.py similarity index 100% rename from tests/test_pipeline_edge_cases.py rename to tests_old/test_pipeline_edge_cases.py diff --git a/tests/test_pipeline_filtering.py b/tests_old/test_pipeline_filtering.py similarity index 100% rename from tests/test_pipeline_filtering.py rename to tests_old/test_pipeline_filtering.py diff --git a/tests/test_pipeline_integration.py b/tests_old/test_pipeline_integration.py similarity index 100% rename from tests/test_pipeline_integration.py rename to tests_old/test_pipeline_integration.py diff --git a/tests/test_pipeline_map_filter.py b/tests_old/test_pipeline_map_filter.py similarity index 100% rename from tests/test_pipeline_map_filter.py rename to tests_old/test_pipeline_map_filter.py diff --git a/tests/test_pipeline_mapping.py b/tests_old/test_pipeline_mapping.py similarity index 100% rename 
from tests/test_pipeline_mapping.py rename to tests_old/test_pipeline_mapping.py diff --git a/tests/test_pipeline_reduce.py b/tests_old/test_pipeline_reduce.py similarity index 100% rename from tests/test_pipeline_reduce.py rename to tests_old/test_pipeline_reduce.py diff --git a/tests/test_pipeline_tap.py b/tests_old/test_pipeline_tap.py similarity index 100% rename from tests/test_pipeline_tap.py rename to tests_old/test_pipeline_tap.py diff --git a/tests/test_pipeline_utility.py b/tests_old/test_pipeline_utility.py similarity index 100% rename from tests/test_pipeline_utility.py rename to tests_old/test_pipeline_utility.py From 4a3f5da58357d67d71984e201fa9d65c2a2451b4 Mon Sep 17 00:00:00 2001 From: ringoldsdev Date: Mon, 14 Jul 2025 22:13:34 +0000 Subject: [PATCH 14/17] fix: reduce typing --- efemel/pipeline/transformers/transformer.py | 33 ++++++++++++--------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/efemel/pipeline/transformers/transformer.py b/efemel/pipeline/transformers/transformer.py index f2abdf3..0e02a6a 100644 --- a/efemel/pipeline/transformers/transformer.py +++ b/efemel/pipeline/transformers/transformer.py @@ -41,13 +41,6 @@ def init[T](cls, _type_hint: type[T], chunk_size: int = 1000) -> "Transformer[T, """Create a new identity pipeline with explicit type hint.""" return cls(chunk_size) # type: ignore - def __call__(self, data: Iterable[In]) -> Iterator[Out]: - """ - Executes the transformer on a data source. - """ - for chunk in self._chunk_generator(data): - yield from self.transformer(chunk) - def _chunk_generator(self, data: Iterable[In]) -> Iterator[list[In]]: """Breaks an iterable into chunks of a specified size.""" data_iter = iter(data) @@ -118,15 +111,27 @@ def tap_operation(chunk: list[Out]) -> list[Out]: return self._pipe(tap_operation) - def reduce[U](self, function: PipelineReduceFunction, initial: U): + def apply[T](self, t: "Transformer[Out, T]") -> "Transformer[In, T]": + """Apply another pipeline to the current one.""" + return self._pipe(lambda chunk: t.transformer(chunk)) + + # Terminal operations + # These operations execute the transformer on a data source and yield results. + # If you want to operate on the results, you need to use a Pipeline and apply + # a different transformer to it. + + def __call__(self, data: Iterable[In]) -> Iterator[Out]: + """ + Executes the transformer on a data source (terminal operations). 
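
For orientation (illustrative, not part of the patch): with this commit, `__call__` and `reduce` become terminal operations on `Transformer`, and the retyped `reduce` just below returns a callable that first runs the transformer over the raw input and then folds the results. A minimal usage sketch, consistent with the tests added later in this series:

    doubler = Transformer.init(int).map(lambda x: x * 2)
    total = doubler.reduce(lambda acc, x: acc + x, initial=0)
    assert list(total([1, 2, 3])) == [12]  # chunks become [2, 4, 6], then folded with +
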
+ """ + for chunk in self._chunk_generator(data): + yield from self.transformer(chunk) + + def reduce[U](self, function: PipelineReduceFunction[Out, U], initial: U): """Reduces elements to a single value (terminal operation).""" reducer = self._create_reduce_function(function) - def _reduce(data: Iterable[Out]) -> Iterator[U]: - yield reduce(reducer, data, initial) + def _reduce(data: Iterable[In]) -> Iterator[U]: + yield reduce(reducer, self(data), initial) return _reduce - - def apply[T](self, t: "Transformer[Out, T]") -> "Transformer[In, T]": - """Apply another pipeline to the current one.""" - return self._pipe(lambda chunk: t.transformer(chunk)) From 4212174183e52cc923a4f2e55d521d6dd9d35f37 Mon Sep 17 00:00:00 2001 From: ringoldsdev Date: Mon, 14 Jul 2025 22:24:58 +0000 Subject: [PATCH 15/17] chore: attempt to implement transform method --- efemel/pipeline/pipeline.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/efemel/pipeline/pipeline.py b/efemel/pipeline/pipeline.py index 641042c..9cf10e7 100644 --- a/efemel/pipeline/pipeline.py +++ b/efemel/pipeline/pipeline.py @@ -38,6 +38,22 @@ def apply[U](self, transformer: Transformer[T, U] | Callable[[Iterable[T]], Iter # We must cast self to the new type. return self # type: ignore + def transform[U](self, transformer: Callable[[Transformer[T, T]], Transformer[T, U]]) -> "Pipeline[U]": + """ + Shorthand method to apply a transformation using a lambda function. + Creates a Transformer under the hood and applies it to the pipeline. + + Args: + function: A callable that transforms each element from type T to type U + + Returns: + A new Pipeline with the transformed data + """ + # Create a new transformer with identity and apply the map operation + # We create a basic transformer and immediately map with the function + + return self.apply(Transformer[T, T]().apply(transformer)) + def __iter__(self) -> Iterator[T]: """Allows the pipeline to be iterated over.""" yield from self.processed_data @@ -69,4 +85,8 @@ def consume(self) -> None: t = Transformer.init(int).map(lambda x: x * 2).reduce(lambda acc, x: acc + x, initial=0) # t = 1 +# Example using the original apply method with a full transformer Pipeline([1, 2, 3, 4]).apply(t).each(lambda x: print(x)) + +# Example using the new shorthand transform method +Pipeline([1, 2, 3, 4]).transform(lambda x: x.map(lambda y: y * 2)).each(lambda x: print(x)) From 30b2cc24d93d7d1b2d6a68efaae10631580a2b56 Mon Sep 17 00:00:00 2001 From: ringoldsdev Date: Tue, 15 Jul 2025 07:05:47 +0000 Subject: [PATCH 16/17] chore: pipeline tests and fix apply functions --- efemel/pipeline/pipeline.py | 20 +- efemel/pipeline/transformers/transformer.py | 5 +- efemel/pipeline_old.py | 263 -------------------- tests/test_integration.py | 221 ++++++++++++++++ tests/test_pipeline.py | 242 ++++++++++++++++++ tests/test_transformer.py | 211 ++++++++++++++++ tests_old/test_pipeline.py | 0 tests_old/test_pipeline_basics.py | 83 ------ tests_old/test_pipeline_composition.py | 113 --------- tests_old/test_pipeline_concurrency.py | 87 ------- tests_old/test_pipeline_context.py | 92 ------- tests_old/test_pipeline_each.py | 49 ---- tests_old/test_pipeline_edge_cases.py | 77 ------ tests_old/test_pipeline_filtering.py | 56 ----- tests_old/test_pipeline_integration.py | 79 ------ tests_old/test_pipeline_map_filter.py | 41 --- tests_old/test_pipeline_mapping.py | 58 ----- tests_old/test_pipeline_reduce.py | 54 ---- tests_old/test_pipeline_tap.py | 55 ---- tests_old/test_pipeline_utility.py | 88 ------- 20 files 
changed, 687 insertions(+), 1207 deletions(-) delete mode 100644 efemel/pipeline_old.py create mode 100644 tests/test_integration.py create mode 100644 tests/test_pipeline.py create mode 100644 tests/test_transformer.py delete mode 100644 tests_old/test_pipeline.py delete mode 100644 tests_old/test_pipeline_basics.py delete mode 100644 tests_old/test_pipeline_composition.py delete mode 100644 tests_old/test_pipeline_concurrency.py delete mode 100644 tests_old/test_pipeline_context.py delete mode 100644 tests_old/test_pipeline_each.py delete mode 100644 tests_old/test_pipeline_edge_cases.py delete mode 100644 tests_old/test_pipeline_filtering.py delete mode 100644 tests_old/test_pipeline_integration.py delete mode 100644 tests_old/test_pipeline_map_filter.py delete mode 100644 tests_old/test_pipeline_mapping.py delete mode 100644 tests_old/test_pipeline_reduce.py delete mode 100644 tests_old/test_pipeline_tap.py delete mode 100644 tests_old/test_pipeline_utility.py diff --git a/efemel/pipeline/pipeline.py b/efemel/pipeline/pipeline.py index 9cf10e7..e5e1e2f 100644 --- a/efemel/pipeline/pipeline.py +++ b/efemel/pipeline/pipeline.py @@ -26,33 +26,33 @@ class Pipeline[T]: """ def __init__(self, *data: Iterable[T]): - self.data_source: Iterable[T] = itertools.chain.from_iterable(data) + self.data_source: Iterable[T] = itertools.chain.from_iterable(data) if len(data) > 1 else data[0] self.processed_data: Iterator = iter(self.data_source) def apply[U](self, transformer: Transformer[T, U] | Callable[[Iterable[T]], Iterator[U]]) -> "Pipeline[U]": """ Applies a transformer to the current data source. """ - # The transformer is called with the current data, producing a new iterator - self.processed_data = transformer(self.data_source) - # We must cast self to the new type. + # The transformer is called with the current processed data, producing a new iterator + new_data = transformer(self.processed_data) + # Create a new pipeline with the transformed data + self.processed_data = new_data return self # type: ignore - def transform[U](self, transformer: Callable[[Transformer[T, T]], Transformer[T, U]]) -> "Pipeline[U]": + def transform[U](self, t: Callable[[Transformer[T, T]], Transformer[T, U]]) -> "Pipeline[U]": """ Shorthand method to apply a transformation using a lambda function. Creates a Transformer under the hood and applies it to the pipeline. 
Args: - function: A callable that transforms each element from type T to type U + t: A callable that takes a transformer and returns a transformed transformer Returns: A new Pipeline with the transformed data """ - # Create a new transformer with identity and apply the map operation - # We create a basic transformer and immediately map with the function - - return self.apply(Transformer[T, T]().apply(transformer)) + # Create a new transformer and apply the transformation function + transformer = t(Transformer[T, T]()) + return self.apply(transformer) def __iter__(self) -> Iterator[T]: """Allows the pipeline to be iterated over.""" diff --git a/efemel/pipeline/transformers/transformer.py b/efemel/pipeline/transformers/transformer.py index 0e02a6a..6076587 100644 --- a/efemel/pipeline/transformers/transformer.py +++ b/efemel/pipeline/transformers/transformer.py @@ -5,6 +5,7 @@ import inspect import itertools from typing import Any +from typing import Self from typing import Union from typing import overload @@ -111,9 +112,9 @@ def tap_operation(chunk: list[Out]) -> list[Out]: return self._pipe(tap_operation) - def apply[T](self, t: "Transformer[Out, T]") -> "Transformer[In, T]": + def apply[T](self, t: Callable[[Self], "Transformer[In, T]"]) -> "Transformer[In, T]": """Apply another pipeline to the current one.""" - return self._pipe(lambda chunk: t.transformer(chunk)) + return t(self) # Terminal operations # These operations execute the transformer on a data source and yield results. diff --git a/efemel/pipeline_old.py b/efemel/pipeline_old.py deleted file mode 100644 index 6aa910b..0000000 --- a/efemel/pipeline_old.py +++ /dev/null @@ -1,263 +0,0 @@ -from collections import deque -from collections.abc import Callable -from collections.abc import Iterable -from collections.abc import Iterator -from concurrent.futures import FIRST_COMPLETED -from concurrent.futures import ProcessPoolExecutor -from concurrent.futures import wait -from functools import reduce -import inspect -import itertools -from typing import Any -from typing import Union -from typing import overload - -# Type aliases for pipeline functions -type PipelineFunction[Out, T] = Callable[[Out], T] | Callable[[Out, PipelineContext], T] -type PipelineReduceFunction[U, Out] = Callable[[U, Out], U] | Callable[[U, Out, PipelineContext], U] - - -class PipelineContext(dict): - """Global context available to all pipeline operations.""" - - pass - - -class Pipeline[In, Out]: - """ - A functional pipeline for data processing with internal chunked processing. - - The Pipeline class provides a fluent interface for applying transformations - and filters. It processes data in chunks for performance and maintains a - global context for shared state. The pipeline is callable and iterable. 
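
For orientation (illustrative, not part of the patch): after the changes above, `Pipeline.apply` consumes the already-processed iterator, `Pipeline.transform` builds an identity `Transformer`, passes it to the supplied builder callable and applies the result, and `Transformer.apply` likewise takes a builder callable. A minimal sketch mirroring the tests added in this commit:

    doubler = Transformer.init(int).map(lambda x: x * 2)

    a = Pipeline([1, 2, 3]).apply(doubler).to_list()
    b = Pipeline([1, 2, 3]).transform(lambda t: t.map(lambda x: x * 2)).to_list()
    assert a == b == [2, 4, 6]

    # Transformer.apply now composes via a builder callable as well:
    bigger = doubler.apply(lambda t: t.filter(lambda x: x > 4))
    assert list(bigger([1, 2, 3, 4])) == [6, 8]
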
- """ - - def __init__( - self, - chunk_size: int = 1000, - transformer: Callable[[list[In]], list[Out]] | None = None, - context: PipelineContext | None = None, - ) -> None: - self.chunk_size = chunk_size - self.context = context or PipelineContext() - self.transformer: Callable[[list[In]], list[Out]] = transformer or (lambda chunk: chunk) - - @classmethod - def init[T](cls, _type_hint: type[T], chunk_size: int = 1000) -> "Pipeline[T, T]": - """Create a new identity pipeline with explicit type hint.""" - return cls(chunk_size) - - def __call__(self, *data: Iterable[In]) -> Iterator[Out]: - """Execute the pipeline on data source (makes pipeline callable).""" - assert data, "Data source must be provided to execute the pipeline" - - for selected_data in data: - for chunk in self._chunk_generator(selected_data): - yield from self.transformer(chunk) - - def pipe[U](self, operation: Callable[[list[Out]], list[U]]) -> "Pipeline[In, U]": - """ - Composes the current transformer with a new chunk-wise operation. - The operation should be a function that takes a list and returns a list. - """ - prev_transformer = self.transformer - self.transformer = lambda chunk: operation(prev_transformer(chunk)) - return self - - def _create_context_aware_function(self, func: Callable): - """Create function that properly handles context parameter""" - try: - sig = inspect.signature(func) - params = [p for p in sig.parameters.values() if p.kind in (p.POSITIONAL_ONLY, p.POSITIONAL_OR_KEYWORD)] - param_count = len(params) - - if param_count >= 2: - return lambda value: func(value, self.context) - return func - except (ValueError, TypeError): - return func - - def _create_reduce_function(self, func: Callable): - """Create function that properly handles context parameter for reduce operations""" - try: - sig = inspect.signature(func) - params = [p for p in sig.parameters.values() if p.kind in (p.POSITIONAL_ONLY, p.POSITIONAL_OR_KEYWORD)] - param_count = len(params) - - if param_count >= 3: - return lambda acc, value: func(acc, value, self.context) - return func - except (ValueError, TypeError): - return func - - def map[U](self, function: PipelineFunction[Out, U]) -> "Pipeline[In, U]": - """Transform elements in the pipeline.""" - std_function = self._create_context_aware_function(function) - return self.pipe(lambda chunk: [std_function(x) for x in chunk]) - - def filter(self, predicate: PipelineFunction[Out, bool]) -> "Pipeline[In, Out]": - """Filter elements in the pipeline.""" - std_predicate = self._create_context_aware_function(predicate) - return self.pipe(lambda chunk: [x for x in chunk if std_predicate(x)]) - - def _chunk_generator(self, data: Iterable[In]) -> Iterator[list[In]]: - """Generate chunks from the data source""" - data_iter = iter(data) - while chunk := list(itertools.islice(data_iter, self.chunk_size)): - yield chunk - - def to_list(self, *data: Iterable[In]) -> list[Out]: - """Execute pipeline and return results as list.""" - return [item for d in data for chunk in self._chunk_generator(d) for item in self.transformer(chunk)] - - def each(self, *data: Iterable[In], function: PipelineFunction[Out, Any]) -> None: - """Apply function to each element (terminal operation).""" - std_function = self._create_context_aware_function(function) - - for chunk in self._chunk_generator(itertools.chain.from_iterable(data)): - [std_function(item) for item in self.transformer(chunk)] - - def tap(self, function: PipelineFunction[Out, Any]) -> "Pipeline[In, Out]": - """Apply side-effect without modifying data.""" - 
std_function = self._create_context_aware_function(function) - - def tap_operation(chunk: list[Out]) -> list[Out]: - for item in chunk: - std_function(item) - return chunk - - return self.pipe(tap_operation) - - def consume(self, *data: Iterable[In]) -> None: - """Consume the pipeline without returning results (terminal operation).""" - for _ in self(*data): - pass - - @overload - def flatten(self: "Pipeline[list[In]]") -> "Pipeline[In]": ... - - @overload - def flatten(self: "Pipeline[tuple[In, ...]]") -> "Pipeline[In]": ... - - @overload - def flatten(self: "Pipeline[set[In]]") -> "Pipeline[In]": ... - - def flatten( - self: Union["Pipeline[list[In], In]", "Pipeline[tuple[In, ...]]", "Pipeline[set[In]]"], - ) -> "Pipeline[In, In]": - """Flatten nested lists into a single list.""" - return self.pipe(lambda chunk: [item for sublist in chunk for item in sublist]) - - def apply[T](self, pipeline: "Pipeline[Out, T]") -> "Pipeline[In, T]": - """Apply another pipeline to the current one.""" - return self.pipe(lambda chunk: pipeline.transformer(chunk)) - - def first(self, *data: Iterable[In], n: int = 1) -> list[Out]: - """Get the first n elements of the pipeline (terminal operation).""" - assert n >= 1, "n must be at least 1" - data_iter = itertools.chain.from_iterable(data) - result = [] - remaining = n - - while remaining > 0: - selected_item_count = min(self.chunk_size, remaining) - - # Read a chunk of size min(chunk_size, remaining) - chunk = list(itertools.islice(data_iter, selected_item_count)) - - if not chunk: - break # No more data - - result.extend(self.transformer(chunk)) - - remaining -= selected_item_count - - return result - - def reduce[U](self, *data: Iterable[In], function: PipelineReduceFunction[U, Out], initial: U) -> U: - """ - Reduce elements to a single value (terminal operation). - - Args: - function: Reduce function (acc, value) -> new_acc - initial: Initial accumulator value - """ - reducer = self._create_reduce_function(function) - return reduce(reducer, self(*data), initial) - - def concurrent[T]( - self, - *data: Iterable[In], - pipeline: "Pipeline[Out,T]", - max_workers: int, - ordered: bool = True, - ) -> Iterator[T]: - """ - Applies a sub-pipeline to each chunk in parallel (Terminal Operation). - - This method consumes the pipeline and processes chunks of data concurrently, - returning an iterator for the results. It is a terminal operation. - - To manage memory effectively, it uses a just-in-time approach, only - pulling chunks from the source iterator when a worker thread is available. - - Args: - *data: The input data iterable(s) to process. - pipeline: A `Pipeline` instance to apply to each chunk. A deep - copy is used for each thread to ensure context isolation. - max_workers: The maximum number of worker threads. - ordered: If True, results are yielded in the original order. - If False, results are yielded as they complete. - - Returns: - An iterator that yields the processed items. 
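
The removed implementation that follows keeps at most max_workers + 1 chunk futures in flight and yields results in submission order, pulling a new chunk only when one completes. A generic sketch of that bounded look-ahead pattern (illustrative, not part of the patch; a thread pool is used here for brevity, whereas the removed code uses ProcessPoolExecutor):

    import itertools
    from collections import deque
    from concurrent.futures import ThreadPoolExecutor

    def ordered_map_chunks(chunks, worker, max_workers):
        # Yield worker(chunk) results in input order with a bounded number of pending futures.
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            chunk_iter = iter(chunks)
            futures = deque(
                executor.submit(worker, chunk)
                for chunk in itertools.islice(chunk_iter, max_workers + 1)
            )
            while futures:
                yield futures.popleft().result()
                for chunk in itertools.islice(chunk_iter, 1):
                    futures.append(executor.submit(worker, chunk))
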
- """ - - def process_chunk(chunk: list[Out]) -> list[T]: - return pipeline.to_list(chunk) - - def _ordered_generator(chunks_iter: Iterator[list[Out]], executor: ProcessPoolExecutor) -> Iterator[list[T]]: - futures = deque() - for _ in range(max_workers + 1): - try: - chunk = next(chunks_iter) - futures.append(executor.submit(process_chunk, chunk)) - except StopIteration: - break - - while futures: - yield futures.popleft().result() - try: - chunk = next(chunks_iter) - futures.append(executor.submit(process_chunk, chunk)) - except StopIteration: - continue - - def _unordered_generator(chunks_iter: Iterator[list[Out]], executor: ProcessPoolExecutor) -> Iterator[list[T]]: - futures = {executor.submit(process_chunk, chunk) for chunk in itertools.islice(chunks_iter, max_workers + 1)} - - while futures: - done, futures = wait(futures, return_when=FIRST_COMPLETED) - for future in done: - yield future.result() - try: - chunk = next(chunks_iter) - futures.add(executor.submit(process_chunk, chunk)) - except StopIteration: - continue - - # This wrapper generator ensures the ProcessPoolExecutor is properly - # managed and closed only after the result iterator is exhausted. - def result_iterator_manager() -> Iterator[T]: - with ProcessPoolExecutor(max_workers=max_workers) as executor: - # Create the stream of chunks to be processed concurrently. - chunks_to_process = (self.transformer(chunk) for d in data for chunk in self._chunk_generator(d)) - - gen_func = _ordered_generator if ordered else _unordered_generator - processed_chunks_iterator = gen_func(chunks_to_process, executor) - - # Flatten the iterator of chunks into an iterator of items. - for result_chunk in processed_chunks_iterator: - yield from result_chunk - - return result_iterator_manager() diff --git a/tests/test_integration.py b/tests/test_integration.py new file mode 100644 index 0000000..1cbe1fc --- /dev/null +++ b/tests/test_integration.py @@ -0,0 +1,221 @@ +"""Integration tests for Pipeline and Transformer working together.""" + +from efemel.pipeline.pipeline import Pipeline +from efemel.pipeline.transformers.transformer import PipelineContext +from efemel.pipeline.transformers.transformer import Transformer + + +class TestPipelineTransformerIntegration: + """Test Pipeline and Transformer integration.""" + + def test_basic_integration(self): + """Test basic pipeline and transformer integration.""" + # Create a transformer that doubles and filters + transformer = Transformer.init(int).map(lambda x: x * 2).filter(lambda x: x > 5) + + # Apply to pipeline + result = Pipeline([1, 2, 3, 4, 5]).apply(transformer).to_list() + assert result == [6, 8, 10] + + def test_context_sharing(self): + """Test that context is properly shared in transformations.""" + context = PipelineContext({"multiplier": 3, "threshold": 5}) + + transformer = ( + Transformer(context=context).map(lambda x, ctx: x * ctx["multiplier"]).filter(lambda x, ctx: x > ctx["threshold"]) + ) + + result = Pipeline([1, 2, 3]).apply(transformer).to_list() + assert result == [6, 9] # [3, 6, 9] filtered to [6, 9] + + def test_reduce_integration(self): + """Test reduce operation with pipeline.""" + transformer = Transformer.init(int).map(lambda x: x * 2) + reducer = transformer.reduce(lambda acc, x: acc + x, initial=0) + + result = list(reducer([1, 2, 3, 4])) + assert result == [20] # [2, 4, 6, 8] summed = 20 + + def test_complex_chain_integration(self): + """Test complex chain of operations.""" + # Transform: double -> flatten -> filter -> tap + side_effects = [] + + transformer = ( + 
Transformer.init(int) + .map(lambda x: [x, x * 2]) # Create pairs + .flatten() # Flatten to single list + .filter(lambda x: x % 3 == 0) # Keep multiples of 3 + .tap(lambda x: side_effects.append(f"processed: {x}")) + ) + + result = Pipeline([1, 2, 3, 6]).apply(transformer).to_list() + + # Expected: [1,2,2,4,3,6,6,12] -> [3,6,6,12] (multiples of 3) + assert result == [3, 6, 6, 12] + assert side_effects == ["processed: 3", "processed: 6", "processed: 6", "processed: 12"] + + def test_multiple_transformer_applications(self): + """Test applying multiple transformers in sequence.""" + t1 = Transformer.init(int).map(lambda x: x * 2) + t2 = Transformer.init(int).filter(lambda x: x > 5) + t3 = Transformer.init(int).map(lambda x: x + 1) + + result = ( + Pipeline([1, 2, 3, 4, 5]) + .apply(t1) # [2, 4, 6, 8, 10] + .apply(t2) # [6, 8, 10] + .apply(t3) # [7, 9, 11] + .to_list() + ) + + assert result == [7, 9, 11] + + def test_transform_shorthand_integration(self): + """Test transform shorthand method integration.""" + result = ( + Pipeline([1, 2, 3, 4, 5]) + .transform(lambda t: t.map(lambda x: x * 3)) + .transform(lambda t: t.filter(lambda x: x > 6)) + .to_list() + ) + + assert result == [9, 12, 15] # [3, 6, 9, 12, 15] -> [9, 12, 15] + + def test_mixed_operations(self): + """Test mixing transform shorthand and apply methods.""" + transformer = Transformer.init(int).filter(lambda x: x < 10) + + result = ( + Pipeline([1, 2, 3, 4, 5]) + .transform(lambda t: t.map(lambda x: x * 2)) # [2, 4, 6, 8, 10] + .apply(transformer) # [2, 4, 6, 8] + .transform(lambda t: t.map(lambda x: x + 1)) # [3, 5, 7, 9] + .to_list() + ) + + assert result == [3, 5, 7, 9] + + +class TestPipelineDataProcessingPatterns: + """Test common data processing patterns.""" + + def test_etl_pattern(self): + """Test Extract-Transform-Load pattern.""" + # Simulate raw data extraction + raw_data = [ + {"name": "Alice", "age": 25, "salary": 50000}, + {"name": "Bob", "age": 30, "salary": 60000}, + {"name": "Charlie", "age": 35, "salary": 70000}, + {"name": "David", "age": 28, "salary": 55000}, + ] + + # Transform: extract names of people over 28 with salary > 55000 + result = ( + Pipeline(raw_data) + .transform(lambda t: t.filter(lambda x: x["age"] > 28 and x["salary"] > 55000)) + .transform(lambda t: t.map(lambda x: x["name"])) + .to_list() + ) + + assert result == ["Bob", "Charlie"] + + def test_data_aggregation_pattern(self): + """Test data aggregation pattern.""" + # Group and count pattern + data = ["apple", "banana", "apple", "cherry", "banana", "apple"] + + # Count occurrences + counts = {} + (Pipeline(data).transform(lambda t: t.tap(lambda x: counts.update({x: counts.get(x, 0) + 1}))).consume()) + + assert counts == {"apple": 3, "banana": 2, "cherry": 1} + + def test_map_reduce_pattern(self): + """Test map-reduce pattern.""" + # Map: square numbers, Reduce: sum them + transformer = Transformer.init(int).map(lambda x: x * x) # Square + + reducer = transformer.reduce(lambda acc, x: acc + x, initial=0) # Sum + + result = list(reducer([1, 2, 3, 4, 5])) + assert result == [55] # 1 + 4 + 9 + 16 + 25 = 55 + + def test_filtering_and_transformation_pattern(self): + """Test filtering and transformation pattern.""" + # Process only even numbers, double them, then sum + even_numbers = [] + transformer = ( + Transformer.init(int).filter(lambda x: x % 2 == 0).tap(lambda x: even_numbers.append(x)).map(lambda x: x * 2) + ) + + result = Pipeline(range(1, 11)).apply(transformer).to_list() + + assert even_numbers == [2, 4, 6, 8, 10] + assert result == [4, 
8, 12, 16, 20] + + def test_data_validation_pattern(self): + """Test data validation pattern.""" + # Validate and clean data + raw_data = [1, "2", 3.0, "invalid", 5, None, 7] + + valid_numbers = [] + + def validate_and_convert(x): + try: + num = float(x) if x is not None else None + if num is not None and not str(x).lower() == "invalid": + valid_numbers.append(num) + return int(num) + return None + except (ValueError, TypeError): + return None + + result = ( + Pipeline(raw_data) + .transform(lambda t: t.map(validate_and_convert)) + .transform(lambda t: t.filter(lambda x: x is not None)) + .to_list() + ) + + assert result == [1, 2, 3, 5, 7] + assert valid_numbers == [1.0, 2.0, 3.0, 5.0, 7.0] + + +class TestPipelinePerformanceIntegration: + """Test performance aspects of pipeline and transformer integration.""" + + def test_large_dataset_chunked_processing(self): + """Test chunked processing with large datasets.""" + # Process 10,000 numbers with small chunk size + transformer = Transformer.init(int, chunk_size=100).map(lambda x: x * 2).filter(lambda x: x % 1000 == 0) + + large_data = range(10000) + result = Pipeline(large_data).apply(transformer).to_list() + + # Should get multiples of 500 doubled (0, 1000, 2000, ..., 18000) + expected = [x * 2 for x in range(0, 10000, 500)] + assert result == expected + + def test_memory_efficient_processing(self): + """Test memory-efficient lazy processing.""" + # Process large dataset but only take first few results + transformer = Transformer.init(int).map(lambda x: x**2).filter(lambda x: x > 100) + + large_data = range(1000) + result = Pipeline(large_data).apply(transformer).first(5) + + # First 5 squares > 100: 121, 144, 169, 196, 225 + assert result == [121, 144, 169, 196, 225] + + def test_streaming_processing(self): + """Test streaming-like processing pattern.""" + # Simulate processing data as it arrives + batches = [[1, 2], [3, 4], [5, 6]] + results = [] + + for batch in batches: + batch_result = Pipeline(batch).transform(lambda t: t.map(lambda x: x * 3)).to_list() + results.extend(batch_result) + + assert results == [3, 6, 9, 12, 15, 18] diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py new file mode 100644 index 0000000..6de9a51 --- /dev/null +++ b/tests/test_pipeline.py @@ -0,0 +1,242 @@ +"""Tests for the Pipeline class.""" + +from efemel.pipeline.pipeline import Pipeline +from efemel.pipeline.transformers.transformer import Transformer + + +class TestPipelineBasics: + """Test basic pipeline functionality.""" + + def test_pipeline_creation_from_single_iterable(self): + """Test creating pipeline from single iterable.""" + pipeline = Pipeline([1, 2, 3]) + result = pipeline.to_list() + assert result == [1, 2, 3] + + def test_pipeline_creation_from_multiple_iterables(self): + """Test creating pipeline from multiple iterables.""" + pipeline = Pipeline([1, 2], [3, 4], [5]) + result = pipeline.to_list() + assert result == [1, 2, 3, 4, 5] + + def test_pipeline_iteration(self): + """Test that pipeline can be iterated over.""" + pipeline = Pipeline([1, 2, 3]) + result = list(pipeline) + assert result == [1, 2, 3] + + def test_pipeline_to_list_multiple_calls(self): + """Test that to_list can be called multiple times (iterator consumption).""" + pipeline = Pipeline([1, 2, 3]) + first_result = pipeline.to_list() + second_result = pipeline.to_list() + # Note: This tests current behavior - iterator is consumed after first call + assert first_result == [1, 2, 3] + assert second_result == [] # Iterator is consumed + + +class TestPipelineApply: + 
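
A note on the consumption behaviour tested above (illustrative, not part of the patch): the new Pipeline wraps its source in a single iterator (processed_data), so every terminal operation drains it and a second pass needs a fresh Pipeline:

    p = Pipeline([1, 2, 3])
    assert p.to_list() == [1, 2, 3]
    assert p.to_list() == []                            # iterator already consumed
    assert Pipeline([1, 2, 3]).to_list() == [1, 2, 3]   # rebuild to read again
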
"""Test apply method with transformers.""" + + def test_apply_transformer(self): + """Test apply with chained transformer operations.""" + transformer = Transformer.init(int).map(lambda x: x * 2).filter(lambda x: x > 4) + pipeline = Pipeline([1, 2, 3, 4]).apply(transformer) + result = pipeline.to_list() + assert result == [6, 8] + + def test_apply_callable_function(self): + """Test apply with a callable function.""" + + def double_generator(data): + for item in data: + yield item * 2 + + pipeline = Pipeline([1, 2, 3]).apply(double_generator) + result = pipeline.to_list() + assert result == [2, 4, 6] + + +class TestPipelineTransform: + """Test transform shorthand method.""" + + def test_transform_chain_operations(self): + """Test transform with chained operations.""" + pipeline = Pipeline([1, 2, 3, 4]).transform(lambda t: t.map(lambda x: x * 2).filter(lambda x: x > 4)) + result = pipeline.to_list() + assert result == [6, 8] + + def test_transform_with_custom_transformer(self): + """Test transform with custom transformer class.""" + + def custom_transform(transformer: Transformer[int, int]) -> Transformer[int, int]: + return transformer.map(lambda x: x + 10).filter(lambda x: x > 12) + + pipeline = Pipeline([1, 2, 3]).transform(custom_transform) + result = pipeline.to_list() + assert result == [13] # [11, 12, 13] filtered to [13] + + +class TestPipelineTerminalOperations: + """Test terminal operations that consume the pipeline.""" + + def test_each_applies_function_to_all_elements(self): + """Test each applies function to all elements.""" + results = [] + pipeline = Pipeline([1, 2, 3]) + pipeline.each(lambda x: results.append(x * 2)) + assert results == [2, 4, 6] + + def test_first_gets_first_n_elements(self): + """Test first gets first n elements.""" + pipeline = Pipeline([1, 2, 3, 4, 5]) + result = pipeline.first(3) + assert result == [1, 2, 3] + + def test_first_default_gets_one_element(self): + """Test first with default argument gets one element.""" + pipeline = Pipeline([1, 2, 3, 4, 5]) + result = pipeline.first() + assert result == [1] + + def test_first_with_small_dataset(self): + """Test first when requesting more elements than available.""" + pipeline = Pipeline([1, 2]) + result = pipeline.first(5) + assert result == [1, 2] + + def test_consume_processes_without_return(self): + """Test consume processes all elements without returning anything.""" + side_effects = [] + transformer = Transformer.init(int).tap(lambda x: side_effects.append(x)) + pipeline = Pipeline([1, 2, 3]).apply(transformer) + + result = pipeline.consume() + assert result is None + assert side_effects == [1, 2, 3] + + def test_to_list_returns_all_elements(self): + """Test to_list returns all processed elements.""" + pipeline = Pipeline([1, 2, 3, 4, 5]) + result = pipeline.to_list() + assert result == [1, 2, 3, 4, 5] + + +class TestPipelineChaining: + """Test chaining pipeline operations.""" + + def test_apply_then_terminal_operation(self): + """Test applying transformer then using terminal operation.""" + transformer = Transformer.init(int).map(lambda x: x * 2) + pipeline = Pipeline([1, 2, 3]).apply(transformer) + result = pipeline.first(2) + assert result == [2, 4] + + def test_multiple_transforms(self): + """Test applying multiple transforms.""" + pipeline = ( + Pipeline([1, 2, 3, 4]).transform(lambda t: t.map(lambda x: x * 2)).transform(lambda t: t.filter(lambda x: x > 4)) + ) + result = pipeline.to_list() + assert result == [6, 8] + + def test_transform_then_apply(self): + """Test transform followed by apply.""" 
+ transformer = Transformer.init(int).filter(lambda x: x > 4) + pipeline = Pipeline([1, 2, 3, 4, 5]).transform(lambda t: t.map(lambda x: x * 2)).apply(transformer) + result = pipeline.to_list() + assert result == [6, 8, 10] + + +class TestPipelineDataTypes: + """Test pipeline with different data types.""" + + def test_pipeline_with_strings(self): + """Test pipeline with string data.""" + pipeline = Pipeline(["hello", "world"]).transform(lambda t: t.map(lambda x: x.upper())) + result = pipeline.to_list() + assert result == ["HELLO", "WORLD"] + + def test_pipeline_with_mixed_data(self): + """Test pipeline with mixed data types.""" + pipeline = Pipeline([1, "hello", 3.14]).transform(lambda t: t.map(lambda x: str(x))) + result = pipeline.to_list() + assert result == ["1", "hello", "3.14"] + + def test_pipeline_with_complex_objects(self): + """Test pipeline with complex objects.""" + data = [{"name": "Alice", "age": 30}, {"name": "Bob", "age": 25}] + pipeline = Pipeline(data).transform(lambda t: t.map(lambda x: x["name"])) + result = pipeline.to_list() + assert result == ["Alice", "Bob"] + + +class TestPipelineEdgeCases: + """Test edge cases for pipeline.""" + + def test_empty_pipeline(self): + """Test pipeline with empty data.""" + pipeline = Pipeline([]) + result = pipeline.to_list() + assert result == [] + + def test_empty_pipeline_terminal_operations(self): + """Test terminal operations on empty pipeline.""" + # Test each + results = [] + pipeline_each = Pipeline([]) + pipeline_each.each(lambda x: results.append(x)) + assert results == [] + + # Test first + pipeline_first = Pipeline([]) + result = pipeline_first.first(5) + assert result == [] + + # Test consume + pipeline_consume = Pipeline([]) + result = pipeline_consume.consume() + assert result is None + + def test_single_element_pipeline(self): + """Test pipeline with single element.""" + pipeline = Pipeline([42]) + result = pipeline.to_list() + assert result == [42] + + def test_pipeline_type_preservation(self): + """Test that pipeline preserves and transforms types correctly.""" + # Start with integers + pipeline = Pipeline([1, 2, 3]) + int_result = pipeline.to_list() + assert all(isinstance(x, int) for x in int_result) + + # Transform to strings + pipeline = Pipeline([1, 2, 3]).transform(lambda t: t.map(lambda x: str(x))) + str_result = pipeline.to_list() + assert all(isinstance(x, str) for x in str_result) + assert str_result == ["1", "2", "3"] + + +class TestPipelinePerformance: + """Test pipeline performance characteristics.""" + + def test_large_dataset_processing(self): + """Test pipeline can handle large datasets.""" + large_data = list(range(10000)) + pipeline = Pipeline(large_data).transform(lambda t: t.map(lambda x: x * 2).filter(lambda x: x % 100 == 0)) + result = pipeline.to_list() + + # Should have every 50th element doubled (0, 100, 200, ..., 19800) + expected = [x * 2 for x in range(0, 10000, 50)] + assert result == expected + + def test_chunked_processing(self): + """Test that chunked processing works correctly.""" + # Use small chunk size to test chunking behavior + transformer = Transformer.init(int, chunk_size=10).map(lambda x: x + 1) + pipeline = Pipeline(list(range(100))).apply(transformer) + result = pipeline.to_list() + + expected = list(range(1, 101)) # [1, 2, 3, ..., 100] + assert result == expected diff --git a/tests/test_transformer.py b/tests/test_transformer.py new file mode 100644 index 0000000..b01109a --- /dev/null +++ b/tests/test_transformer.py @@ -0,0 +1,211 @@ +"""Tests for the Transformer class.""" 
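
For orientation before the Transformer tests that follow (illustrative, not part of the patch): the transformer walks its input in fixed-size chunks via itertools.islice and pushes each list through the composed per-chunk functions, which is what the chunk_size-sensitive tests below exercise. A stripped-down sketch of that chunking loop, mirroring the _chunk_generator shown earlier in this series:

    import itertools

    def chunked(data, size):
        # Slice the iterator into lists of at most `size` items until it is exhausted.
        it = iter(data)
        while chunk := list(itertools.islice(it, size)):
            yield chunk

    assert list(chunked(range(5), 2)) == [[0, 1], [2, 3], [4]]
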
+ +from efemel.pipeline.transformers.transformer import PipelineContext +from efemel.pipeline.transformers.transformer import Transformer + + +class TestTransformerBasics: + """Test basic transformer functionality.""" + + def test_init_creates_identity_transformer(self): + """Test that init creates an identity transformer.""" + transformer = Transformer.init(int) + result = list(transformer([1, 2, 3])) + assert result == [1, 2, 3] + + def test_init_with_chunk_size(self): + """Test init with custom chunk size.""" + transformer = Transformer.init(int, chunk_size=2) + assert transformer.chunk_size == 2 + result = list(transformer([1, 2, 3, 4])) + assert result == [1, 2, 3, 4] + + def test_init_with_context(self): + """Test init with custom context.""" + context = PipelineContext({"key": "value"}) + transformer = Transformer(context=context) + assert transformer.context == context + + def test_call_executes_transformer(self): + """Test that calling transformer executes it on data.""" + transformer = Transformer.init(int) + result = list(transformer([1, 2, 3])) + assert result == [1, 2, 3] + + +class TestTransformerOperations: + """Test transformer operations like map, filter, etc.""" + + def test_map_transforms_elements(self): + """Test map transforms each element.""" + transformer = Transformer.init(int).map(lambda x: x * 2) + result = list(transformer([1, 2, 3])) + assert result == [2, 4, 6] + + def test_map_with_context_aware_function(self): + """Test map with context-aware function.""" + context = PipelineContext({"multiplier": 3}) + transformer = Transformer(context=context).map(lambda x, ctx: x * ctx["multiplier"]) + result = list(transformer([1, 2, 3])) + assert result == [3, 6, 9] + + def test_filter_keeps_matching_elements(self): + """Test filter keeps only matching elements.""" + transformer = Transformer.init(int).filter(lambda x: x % 2 == 0) + result = list(transformer([1, 2, 3, 4, 5, 6])) + assert result == [2, 4, 6] + + def test_filter_with_context_aware_function(self): + """Test filter with context-aware function.""" + context = PipelineContext({"threshold": 3}) + transformer = Transformer(context=context).filter(lambda x, ctx: x > ctx["threshold"]) + result = list(transformer([1, 2, 3, 4, 5])) + assert result == [4, 5] + + def test_flatten_list_of_lists(self): + """Test flatten with list of lists.""" + transformer = Transformer.init(list).flatten() + result = list(transformer([[1, 2], [3, 4], [5]])) + assert result == [1, 2, 3, 4, 5] + + def test_flatten_list_of_tuples(self): + """Test flatten with list of tuples.""" + transformer = Transformer.init(tuple).flatten() + result = list(transformer([(1, 2), (3, 4), (5,)])) + assert result == [1, 2, 3, 4, 5] + + def test_flatten_list_of_sets(self): + """Test flatten with list of sets.""" + transformer = Transformer.init(set).flatten() + result = list(transformer([{1, 2}, {3, 4}, {5}])) + # Sets are unordered, so we sort for comparison + assert sorted(result) == [1, 2, 3, 4, 5] + + def test_tap_applies_side_effect_without_modification(self): + """Test tap applies side effect without modifying data.""" + side_effects = [] + transformer = Transformer.init(int).tap(lambda x: side_effects.append(x)) + result = list(transformer([1, 2, 3])) + + assert result == [1, 2, 3] # Data unchanged + assert side_effects == [1, 2, 3] # Side effect applied + + def test_tap_with_context_aware_function(self): + """Test tap with context-aware function.""" + side_effects = [] + context = PipelineContext({"prefix": "item:"}) + transformer = 
Transformer(context=context).tap(lambda x, ctx: side_effects.append(f"{ctx['prefix']}{x}")) + result = list(transformer([1, 2, 3])) + + assert result == [1, 2, 3] + assert side_effects == ["item:1", "item:2", "item:3"] + + def test_apply_composes_transformers(self): + """Test apply composes transformers.""" + transformer1 = Transformer.init(int).map(lambda x: x * 2) + transformer2 = transformer1.apply(lambda t: t.filter(lambda x: x > 4)) + result = list(transformer2([1, 2, 3, 4])) + assert result == [6, 8] # [2, 4, 6, 8] filtered to [6, 8] + + +class TestTransformerChaining: + """Test chaining multiple transformer operations.""" + + def test_map_then_filter(self): + """Test map followed by filter.""" + transformer = Transformer.init(int).map(lambda x: x * 2).filter(lambda x: x > 4) + result = list(transformer([1, 2, 3, 4])) + assert result == [6, 8] + + def test_filter_then_map(self): + """Test filter followed by map.""" + transformer = Transformer.init(int).filter(lambda x: x % 2 == 0).map(lambda x: x * 3) + result = list(transformer([1, 2, 3, 4, 5, 6])) + assert result == [6, 12, 18] + + def test_map_flatten_filter(self): + """Test map, flatten, then filter.""" + transformer = Transformer.init(int).map(lambda x: [x, x * 2]).flatten().filter(lambda x: x > 3) + result = list(transformer([1, 2, 3])) + assert result == [4, 6] # [[1,2], [2,4], [3,6]] -> [1,2,2,4,3,6] -> [4,6] + + def test_complex_chain_with_tap(self): + """Test complex chain with tap for side effects.""" + side_effects = [] + transformer = ( + Transformer.init(int) + .map(lambda x: x * 2) + .tap(lambda x: side_effects.append(f"doubled: {x}")) + .filter(lambda x: x > 4) + .tap(lambda x: side_effects.append(f"filtered: {x}")) + ) + + result = list(transformer([1, 2, 3, 4])) + assert result == [6, 8] + assert side_effects == ["doubled: 2", "doubled: 4", "doubled: 6", "doubled: 8", "filtered: 6", "filtered: 8"] + + +class TestTransformerTerminalOperations: + """Test terminal operations like reduce.""" + + def test_reduce_sums_elements(self): + """Test reduce sums all elements.""" + transformer = Transformer.init(int) + reducer = transformer.reduce(lambda acc, x: acc + x, initial=0) + result = list(reducer([1, 2, 3, 4])) + assert result == [10] + + def test_reduce_with_context_aware_function(self): + """Test reduce with context-aware function.""" + context = PipelineContext({"multiplier": 2}) + transformer = Transformer(context=context) + reducer = transformer.reduce(lambda acc, x, ctx: acc + (x * ctx["multiplier"]), initial=0) + result = list(reducer([1, 2, 3])) + assert result == [12] # (1*2) + (2*2) + (3*2) = 2 + 4 + 6 = 12 + + def test_reduce_with_map_chain(self): + """Test reduce after map transformation.""" + transformer = Transformer.init(int).map(lambda x: x * 2) + reducer = transformer.reduce(lambda acc, x: acc + x, initial=0) + result = list(reducer([1, 2, 3])) + assert result == [12] # [2, 4, 6] summed = 12 + + +class TestTransformerChunking: + """Test chunking behavior.""" + + def test_chunking_with_small_chunk_size(self): + """Test transformer works correctly with small chunk sizes.""" + transformer = Transformer.init(int, chunk_size=2).map(lambda x: x * 2) + result = list(transformer([1, 2, 3, 4, 5])) + assert result == [2, 4, 6, 8, 10] + + def test_chunking_with_large_data(self): + """Test transformer works correctly with large datasets.""" + transformer = Transformer.init(int, chunk_size=100).map(lambda x: x + 1) + large_data = list(range(1000)) + result = list(transformer(large_data)) + expected = [x + 1 for x in 
large_data] + assert result == expected + + +class TestTransformerEdgeCases: + """Test edge cases for transformer.""" + + def test_empty_data(self): + """Test transformer with empty data.""" + transformer = Transformer.init(int).map(lambda x: x * 2) + result = list(transformer([])) + assert result == [] + + def test_single_element(self): + """Test transformer with single element.""" + transformer = Transformer.init(int).map(lambda x: x * 2).filter(lambda x: x > 0) + result = list(transformer([5])) + assert result == [10] + + def test_filter_removes_all_elements(self): + """Test filter that removes all elements.""" + transformer = Transformer.init(int).filter(lambda x: x > 100) + result = list(transformer([1, 2, 3])) + assert result == [] diff --git a/tests_old/test_pipeline.py b/tests_old/test_pipeline.py deleted file mode 100644 index e69de29..0000000 diff --git a/tests_old/test_pipeline_basics.py b/tests_old/test_pipeline_basics.py deleted file mode 100644 index be60b7b..0000000 --- a/tests_old/test_pipeline_basics.py +++ /dev/null @@ -1,83 +0,0 @@ -""" -Test basic Pipeline functionality. - -This module contains tests for basic Pipeline class functionality -including initialization, iteration, and core methods. -""" - -from efemel.pipeline import Pipeline - - -class TestPipelineBasics: - """Test basic Pipeline functionality.""" - - def test_pipeline_initialization(self): - """Test Pipeline initialization with various iterables.""" - # Test with default parameters - pipeline = Pipeline() - assert isinstance(pipeline, Pipeline) - assert pipeline.chunk_size == 1000 - assert isinstance(pipeline.context, dict) - - # Test with custom chunk size - pipeline = Pipeline(chunk_size=500) - assert pipeline.chunk_size == 500 - - # Test with custom context - from efemel.pipeline import PipelineContext - - custom_context = PipelineContext() - custom_context["key"] = "value" - pipeline = Pipeline(context=custom_context) - assert pipeline.context["key"] == "value" - - def test_pipeline_iteration(self): - """Test Pipeline iteration behavior through callable interface.""" - pipeline = Pipeline() - - # Test iteration via __call__ - data = [1, 2, 3, 4, 5] - result = [] - for item in pipeline(data): - result.append(item) - - assert result == [1, 2, 3, 4, 5] - - # Test with multiple data sources - result2 = list(pipeline([1, 2], [3, 4, 5])) - assert result2 == [1, 2, 3, 4, 5] - - def test_to_list(self): - """Test Pipeline.to_list() method.""" - pipeline = Pipeline() - result = pipeline.to_list([1, 2, 3, 4, 5]) - - assert result == [1, 2, 3, 4, 5] - assert isinstance(result, list) - - # Test with empty pipeline - assert pipeline.to_list([]) == [] - - # Test with multiple data sources - result2 = pipeline.to_list([1, 2], [3, 4, 5]) - assert result2 == [1, 2, 3, 4, 5] - - def test_first(self): - """Test Pipeline.first() method.""" - pipeline = Pipeline() - result = pipeline.first([1, 2, 3, 4, 5]) - assert result == [1] - - # Test with string - result = pipeline.first(["hello", "world"]) - assert result == ["hello"] - - # Test first n elements - result = pipeline.first([1, 2, 3, 4, 5], n=3) - assert result == [1, 2, 3] - - def test_first_empty_pipeline(self): - """Test Pipeline.first() with empty pipeline.""" - pipeline = Pipeline() - result = pipeline.first([]) - assert result == [] diff --git a/tests_old/test_pipeline_composition.py b/tests_old/test_pipeline_composition.py deleted file mode 100644 index d6a724b..0000000 --- a/tests_old/test_pipeline_composition.py +++ /dev/null @@ -1,113 +0,0 @@ -""" -Test 
Pipeline composition functionality. - -This module contains tests for Pipeline composition including -pipeline chaining, from_pipeline method, and complex compositions. -""" - -from efemel.pipeline import Pipeline - - -class TestPipelineComposition: - """Test Pipeline composition functionality.""" - - def test_pipeline_apply_method(self): - """Test applying one pipeline to another using apply method.""" - # Create source pipeline that doubles numbers - doubler_pipeline = Pipeline().map(lambda x: x * 2) - - # Create main pipeline that adds 1 - main_pipeline = Pipeline().map(lambda x: x + 1) - - # Apply the doubler to the main pipeline - composed = main_pipeline.apply(doubler_pipeline) - - # Test the composition: first add 1, then double - result = composed.to_list([1, 2, 3, 4, 5]) - assert result == [4, 6, 8, 10, 12] # (1+1)*2, (2+1)*2, (3+1)*2, (4+1)*2, (5+1)*2 - - def test_pipeline_chaining_with_multiple_data_sources(self): - """Test chaining operations with multiple data sources.""" - pipeline = Pipeline() - - # Chain operations: double, then filter even, then add 10 - result = pipeline.map(lambda x: x * 2).filter(lambda x: x % 2 == 0).map(lambda x: x + 10) - - # Test with multiple data sources - data1 = [1, 2, 3] - data2 = [4, 5, 6] - result_list = result.to_list(data1, data2) - - # Expected: double [1,2,3,4,5,6] -> [2,4,6,8,10,12], filter even -> [2,4,6,8,10,12], add 10 -> [12,14,16,18,20,22] - assert result_list == [12, 14, 16, 18, 20, 22] - - def test_complex_composition(self): - """Test complex pipeline composition.""" - # Create a pipeline that processes data in multiple steps - pipeline = Pipeline() - - # Complex chain: map to square, filter > 10, map to string, filter length > 1 - result = pipeline.map(lambda x: x**2).filter(lambda x: x > 10).map(str).filter(lambda s: len(s) > 1) - - # Test with data [1, 2, 3, 4, 5, 6] - # Square: [1, 4, 9, 16, 25, 36] - # Filter > 10: [16, 25, 36] - # To string: ["16", "25", "36"] - # Filter length > 1: ["16", "25", "36"] (all have length 2) - result_list = result.to_list([1, 2, 3, 4, 5, 6]) - assert result_list == ["16", "25", "36"] - - def test_pipeline_apply_with_empty_data(self): - """Test apply method with empty data.""" - doubler = Pipeline().map(lambda x: x * 2) - main = Pipeline().map(lambda x: x + 1) - - composed = main.apply(doubler) - result = composed.to_list([]) - assert result == [] - - def test_pipeline_apply_with_filters(self): - """Test apply method combining filters and maps.""" - # Create a pipeline that filters even numbers - even_filter = Pipeline().filter(lambda x: x % 2 == 0) - - # Create a pipeline that doubles numbers - doubler = Pipeline().map(lambda x: x * 2) - - # Apply even filter first, then doubler - composed = even_filter.apply(doubler) - - result = composed.to_list([1, 2, 3, 4, 5, 6]) - assert result == [4, 8, 12] # Even numbers [2, 4, 6] doubled - - def test_multiple_pipeline_applications(self): - """Test applying multiple pipelines in sequence.""" - # Create individual transformation pipelines - add_one = Pipeline().map(lambda x: x + 1) - double = Pipeline().map(lambda x: x * 2) - to_string = Pipeline().map(str) - - # Chain them using apply - result_pipeline = add_one.apply(double).apply(to_string) - - # Test: (1+1)*2=4, (2+1)*2=6, (3+1)*2=8 -> ["4", "6", "8"] - result = result_pipeline.to_list([1, 2, 3]) - assert result == ["4", "6", "8"] - - def test_complex_pipeline_composition(self): - """Test complex pipeline composition scenario.""" - # Use a single pipeline with multiple data sources instead of chaining 
- pipeline = Pipeline() - - # Apply transformations: filter > 3, square, then reduce to sum - result = ( - pipeline.filter(lambda x: x > 3) - .map(lambda x: x**2) - .reduce([2, 4, 6, 8], [1, 3, 5, 7], function=lambda acc, x: acc + x, initial=0) - ) - - # Data: [2, 4, 6, 8] + [1, 3, 5, 7] = [2, 4, 6, 8, 1, 3, 5, 7] - # Filter > 3: [4, 6, 8, 5, 7] - # Square: [16, 36, 64, 25, 49] - # Sum: 190 - assert result == 190 diff --git a/tests_old/test_pipeline_concurrency.py b/tests_old/test_pipeline_concurrency.py deleted file mode 100644 index 6a6d525..0000000 --- a/tests_old/test_pipeline_concurrency.py +++ /dev/null @@ -1,87 +0,0 @@ -""" -Test Pipeline concurrency functionality. - -This module contains tests for Pipeline concurrent operations -including basic concurrency and context isolation. -""" - -from efemel.pipeline import Pipeline - - -class TestPipelineConcurrency: - """Test Pipeline concurrency functionality.""" - - def test_concurrent(self): - # Create a simple pipeline - - def _double_pipeline(p): - return p.map(lambda x: x * 2) - - pipeline = Pipeline(chunk_size=1) - result = list(pipeline.concurrent(range(0, 5), pipeline=_double_pipeline, max_workers=5)) - - # Check if all numbers are doubled - assert result == [0, 2, 4, 6, 8] - - def test_concurrent_consumer(self): - # Create a simple pipeline - - items = [] - - def dummy_consumer(p): - return p.tap(lambda x: items.append(x)) - - pipeline = Pipeline(chunk_size=1) - list(pipeline.concurrent(range(0, 5), pipeline=dummy_consumer, max_workers=5)) - - # Check if all numbers are consumed - assert items == [0, 1, 2, 3, 4] - - def test_concurrent_vs_sequential(self): - """Test that concurrent and sequential processing produce the same results.""" - data = list(range(20)) - - def transform_pipeline(p): - return p.map(lambda x: x * 3).filter(lambda x: x % 2 == 0) - - # Sequential - sequential_pipeline = Pipeline().apply(transform_pipeline) - sequential_result = sequential_pipeline.to_list(data) - - # Concurrent - concurrent_pipeline = Pipeline(chunk_size=5) - concurrent_result = list(concurrent_pipeline.concurrent(data, pipeline=transform_pipeline, max_workers=3)) - - assert sequential_result == concurrent_result - - def test_concurrent_unordered(self): - """Test unordered concurrent processing.""" - data = list(range(20)) - - def simple_transform(p): - return p.map(lambda x: x * 2) - - # Unordered processing - pipeline = Pipeline(chunk_size=4) - result = list(pipeline.concurrent(data, pipeline=simple_transform, max_workers=3, ordered=False)) - - # Results should be correct but may be out of order - expected = [x * 2 for x in data] - assert sorted(result) == sorted(expected) - - def test_concurrent_context_thread_safety(self): - """Test that context updates are thread-safe during concurrent processing.""" - data = list(range(100)) - - def context_aware_pipeline(p): - """Pipeline that interacts with context.""" - return p.map(lambda x, ctx: x * 2 if not ctx.get("has_errors", False) else x) - - pipeline = Pipeline(chunk_size=10) - pipeline.context["has_errors"] = False - result = list(pipeline.concurrent(data, pipeline=context_aware_pipeline, max_workers=4)) - - # Should get consistent results despite concurrent access to context - expected = [x * 2 for x in data] - assert result == expected - assert not pipeline.context.get("has_errors", False) diff --git a/tests_old/test_pipeline_context.py b/tests_old/test_pipeline_context.py deleted file mode 100644 index 077b39f..0000000 --- a/tests_old/test_pipeline_context.py +++ /dev/null @@ -1,92 +0,0 
@@ -""" -Test Pipeline global context functionality. - -This module contains tests for Pipeline global context operations -including context propagation and sharing between operations. -""" - -from efemel.pipeline import Pipeline - - -class TestPipelineContext: - """Test Pipeline global context functionality.""" - - def test_map_sets_context_value_propagated_to_all_items(self): - """Test that when map function sets a value in context once, it gets passed to all items.""" - pipeline = Pipeline([1, 2, 3, 4, 5]) - - context_access_count = 0 - - def set_context_once(value, context): - nonlocal context_access_count - context_access_count += 1 - - # Set a value in context only on the first call - if context_access_count == 1: - context["shared_value"] = "set_by_first_item" - - # All items should be able to access the shared value - shared = context.get("shared_value", "not_found") - return f"{value}_{shared}" - - result = pipeline.map(set_context_once).to_list() - - # Verify that all items got access to the shared value - expected = [ - "1_set_by_first_item", - "2_set_by_first_item", - "3_set_by_first_item", - "4_set_by_first_item", - "5_set_by_first_item", - ] - - assert result == expected - - # Verify the function was called for each item - assert context_access_count == 5 - - def test_concurrent_vs_sequential_context(self): - """Test that concurrent and sequential processing produce the same results.""" - data = list(range(20)) - - def transform_pipeline(p): - return p.map(lambda x: x * 3).filter(lambda x: x % 2 == 0) - - # Sequential - sequential_result = Pipeline().apply(transform_pipeline).to_list(data) - - # Concurrent - concurrent_result = list(Pipeline(5).concurrent(data, pipeline=transform_pipeline, max_workers=3)) - - assert sequential_result == concurrent_result - - def test_context_modifications_merged_in_concurrent_operations(self): - """Test that context modifications in concurrent processing are properly merged back.""" - data = list(range(20)) - - def context_modifying_pipeline(p): - """Pipeline that modifies context during processing.""" - - def modify_context(value, context): - # Modify context - these changes should be merged back - if "processed_values" not in context: - context["processed_values"] = [] - context["processed_values"].append(value) - return value * 2 - - return p.map(modify_context) - - pipeline = Pipeline(5) - result = list(pipeline.concurrent(data, pipeline=context_modifying_pipeline, max_workers=2)) - - # Verify results are correct - expected = [x * 2 for x in data] - assert result == expected - - # Context modifications should be merged back from all threads - # Each thread processes its chunk and adds values to context - assert "processed_values" in pipeline.context - assert len(pipeline.context["processed_values"]) > 0 - # Should have all processed values (order may vary due to concurrency) - assert set(pipeline.context["processed_values"]) == set(data) - assert not pipeline.context["has_errors"] diff --git a/tests_old/test_pipeline_each.py b/tests_old/test_pipeline_each.py deleted file mode 100644 index 4dbfe4d..0000000 --- a/tests_old/test_pipeline_each.py +++ /dev/null @@ -1,49 +0,0 @@ -""" -Test Pipeline each functionality. - -This module contains tests for Pipeline each operations -including side effects and terminal operation behavior. 
-""" - -from efemel.pipeline import Pipeline - - -class TestPipelineEach: - """Test Pipeline each functionality.""" - - def test_each_basic(self): - """Test basic each operations.""" - pipeline = Pipeline() - side_effects = [] - - # Each should execute function for each item - pipeline.each([1, 2, 3, 4, 5], function=side_effects.append) - - assert side_effects == [1, 2, 3, 4, 5] - - def test_each_returns_none(self): - """Test that each returns None.""" - pipeline = Pipeline() - - result = pipeline.each([1, 2, 3, 4, 5], function=lambda x: None) - assert result is None - - def test_each_with_empty_pipeline(self): - """Test each with empty pipeline.""" - pipeline = Pipeline() - side_effects = [] - - # Should not execute function - pipeline.each([], function=side_effects.append) - - assert side_effects == [] - - def test_each_terminal_operation(self): - """Test that each is a terminal operation.""" - pipeline = Pipeline() - side_effects = [] - - # After each, pipeline should be consumed - pipeline.each([1, 2, 3, 4, 5], function=side_effects.append) - - assert side_effects == [1, 2, 3, 4, 5] diff --git a/tests_old/test_pipeline_edge_cases.py b/tests_old/test_pipeline_edge_cases.py deleted file mode 100644 index b119e15..0000000 --- a/tests_old/test_pipeline_edge_cases.py +++ /dev/null @@ -1,77 +0,0 @@ -""" -Test Pipeline edge cases and error conditions. - -This module contains tests for Pipeline edge cases including -None values, mixed types, reuse, and generator behavior. -""" - -from efemel.pipeline import Pipeline - - -class TestPipelineEdgeCases: - """Test Pipeline edge cases and error conditions.""" - - def test_pipeline_with_none_values(self): - """Test pipeline with None values.""" - # Should handle None values properly - pipeline = Pipeline() - result = pipeline.to_list([1, None, 3, None, 5]) - assert result == [1, None, 3, None, 5] - - # Filter out None values - use fresh pipeline - pipeline2 = Pipeline() - no_none = pipeline2.filter(lambda x: x is not None) - assert no_none.to_list([1, None, 3, None, 5]) == [1, 3, 5] - - def test_pipeline_with_mixed_types(self): - """Test pipeline with mixed data types.""" - # Should handle mixed types - pipeline = Pipeline() - result = pipeline.to_list([1, "hello", 3.14, True, None]) - assert result == [1, "hello", 3.14, True, None] - - # Filter by type (excluding boolean which is a subclass of int) - use fresh pipeline - pipeline2 = Pipeline() - numbers = pipeline2.filter(lambda x: isinstance(x, int | float) and not isinstance(x, bool) and x is not None) - assert numbers.to_list([1, "hello", 3.14, True, None]) == [1, 3.14] - - def test_pipeline_reuse(self): - """Test that pipelines can be reused.""" - original_data = [1, 2, 3, 4, 5] - pipeline = Pipeline() - - # First use - result1 = pipeline.map(lambda x: x * 2).to_list(original_data) - assert result1 == [2, 4, 6, 8, 10] - - # Pipeline should be reusable with different data - new_data = [10, 20, 30] - result2 = pipeline.to_list(new_data) # Same pipeline, different data - assert result2 == [20, 40, 60] - - def test_pipeline_method_chaining_returns_new_instances(self): - """Test that pipeline methods return new instances.""" - # Create separate pipelines for different operations - mapped = Pipeline().map(lambda x: x * 2) - filtered = Pipeline().filter(lambda x: x % 2 == 0) - - # Test that they work correctly with different transformations - assert mapped.to_list([1, 2, 3, 4, 5]) == [2, 4, 6, 8, 10] - assert filtered.to_list([1, 2, 3, 4, 5]) == [2, 4] - assert mapped is not filtered - - def 
test_pipeline_with_generator_exhaustion(self): - """Test pipeline behavior with generator exhaustion.""" - - def number_generator(): - yield from range(3) - - pipeline = Pipeline() - - # First consumption - result1 = pipeline.to_list(number_generator()) - assert result1 == [0, 1, 2] - - # New generator for second consumption (generators are single-use) - result2 = pipeline.to_list(number_generator()) - assert result2 == [0, 1, 2] diff --git a/tests_old/test_pipeline_filtering.py b/tests_old/test_pipeline_filtering.py deleted file mode 100644 index a6c161c..0000000 --- a/tests_old/test_pipeline_filtering.py +++ /dev/null @@ -1,56 +0,0 @@ -""" -Test Pipeline filtering functionality. - -This module contains tests for Pipeline filtering operations -including basic filtering, chaining, and edge cases. -""" - -from efemel.pipeline import Pipeline - - -class TestPipelineFiltering: - """Test Pipeline filtering functionality.""" - - def test_filter_basic(self): - """Test basic filtering operations.""" - # Filter even numbers - pipeline1 = Pipeline().filter(lambda x: x % 2 == 0) - assert pipeline1.to_list([1, 2, 3, 4, 5]) == [2, 4] - - # Filter numbers greater than 3 - use fresh pipeline - pipeline2 = Pipeline().filter(lambda x: x > 3) - assert pipeline2.to_list([1, 2, 3, 4, 5]) == [4, 5] - - def test_filter_with_strings(self): - """Test filtering with string data.""" - # Filter strings longer than 4 characters - pipeline1 = Pipeline().filter(lambda s: len(s) > 4) - assert pipeline1.to_list(["hello", "world", "python", "test"]) == ["hello", "world", "python"] - - # Filter strings starting with 'p' - use fresh pipeline - pipeline2 = Pipeline().filter(lambda s: s.startswith("p")) - assert pipeline2.to_list(["hello", "world", "python", "test"]) == ["python"] - - def test_filter_empty_result(self): - """Test filter that results in empty pipeline.""" - pipeline = Pipeline() - - # Filter that matches nothing - empty_result = pipeline.filter(lambda x: x > 10) - assert empty_result.to_list([1, 2, 3, 4, 5]) == [] - - def test_filter_all_pass(self): - """Test filter where all items pass.""" - pipeline = Pipeline() - - # Filter that passes everything - all_pass = pipeline.filter(lambda x: True) - assert all_pass.to_list([1, 2, 3, 4, 5]) == [1, 2, 3, 4, 5] - - def test_filter_chaining(self): - """Test chaining multiple filters.""" - pipeline = Pipeline() - - # Chain filters: even numbers and greater than 5 - result = pipeline.filter(lambda x: x % 2 == 0).filter(lambda x: x > 5) - assert result.to_list([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) == [6, 8, 10] diff --git a/tests_old/test_pipeline_integration.py b/tests_old/test_pipeline_integration.py deleted file mode 100644 index d1d0569..0000000 --- a/tests_old/test_pipeline_integration.py +++ /dev/null @@ -1,79 +0,0 @@ -""" -Integration tests for Pipeline class. - -This module contains end-to-end integration tests for complex -Pipeline operations and realistic data processing scenarios. 
-""" - -from efemel.pipeline import Pipeline - - -class TestPipelineIntegration: - """Integration tests for Pipeline class.""" - - def test_data_processing_pipeline(self): - """Test a realistic data processing pipeline.""" - # Simulate processing a list of user data - users = [ - {"name": "Alice", "age": 30, "active": True}, - {"name": "Bob", "age": 25, "active": False}, - {"name": "Charlie", "age": 35, "active": True}, - {"name": "Diana", "age": 28, "active": True}, - {"name": "Eve", "age": 22, "active": False}, - ] - - pipeline = Pipeline() - - # Process: filter active users, extract names, convert to uppercase - result = pipeline.filter(lambda user: user["active"]).map(lambda user: user["name"]).map(str.upper) - - assert result.to_list(users) == ["ALICE", "CHARLIE", "DIANA"] - - def test_number_processing_pipeline(self): - """Test a number processing pipeline.""" - numbers = range(1, 21) # 1 to 20 - - pipeline = Pipeline() - - # Process: filter even numbers, square them, filter > 50, sum - result = ( - pipeline.filter(lambda x: x % 2 == 0) # [2, 4, 6, 8, 10, 12, 14, 16, 18, 20] - .map(lambda x: x**2) # [4, 16, 36, 64, 100, 144, 196, 256, 324, 400] - .filter(lambda x: x > 50) # [64, 100, 144, 196, 256, 324, 400] - .reduce(numbers, function=lambda acc, x: acc + x, initial=0) - ) # 1484 - - assert result == 1484 - - def test_text_processing_pipeline(self): - """Test a text processing pipeline.""" - text = "Hello world! This is a test. Python is amazing." - words = text.split() - - pipeline = Pipeline() - - # Process: filter words > 3 chars, remove punctuation, lowercase, get unique - result = ( - pipeline.filter(lambda word: len(word) > 3) - .map(lambda word: word.strip(".,!")) - .map(str.lower) - .filter(lambda word: word not in ["this"]) - ) # Simple "unique" filter - - expected = ["hello", "world", "test", "python", "amazing"] - assert result.to_list(words) == expected - - def test_nested_data_processing(self): - """Test processing nested data structures.""" - data = [[1, 2, 3], [4, 5, 6], [7, 8, 9]] - - pipeline = Pipeline() - - # Flatten, filter odd numbers, square them - result = ( - pipeline.flatten() # [1, 2, 3, 4, 5, 6, 7, 8, 9] - .filter(lambda x: x % 2 == 1) # [1, 3, 5, 7, 9] - .map(lambda x: x**2) - ) # [1, 9, 25, 49, 81] - - assert result.to_list(data) == [1, 9, 25, 49, 81] diff --git a/tests_old/test_pipeline_map_filter.py b/tests_old/test_pipeline_map_filter.py deleted file mode 100644 index 1b85c5a..0000000 --- a/tests_old/test_pipeline_map_filter.py +++ /dev/null @@ -1,41 +0,0 @@ -""" -Test Pipeline map and filter combinations. - -This module contains tests for combining mapping and filtering operations -in various orders and combinations. 
-""" - -from efemel.pipeline import Pipeline - - -class TestPipelineMapFilter: - """Test Pipeline map and filter combinations.""" - - def test_map_then_filter(self): - """Test mapping then filtering.""" - pipeline = Pipeline() - - # Double numbers, then filter even results - result = pipeline.map(lambda x: x * 2).filter(lambda x: x % 2 == 0) - assert result.to_list([1, 2, 3, 4, 5]) == [2, 4, 6, 8, 10] - - def test_filter_then_map(self): - """Test filtering then mapping.""" - pipeline = Pipeline() - - # Filter even numbers, then double them - result = pipeline.filter(lambda x: x % 2 == 0).map(lambda x: x * 2) - assert result.to_list([1, 2, 3, 4, 5]) == [4, 8] - - def test_complex_map_filter_chain(self): - """Test complex chaining of map and filter operations.""" - pipeline = Pipeline() - - # Complex chain: filter odd, multiply by 3, filter > 10 - result = ( - pipeline.filter(lambda x: x % 2 == 1) # [1, 3, 5, 7, 9] - .map(lambda x: x * 3) # [3, 9, 15, 21, 27] - .filter(lambda x: x > 10) - ) # [15, 21, 27] - - assert result.to_list([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) == [15, 21, 27] diff --git a/tests_old/test_pipeline_mapping.py b/tests_old/test_pipeline_mapping.py deleted file mode 100644 index 5950a57..0000000 --- a/tests_old/test_pipeline_mapping.py +++ /dev/null @@ -1,58 +0,0 @@ -""" -Test Pipeline mapping functionality. - -This module contains tests for Pipeline mapping operations -including basic mapping, type transformations, and chaining. -""" - -from efemel.pipeline import Pipeline - - -class TestPipelineMapping: - """Test Pipeline mapping functionality.""" - - def test_map_basic(self): - """Test basic mapping operations.""" - # Double each number - pipeline1 = Pipeline().map(lambda x: x * 2) - assert pipeline1.to_list([1, 2, 3, 4, 5]) == [2, 4, 6, 8, 10] - - # Square each number - use fresh pipeline - pipeline2 = Pipeline().map(lambda x: x**2) - assert pipeline2.to_list([1, 2, 3, 4, 5]) == [1, 4, 9, 16, 25] - - def test_map_type_transformation(self): - """Test mapping with type transformation.""" - # Convert numbers to strings - pipeline1 = Pipeline().map(str) - assert pipeline1.to_list([1, 2, 3, 4, 5]) == ["1", "2", "3", "4", "5"] - - # Convert to boolean (non-zero is True) - use fresh pipeline - pipeline2 = Pipeline().map(bool) - assert pipeline2.to_list([1, 2, 3, 4, 5]) == [True, True, True, True, True] - - def test_map_with_strings(self): - """Test mapping with string data.""" - # Convert to uppercase - pipeline1 = Pipeline().map(str.upper) - assert pipeline1.to_list(["hello", "world", "python"]) == ["HELLO", "WORLD", "PYTHON"] - - # Get string lengths - use fresh pipeline - pipeline2 = Pipeline().map(len) - assert pipeline2.to_list(["hello", "world", "python"]) == [5, 5, 6] - - def test_map_chaining(self): - """Test chaining multiple map operations.""" - pipeline = Pipeline() - - # Chain maps: double, then add 1 - result = pipeline.map(lambda x: x * 2).map(lambda x: x + 1) - assert result.to_list([1, 2, 3, 4, 5]) == [3, 5, 7, 9, 11] - - def test_map_empty_pipeline(self): - """Test mapping on empty pipeline.""" - pipeline = Pipeline() - - # Map should return empty result - result = pipeline.map(lambda x: x * 2) - assert result.to_list([]) == [] diff --git a/tests_old/test_pipeline_reduce.py b/tests_old/test_pipeline_reduce.py deleted file mode 100644 index 0d2a579..0000000 --- a/tests_old/test_pipeline_reduce.py +++ /dev/null @@ -1,54 +0,0 @@ -""" -Test Pipeline reduce functionality. 
- -This module contains tests for Pipeline reduce operations -including basic reduction, string operations, and edge cases. -""" - -from efemel.pipeline import Pipeline - - -class TestPipelineReduce: - """Test Pipeline reduce functionality.""" - - def test_reduce_basic(self): - """Test basic reduce operations.""" - # Sum all numbers - pipeline1 = Pipeline() - sum_result = pipeline1.reduce([1, 2, 3, 4, 5], function=lambda acc, x: acc + x, initial=0) - assert sum_result == 15 - - # Multiply all numbers - use fresh pipeline - pipeline2 = Pipeline() - product_result = pipeline2.reduce([1, 2, 3, 4, 5], function=lambda acc, x: acc * x, initial=1) - assert product_result == 120 - - def test_reduce_with_strings(self): - """Test reduce with string data.""" - # Concatenate strings - pipeline1 = Pipeline() - concat_result = pipeline1.reduce(["hello", "world", "python"], function=lambda acc, x: acc + " " + x, initial="") - assert concat_result == " hello world python" - - # Join with commas - use fresh pipeline - pipeline2 = Pipeline() - join_result = pipeline2.reduce( - ["hello", "world", "python"], function=lambda acc, x: acc + "," + x if acc else x, initial="" - ) - assert join_result == "hello,world,python" - - def test_reduce_empty_pipeline(self): - """Test reduce on empty pipeline.""" - pipeline = Pipeline() - - # Reduce should return initial value - result = pipeline.reduce([], function=lambda acc, x: acc + x, initial=10) - assert result == 10 - - def test_reduce_single_item(self): - """Test reduce with single item.""" - pipeline = Pipeline() - - # Should combine initial value with single item - result = pipeline.reduce([42], function=lambda acc, x: acc + x, initial=10) - assert result == 52 diff --git a/tests_old/test_pipeline_tap.py b/tests_old/test_pipeline_tap.py deleted file mode 100644 index f22661b..0000000 --- a/tests_old/test_pipeline_tap.py +++ /dev/null @@ -1,55 +0,0 @@ -""" -Test Pipeline tap functionality. - -This module contains tests for Pipeline tap operations -including side effects, chaining, and data integrity. 
-""" - -from efemel.pipeline import Pipeline - - -class TestPipelineTap: - """Test Pipeline tap functionality.""" - - def test_tap_basic(self): - """Test basic tap operations.""" - pipeline = Pipeline() - side_effects = [] - - # Tap to collect side effects - result = pipeline.tap(side_effects.append) - result_list = result.to_list([1, 2, 3, 4, 5]) - - assert result_list == [1, 2, 3, 4, 5] - assert side_effects == [1, 2, 3, 4, 5] - - def test_tap_with_print(self): - """Test tap with print (no assertion needed, just verify it works).""" - pipeline = Pipeline() - - # This should not raise any exceptions - result = pipeline.tap(lambda x: None) # Mock print - assert result.to_list([1, 2, 3]) == [1, 2, 3] - - def test_tap_chaining(self): - """Test tap in a chain of operations.""" - pipeline = Pipeline() - side_effects = [] - - # Tap in middle of chain - result = pipeline.map(lambda x: x * 2).tap(side_effects.append).filter(lambda x: x > 5) - - result_list = result.to_list([1, 2, 3, 4, 5]) - - assert result_list == [6, 8, 10] - assert side_effects == [2, 4, 6, 8, 10] - - def test_tap_doesnt_modify_pipeline(self): - """Test that tap doesn't modify the pipeline data.""" - pipeline = Pipeline() - - # Tap with function that would modify if it could - result = pipeline.tap(lambda x: x * 1000) - - # Data should be unchanged - assert result.to_list([1, 2, 3, 4, 5]) == [1, 2, 3, 4, 5] diff --git a/tests_old/test_pipeline_utility.py b/tests_old/test_pipeline_utility.py deleted file mode 100644 index f5839b8..0000000 --- a/tests_old/test_pipeline_utility.py +++ /dev/null @@ -1,88 +0,0 @@ -""" -Test Pipeline utility methods. - -This module contains tests for Pipeline utility operations -including apply, flatten, and chunked processing. -""" - -from efemel.pipeline import Pipeline - - -class TestPipelineUtility: - """Test Pipeline utility methods.""" - - def test_apply_function(self): - """Test apply with pipeline function.""" - pipeline = Pipeline() - - # Create another pipeline to apply - double_pipeline = Pipeline().map(lambda x: x * 2) - - result = pipeline.apply(double_pipeline) - assert result.to_list([1, 2, 3, 4, 5]) == [2, 4, 6, 8, 10] - - def test_flatten_basic(self): - """Test basic flatten operation.""" - pipeline = Pipeline() - - result = pipeline.flatten() - assert result.to_list([[1, 2], [3, 4], [5]]) == [1, 2, 3, 4, 5] - - def test_flatten_empty_lists(self): - """Test flatten with empty lists.""" - pipeline = Pipeline() - - result = pipeline.flatten() - assert result.to_list([[], [1, 2], [], [3]]) == [1, 2, 3] - - def test_flatten_nested_tuples(self): - """Test flatten with nested tuples.""" - pipeline = Pipeline() - - result = pipeline.flatten() - assert result.to_list([(1, 2), (3, 4, 5), (6,)]) == [1, 2, 3, 4, 5, 6] - - def test_flatten_chunked_processing(self): - """Test that flatten works with iterators and doesn't consume entire pipeline at once.""" - - # Create a generator that yields nested data - def generate_nested_data(): - for i in range(10): # Smaller range for easier testing - yield [i * 2, i * 2 + 1] - - pipeline = Pipeline() - result = pipeline.flatten() - - # Test the flattened result - actual = result.to_list(generate_nested_data()) - expected = [] - for i in range(10): - expected.extend([i * 2, i * 2 + 1]) - - assert actual == expected - - def test_flatten_with_chunk_size(self): - """Test that flatten works correctly with different chunk sizes.""" - # Create pipeline with custom chunk size - pipeline = Pipeline(chunk_size=3) - - # Test data with varying sizes of sublists - 
data = [[1, 2], [3, 4, 5], [6], [7, 8, 9, 10]] - - result = pipeline.flatten() - assert result.to_list(data) == [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] - - def test_flatten_preserves_chunk_size(self): - """Test that flatten preserves the original chunk_size setting.""" - chunk_size = 5 - pipeline = Pipeline(chunk_size=chunk_size) - - flattened = pipeline.flatten() - - # Verify chunk_size is preserved - assert flattened.chunk_size == chunk_size - - # Test with actual data - data = [[1, 2], [3, 4], [5, 6], [7, 8]] - result = flattened.to_list(data) - assert result == [1, 2, 3, 4, 5, 6, 7, 8] From 0952109b101ad6caa2a1291a1df5f9156f98266e Mon Sep 17 00:00:00 2001 From: ringoldsdev Date: Tue, 15 Jul 2025 07:06:19 +0000 Subject: [PATCH 17/17] chore: remove test code --- efemel/pipeline/pipeline.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/efemel/pipeline/pipeline.py b/efemel/pipeline/pipeline.py index e5e1e2f..37cb288 100644 --- a/efemel/pipeline/pipeline.py +++ b/efemel/pipeline/pipeline.py @@ -80,13 +80,3 @@ def consume(self) -> None: """Consumes the pipeline without returning results.""" for _ in self.processed_data: pass - - -t = Transformer.init(int).map(lambda x: x * 2).reduce(lambda acc, x: acc + x, initial=0) -# t = 1 - -# Example using the original apply method with a full transformer -Pipeline([1, 2, 3, 4]).apply(t).each(lambda x: print(x)) - -# Example using the new shorthand transform method -Pipeline([1, 2, 3, 4]).transform(lambda x: x.map(lambda y: y * 2)).each(lambda x: print(x))
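
For reference, the module-level scratch usage removed in the patch above could live in the test suite instead. The following is a minimal pytest-style sketch, not part of the patch: it assumes the Transformer/Pipeline API behaves as the removed demo code implies, and the Transformer import path plus the single-value output of the reducing transformer are assumptions, not confirmed by this series.

# Hypothetical test version of the demo code removed in PATCH 17/17.
from efemel.pipeline import Pipeline
from efemel.pipeline.transformer import Transformer  # assumed module path


def test_apply_full_transformer():
    # Double each value, then fold the stream into a running sum (initial=0),
    # mirroring the transformer built in the removed scratch code.
    t = Transformer.init(int).map(lambda x: x * 2).reduce(lambda acc, x: acc + x, initial=0)
    collected = []
    Pipeline([1, 2, 3, 4]).apply(t).each(collected.append)
    # Assumed: the reducing transformer emits the single folded value (2 + 4 + 6 + 8).
    assert collected == [20]


def test_transform_shorthand():
    # The shorthand receives the transformer and returns a modified one.
    collected = []
    Pipeline([1, 2, 3, 4]).transform(lambda p: p.map(lambda y: y * 2)).each(collected.append)
    assert collected == [2, 4, 6, 8]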