diff --git a/sigllm/pipelines/detector/multivariate_mistral_detector_jsonformat.json b/sigllm/pipelines/detector/multivariate_mistral_detector_jsonformat.json new file mode 100644 index 0000000..4a98fa2 --- /dev/null +++ b/sigllm/pipelines/detector/multivariate_mistral_detector_jsonformat.json @@ -0,0 +1,116 @@ +{ + "primitives": [ + "mlstars.custom.timeseries_preprocessing.time_segments_aggregate", + "sklearn.impute.SimpleImputer", + "sigllm.primitives.transformation.Float2Scalar", + "mlstars.custom.timeseries_preprocessing.rolling_window_sequences", + "sigllm.primitives.formatting.json_format.format_as_string", + "sigllm.primitives.forecasting.huggingface.HF", + "sigllm.primitives.formatting.json_format.format_as_integer", + "sigllm.primitives.transformation.Scalar2Float", + "sigllm.primitives.transformation.Scalar2Float", + "sigllm.primitives.postprocessing.aggregate_rolling_window", + "numpy.reshape", + "orion.primitives.timeseries_errors.regression_errors", + "orion.primitives.timeseries_anomalies.find_anomalies" + ], + "init_params": { + "mlstars.custom.timeseries_preprocessing.time_segments_aggregate#1": { + "time_column": "timestamp", + "interval": 21600, + "method": "mean" + }, + "sigllm.primitives.transformation.Float2Scalar#1": { + "decimal": 2, + "rescale": true + }, + "mlstars.custom.timeseries_preprocessing.rolling_window_sequences#1": { + "target_column": 0, + "window_size": 140, + "target_size": 1, + "step_size": 1 + }, + "sigllm.primitives.forecasting.huggingface.HF#1": { + "name": "mistralai/Mistral-7B-Instruct-v0.2", + "steps": 5, + "multivariate_allowed_symbols": [ + "d", + ":", + "," + ] + }, + "sigllm.primitives.formatting.json_format.format_as_integer#1": { + "trunc": 1, + "target_column": 0 + }, + "sigllm.primitives.postprocessing.aggregate_rolling_window#1": { + "agg": "median" + }, + "orion.primitives.timeseries_anomalies.find_anomalies#1": { + "window_size_portion": 0.3, + "window_step_size_portion": 0.1, + "fixed_threshold": true + } + }, + "input_names": { + "sigllm.primitives.transformation.Float2Scalar#1": { + "X": "y" + }, + "mlstars.custom.timeseries_preprocessing.rolling_window_sequences#1": { + "X": "y_scaled" + }, + "sigllm.primitives.formatting.json_format.format_as_integer#1": { + "X": "y_hat" + }, + "sigllm.primitives.transformation.Scalar2Float#1": { + "X": "y_hat", + "minimum": "minimum", + "decimal": "decimal" + }, + "sigllm.primitives.transformation.Scalar2Float#2": { + "X": "y", + "minimum": "minimum", + "decimal": "decimal" + }, + "sigllm.primitives.postprocessing.aggregate_rolling_window#1": { + "y": "y_hat" + }, + "numpy.reshape#1": { + "X": "y_hat" + }, + "orion.primitives.timeseries_anomalies.find_anomalies#1": { + "index": "target_index" + } + }, + "output_names": { + "sklearn.impute.SimpleImputer#1": { + "X": "y" + }, + "sigllm.primitives.transformation.Float2Scalar#1": { + "X": "y_scaled", + "minimum": "minimum", + "decimal": "decimal" + }, + "sigllm.primitives.forecasting.huggingface.HF#1": { + "y": "y_hat" + }, + "sigllm.primitives.formatting.json_format.format_as_integer#1": { + "X": "y_hat" + }, + "sigllm.primitives.transformation.Scalar2Float#1": { + "X": "y_hat" + }, + "sigllm.primitives.transformation.Scalar2Float#2": { + "X": "y" + }, + "sigllm.primitives.postprocessing.aggregate_rolling_window#1": { + "y": "y_hat" + }, + "numpy.reshape#1": { + "X": "y_hat" + }, + "orion.primitives.timeseries_anomalies.find_anomalies#1": { + "y": "anomalies" + } + } +} \ No newline at end of file diff --git a/sigllm/primitives/forecasting/huggingface.py b/sigllm/primitives/forecasting/huggingface.py index 024e62c..021506f 100644 --- a/sigllm/primitives/forecasting/huggingface.py +++ b/sigllm/primitives/forecasting/huggingface.py @@ -14,6 +14,7 @@ DEFAULT_PAD_TOKEN = '' VALID_NUMBERS = list('0123456789') +VALID_MULTIVARIATE_SYMBOLS = [] DEFAULT_MODEL = 'mistralai/Mistral-7B-Instruct-v0.2' @@ -41,6 +42,9 @@ class HF: padding (int): Additional padding token to forecast to reduce short horizon predictions. Default to `0`. + multivariate_allowed_symbols (list): + List of token strings to allow in addition to digits when generating. + Default to `[]`. """ def __init__( @@ -53,6 +57,7 @@ def __init__( raw=False, samples=1, padding=0, + multivariate_allowed_symbols=[], ): self.name = name self.sep = sep @@ -62,6 +67,7 @@ def __init__( self.raw = raw self.samples = samples self.padding = padding + self.multivariate_allowed_symbols = multivariate_allowed_symbols self.tokenizer = AutoTokenizer.from_pretrained(self.name, use_fast=False) @@ -85,6 +91,9 @@ def __init__( token = self.tokenizer.convert_tokens_to_ids(number) valid_tokens.append(token) + for symbol in self.multivariate_allowed_symbols: + valid_tokens.append(self.tokenizer.convert_tokens_to_ids(symbol)) + valid_tokens.append(self.tokenizer.convert_tokens_to_ids(self.sep)) self.invalid_tokens = [ [i] for i in range(len(self.tokenizer) - 1) if i not in valid_tokens @@ -116,7 +125,7 @@ def forecast(self, X, **kwargs): tokenized_input = self.tokenizer([text], return_tensors='pt').to('cuda') input_length = tokenized_input['input_ids'].shape[1] - average_length = input_length / len(text.split(',')) + average_length = input_length / len(text.split(self.sep)) max_tokens = (average_length + self.padding) * self.steps generate_ids = self.model.generate( diff --git a/sigllm/primitives/formatting/__init__.py b/sigllm/primitives/formatting/__init__.py new file mode 100644 index 0000000..f0e677c --- /dev/null +++ b/sigllm/primitives/formatting/__init__.py @@ -0,0 +1,19 @@ +"""Multivariate formatting methods for time series data.""" + +from sigllm.primitives.formatting.multivariate_formatting import MultivariateFormattingMethod +from sigllm.primitives.formatting.json_format import JSONFormat +from sigllm.primitives.formatting.univariate_control import UnivariateControl +from sigllm.primitives.formatting.persistence_control import PersistenceControl +from sigllm.primitives.formatting.value_concatenation import ValueConcatenation +from sigllm.primitives.formatting.value_interleave import ValueInterleave +from sigllm.primitives.formatting.digit_interleave import DigitInterleave + +__all__ = [ + 'MultivariateFormattingMethod', + 'JSONFormat', + 'UnivariateControl', + 'PersistenceControl', + 'ValueConcatenation', + 'ValueInterleave', + 'DigitInterleave', +] diff --git a/sigllm/primitives/formatting/digit_interleave.py b/sigllm/primitives/formatting/digit_interleave.py new file mode 100644 index 0000000..937c817 --- /dev/null +++ b/sigllm/primitives/formatting/digit_interleave.py @@ -0,0 +1,103 @@ +import numpy as np + +from sigllm.primitives.formatting.multivariate_formatting import MultivariateFormattingMethod + + +class DigitInterleave(MultivariateFormattingMethod): + """Formatting method that interleaves digits from multiple values.""" + + def __init__(self, verbose: bool = False, **kwargs): + super().__init__('digit_interleave', verbose=verbose, **kwargs) + + def format_as_string( + self, X: np.ndarray, digits_per_timestamp=3, separator=',', **kwargs + ) -> str: + """Format array as string with interleaved digits.""" + max_digits = max(len(str(abs(int(v)))) for window in X for ts in window for v in ts) + width_used = max(digits_per_timestamp, max_digits) + self.metadata['width_used'] = width_used + + def interleave_digits(timestamp): + str_values = [str(int(val)) for val in timestamp] + padded_values = [s.zfill(width_used) for s in str_values] + result_str = '' + for digit_pos in range(width_used): + for padded_val in padded_values: + result_str += padded_val[digit_pos] + + return result_str + + result = [ + separator.join(interleave_digits(timestamp) for timestamp in window) + separator + for window in X + ] + return result + + def format_as_integer( + self, + X: list[str], + separator=',', + trunc=None, + digits_per_timestamp=3, + target_column=None, + **kwargs, + ) -> np.ndarray: + """Parse interleaved digit strings back to integer arrays for the target column. + + Args: + X (list[str]): + list of strings, each string is a concatenation of + interleaved digit values separated by separator. + separator (str): + separator between values + trunc (int): + Number of timestamps to extract from each sample. + If None, all timestamps are extracted. + digits_per_timestamp (int): + Number of digits to extract from each timestamp. + target_column (int): + Which column to extract (default 0). Can also be set via config. + + Returns: + np.ndarray: + Array that holds int values for the target column + for each sample in each window. + """ + width_used = self.metadata['width_used'] + if target_column is None: + target_column = self.config.get('target_column', 0) + + def deinterleave_timestamp_target_column(interleaved_str): + """Convert interleaved digits back to original values and extract target dimension.""" + total_digits = len(interleaved_str) + num_values = total_digits // width_used + + if target_column >= num_values: + return np.array([None]) + + value_digits = [] + for digit_pos in range(width_used): + pos = digit_pos * num_values + target_column + if pos < total_digits: + value_digits.append(interleaved_str[pos]) + + if value_digits: + return np.array([int(''.join(value_digits))]) + return np.array([None]) + + result = np.array( + [ + [ + deinterleave_timestamp_target_column(timestamp) + for sample in entry + for timestamp in sample + .lstrip(separator) + .rstrip(separator) + .split(separator)[:trunc] + if timestamp.strip() + ] + for entry in X + ], + dtype=object, + ) + return result diff --git a/sigllm/primitives/formatting/json_format.py b/sigllm/primitives/formatting/json_format.py new file mode 100644 index 0000000..d0df322 --- /dev/null +++ b/sigllm/primitives/formatting/json_format.py @@ -0,0 +1,130 @@ +import re +from collections import defaultdict + +import numpy as np + +from sigllm.primitives.formatting.multivariate_formatting import MultivariateFormattingMethod + + +class JSONFormat(MultivariateFormattingMethod): + """Formatting method that uses JSON-like format with dimension prefixes.""" + + def __init__(self, verbose: bool = False, **kwargs): + super().__init__('json_format', verbose=verbose, **kwargs) + + def format_as_string(self, X: np.ndarray, separator=',', **kwargs) -> str: + """Format array as string with dimension prefixes.""" + + def window_to_json(X): + rows = [] + for row in X: + parts = [f'd{i}:{val}' for i, val in enumerate(row)] + rows.append(','.join(parts)) + return ','.join(rows) + + out = [window_to_json(window) for window in X] + return out + + def format_as_integer(self, X, trunc=None, steps_ahead=None, target_column=None, **kwargs): + """Parse model output and extract values for the target column for specified steps ahead. + + Args: + X (str): + Model output containing tokens like "d0:1,d1:2,d0:3,d1:4..." + trunc (int, optional): + Legacy parameter for truncation (used when steps_ahead is None) + steps_ahead (list): + List of step indices to extract (e.g., [1,3,5,10]) + If None, trunc is used to determine the number of values to extract. + target_column (int): + Which dimension to extract (default 0). Can also be set via config. + + Returns: + If steps_ahead is None: + np.array of shape (batch, samples) with truncated flat values + If steps_ahead is provided: + dict mapping step -> np.array of target_column values at that step + """ + if trunc is None: + trunc = self.config.get('trunc') + if steps_ahead is None and 'steps_ahead' in self.config: + steps_ahead = self.config.get('steps_ahead') + if target_column is None: + target_column = self.config.get('target_column', 0) + + if steps_ahead is None: + return self._format_as_integer_legacy(X, trunc, target_column) + + results_by_step = defaultdict(list) + + for window in X: + step_samples = defaultdict(list) + for sample in window: + dim_values = self._extract_dim_values(sample, target_column) + for step in steps_ahead: + idx = step - 1 + if idx < len(dim_values): + step_samples[step].append(dim_values[idx]) + else: + step_samples[step].append(None) + for step in steps_ahead: + results_by_step[step].append(step_samples[step]) + + for step in steps_ahead: + results_by_step[step] = np.array(results_by_step[step], dtype=object) + + return results_by_step + + def _format_as_integer_legacy(self, X, trunc=None, target_column=0): + """Extract values for the target dimension from parsed output. + + Args: + X (str): + Model output containing tokens like "d0:1,d1:2,d0:3,d1:4..." + trunc (int, optional): + If None, return all values in a 2D array (num_windows, num_samples) where + each cell is a list of values for that sample. + If int, return 3D array (num_windows, num_samples, trunc) taking the first + trunc values for each sample. None-padded if trunc is larger + than the number of values. + target_column (int): + Which dimension to extract (default 0). + + Returns: + np.array of shape (num_windows, num_samples, num_values) + or (num_windows, num_samples, trunc) that hold values + for the target column for each sample in each window. + """ + if trunc is None: + batch_rows = [] + for window in X: + samples = [] + for sample in window: + samples.append(self._extract_dim_values(sample, target_column)) + batch_rows.append(samples) + return np.array(batch_rows, dtype=object) + + num_windows = len(X) + num_samples = len(X[0]) if num_windows > 0 else 0 + result = np.full((num_windows, num_samples, trunc), fill_value=None) + + for i, window in enumerate(X): + for j, sample in enumerate(window): + dim_values = self._extract_dim_values(sample, target_column) + for k in range(min(trunc, len(dim_values))): + result[i, j, k] = dim_values[k] + + return result + + def _extract_dim_values(self, sample, dim): + """Helper function to extract values for a given column from a sample string in order. + + For "d0:1,d1:2,d0:3,d1:4" with dim=0, returns [1, 3]. + For "d0:1,d1:2,d0:3,d1:4" with dim=1, returns [2, 4]. + """ + tokens = re.findall(r'd(\d+):(\d+)', sample) + dim_values = [] + for dim_str, val_str in tokens: + if dim_str == str(dim): + dim_values.append(int(val_str)) + return dim_values diff --git a/sigllm/primitives/formatting/multivariate_formatting.py b/sigllm/primitives/formatting/multivariate_formatting.py new file mode 100644 index 0000000..87710e2 --- /dev/null +++ b/sigllm/primitives/formatting/multivariate_formatting.py @@ -0,0 +1,35 @@ +import numpy as np +import pandas as pd + + +class MultivariateFormattingMethod: + """Base class for multivariate formatting methods. + + Subclasses implement format_as_string and format_as_integer to convert + between numpy arrays and string representations for LLM input/output. + + The target_column parameter (default 0) can be passed to subclass methods + or set via config to specify which dimension to extract. This should + match the target_column parameter used in rolling_window_sequences. + """ + + def __init__(self, method_name: str, verbose: bool = False, **kwargs): + self.method_name = method_name + self.config = kwargs + self.metadata = {} + self.verbose = verbose + + def format_as_string(self, X: np.ndarray, **kwargs) -> str: + """Format array as string representation.""" + raise NotImplementedError() + + def format_as_integer(self, X: str, **kwargs) -> np.ndarray: + """Parse string representation back to integer array.""" + raise NotImplementedError() + + def normalize_data(self, df: pd.DataFrame) -> pd.DataFrame: + """Normalize data by subtracting mean and dividing by std.""" + ts = df[['timestamp']] + vals = df.drop(columns=['timestamp']) + normed = (vals - vals.mean(axis=0)) / vals.std(axis=0) + return pd.concat([ts, normed], axis=1)[df.columns] diff --git a/sigllm/primitives/formatting/persistence_control.py b/sigllm/primitives/formatting/persistence_control.py new file mode 100644 index 0000000..88ddf9f --- /dev/null +++ b/sigllm/primitives/formatting/persistence_control.py @@ -0,0 +1,55 @@ +import numpy as np + +from sigllm.primitives.formatting.multivariate_formatting import MultivariateFormattingMethod + + +class PersistenceControl(MultivariateFormattingMethod): + """Formatting method using persistence control strategy.""" + + def __init__(self, verbose: bool = False, **kwargs): + super().__init__('persistence_control', verbose=verbose, **kwargs) + + def format_as_string(self, X: np.ndarray, separator=',', target_column=None, **kwargs) -> str: + """Format array as string with persistence control. + + Args: + X (np.ndarray): + Input array with shape (num_windows, num_timestamps, num_dims). + separator (str): + Separator between values. + target_column (int): + Which dimension to encode (default 0). Can also be set via config. + + Returns: + list[str]: + List of strings, one per window, containing only + the target dimension values. + """ + if target_column is None: + target_column = self.config.get('target_column', 0) + result = [] + for row in X[:, :, target_column]: + result.append(separator.join(map(str, row.flatten()))) + return result + + def format_as_integer( + self, X: list[str], separator=',', target_column=None, **kwargs + ) -> np.ndarray: + """Parse string representation back to integer array (last value only). + + Args: + X (list[str]): + List of strings to parse. + separator (str): + Separator between values. + target_column (int): + Accepted for API consistency (default 0). The string already contains + only the target dimension, so this parameter has no effect on parsing. + + Returns: + np.ndarray that holds the last int value for each window. + """ + result = [ + [[int(entry.lstrip(separator).rstrip(separator).split(separator)[-1])]] for entry in X + ] + return np.array(result, dtype=int) diff --git a/sigllm/primitives/formatting/univariate_control.py b/sigllm/primitives/formatting/univariate_control.py new file mode 100644 index 0000000..2bc785c --- /dev/null +++ b/sigllm/primitives/formatting/univariate_control.py @@ -0,0 +1,62 @@ +import numpy as np + +from sigllm.primitives.formatting.multivariate_formatting import MultivariateFormattingMethod + + +class UnivariateControl(MultivariateFormattingMethod): + """Formatting method using univariate control strategy.""" + + def __init__(self, verbose: bool = False, **kwargs): + super().__init__('univariate_control', verbose=verbose, **kwargs) + + def format_as_string(self, X: np.ndarray, separator=',', target_column=None, **kwargs) -> str: + """Format array as string with univariate control. + + Args: + X (np.ndarray): + Input array with shape (num_windows, num_timestamps, num_dims). + separator (str): + Separator between values. + target_column (int): + Which dimension to encode (default 0). Can also be set via config. + + Returns: + list[str]: + List of strings, one per window, containing only + the target dimension values. + """ + if target_column is None: + target_column = self.config.get('target_column', 0) + result = [] + for row in X[:, :, target_column]: + result.append(separator.join(map(str, row.flatten()))) + return result + + def format_as_integer( + self, X: list[str], separator=',', trunc=None, target_column=None, **kwargs + ) -> np.ndarray: + """Parse string representation back to integer array. + + Args: + X (list[str]): + List of strings to parse. + separator (str): + Separator between values. + trunc (int): + Number of values to extract. If None, all values are extracted. + target_column (int): + Accepted for API consistency (default 0). The string already contains + only the target dimension, so this parameter has no effect on parsing. + + Returns: + np.ndarray that holds int values for each sample in each window. + """ + result = [ + [ + np.array([int(x) for x in entry.lstrip(separator).split(separator) if x])[:trunc] + for entry in row + ] + for row in X + ] + out = np.array(result, dtype=object) + return out diff --git a/sigllm/primitives/formatting/utils.py b/sigllm/primitives/formatting/utils.py new file mode 100644 index 0000000..cb22863 --- /dev/null +++ b/sigllm/primitives/formatting/utils.py @@ -0,0 +1,193 @@ +import logging + +import numpy as np +import pandas as pd +from mlblocks import MLPipeline + +logger = logging.getLogger(__name__) + + +def create_test_data(N=25): + """Create test data for formatting validation.""" + x1 = np.linspace(10, 9 + N, N) / 100 + x2 = np.array([i % 2 for i in range(N)]) + x3 = np.linspace(N + 40, 41, N) / 100 + + return pd.DataFrame({ + 'timestamp': np.linspace(0, 3600 * (N - 1), N), + 'x1': x1, + 'x2': x2, + 'x3': x3, + }) + + +def run_pipeline( + method, + data=None, + interval=3600, + window_size=15, + verbose=True, + samples=7, + normalize=False, + temp=0.1, + multivariate_allowed_symbols=None, + pipeline_name='mistral_detector', + stride=1, + n_clusters=2, + strategy='scaling', + steps_ahead=None, +): + """Run the forecasting pipeline. + + Args: + method (subclass of MultivariateFormattingMethod): + The method to run the pipeline for. + data (pd.DataFrame): + The data to run the pipeline on. + interval (int): + The interval between timestamps in the data. + window_size (int): + The context length for each prediction window. + samples (int): + The number of times to run the LLM on each window. + normalize (bool): + Whether to normalize the data before running. + multivariate_allowed_symbols (list): + The allowed symbols for LLMs to output aside from digits. + pipeline_name (str): + The name of the pipeline we are wrapping (choice of + `mistral_detector` and `gpt_detector`). + stride (int): + The gap between consecutive prediction windows. + n_clusters (int): + Not yet supported. Will be used with the `binning` + pre-processing strategy in the future. + strategy (str): + For now, must be `scaling`. We will add option for + `binning` in the future. + steps_ahead (list, optional): + The amount of steps ahead to predict in each window. + + Returns: + The errors, y_hat, and y for the pipeline. + """ + if data is None: + data = create_test_data() + + pipeline = MLPipeline(pipeline_name) + digits_per_timestamp = method.config.get('digits_per_timestamp', 2) + + num_dims = len(data.columns) - 1 + + if steps_ahead is not None: + max_steps = max(steps_ahead) + hf_steps = max_steps * (num_dims + 1) # adding some padding here + else: + hf_steps = 2 + + test_hyperparameters = { + 'mlstars.custom.timeseries_preprocessing.time_segments_aggregate#1': { + 'interval': interval + }, + 'mlstars.custom.timeseries_preprocessing.rolling_window_sequences#1': { + 'target_column': 0, + 'window_size': window_size, + 'target_size': max(steps_ahead) if steps_ahead else 1, + 'step_size': stride, + }, + 'sigllm.primitives.forecasting.huggingface.HF#1': { + 'samples': samples, + 'temp': temp, + 'multivariate_allowed_symbols': ( + [] if multivariate_allowed_symbols is None else multivariate_allowed_symbols + ), + 'steps': hf_steps, + }, + } + + if strategy == 'scaling': + test_hyperparameters['sigllm.primitives.transformation.Float2Scalar#1'] = { + 'decimal': digits_per_timestamp, + 'rescale': True, + } + elif strategy == 'binning': + test_hyperparameters['sigllm.primitives.transformation.Float2Scalar#1'] = { + 'strategy': 'binning', + 'n_clusters': n_clusters, + } + raise ValueError('Note that binning is not supported for now.') + else: + raise ValueError( + f'Invalid strategy: {strategy}. Note that only scaling is supported for now.' + ) + + pipeline.set_hyperparameters(test_hyperparameters) + if normalize: + data = method.normalize_data(data) + context = pipeline.fit(data, start_=0, output_=3) + context['X'] = method.format_as_string(context['X'], **method.config) + + if method.method_name == 'persistence_control': + context['y_hat'] = context['X'] + else: + context = pipeline.fit(**context, start_=5, output_=5) + + if verbose: + logger.info('y_hat example: %s', context['y_hat'][0][0]) + + if steps_ahead is not None: + return _process_multi_step_results(method, context, pipeline, steps_ahead, verbose) + + context['y_hat'] = method.format_as_integer(context['y_hat'], trunc=1) + if verbose: + logger.info('y_hat example: %s', context['y_hat'][0][0]) + + context = pipeline.fit(**context, start_=7, output_=10) + errors = np.round(context['errors'], 7) + if verbose: + logger.info('y_hat: %s', context['y_hat']) + logger.info('y: %s', context['y']) + logger.info('errors: %s', errors) + + return errors, context['y_hat'], context['y'] + + +def _process_multi_step_results(method, context, pipeline, steps_ahead, verbose): + """Process results for multi-step-ahead prediction. + + For multi-step predictions with stride > 1, we skip aggregate_rolling_window + since there's no overlap between predictions. Each window gives one prediction + per step, indexed sequentially (0, 1, 2, ...) regardless of actual stride. + + Returns: + dict {step : {'errors': [...], 'y_hat': [...], 'y': [...]}} + """ + y_hat_by_step = method.format_as_integer(context['y_hat'], steps_ahead=steps_ahead) + + results = {} + + for step in steps_ahead: + step_context = context.copy() + + y_hat_step = y_hat_by_step[step] + y_hat_float = np.array( + [[v if v is not None else np.nan for v in row] for row in y_hat_step], dtype=float + ) + step_context['y_hat'] = np.expand_dims(y_hat_float, axis=-1) + step_context = pipeline.fit(**step_context, start_=7, output_=7) + y_hat_agg = np.nanmedian(step_context['y_hat'], axis=1).squeeze() + y_for_step = context['y'][:, step - 1] if context['y'].ndim > 1 else context['y'] + errors = np.round(y_hat_agg - y_for_step, 7) + + if verbose: + logger.info( + 'Step %s - y_hat shape: %s, errors shape: %s', step, y_hat_agg.shape, errors.shape + ) + + results[step] = { + 'errors': errors, + 'y_hat': y_hat_agg, + 'y': y_for_step, + } + + return results diff --git a/sigllm/primitives/formatting/value_concatenation.py b/sigllm/primitives/formatting/value_concatenation.py new file mode 100644 index 0000000..d020975 --- /dev/null +++ b/sigllm/primitives/formatting/value_concatenation.py @@ -0,0 +1,63 @@ +import numpy as np + +from sigllm.primitives.formatting.multivariate_formatting import MultivariateFormattingMethod + + +class ValueConcatenation(MultivariateFormattingMethod): + """Formatting method that concatenates values directly.""" + + def __init__(self, verbose: bool = False, **kwargs): + super().__init__('value_concatenation', verbose=verbose, **kwargs) + + def format_as_string(self, X: np.ndarray, separator=',', **kwargs) -> str: + """Format array as string with concatenated values.""" + result = [] + for row in X: + result.append(separator.join(map(str, row.flatten()))) + return result + + def format_as_integer( + self, X: list[str], separator=',', trunc=None, num_dims=None, target_column=None, **kwargs + ) -> np.ndarray: + """Extract values for the target dimension from each sample in each window as ints. + + Args: + X (list[str]): + list of strings, each string is a concatenation of num_dims values + separated by separator. + separator (str): + separator between values + trunc (int): + Number of values to extract from each sample. If None, all values are extracted. + num_dims (int): + Number of dimensions (mandatory if num_dims is not provided in config) + target_column (int): + Which dimension to extract (default 0). Can also be set via config. + + Returns: + np.ndarray: + Array that holds int values for the target column + for each sample in each window. + """ + num_dims = num_dims or self.config.get('num_dims') + if num_dims is None: + raise ValueError( + 'Cannot parse concatenated values ' + 'without knowing the number of dimensions ' + 'or target column.' + ) + + if target_column is None: + target_column = self.config.get('target_column', 0) + + result = [ + [ + np.array([int(x) for x in entry.lstrip(separator).split(separator) if x])[ + target_column::num_dims + ][:trunc] + for entry in row + ] + for row in X + ] + out = np.array(result, dtype=object) + return out diff --git a/sigllm/primitives/formatting/value_interleave.py b/sigllm/primitives/formatting/value_interleave.py new file mode 100644 index 0000000..162b3c8 --- /dev/null +++ b/sigllm/primitives/formatting/value_interleave.py @@ -0,0 +1,80 @@ +import numpy as np + +from sigllm.primitives.formatting.multivariate_formatting import MultivariateFormattingMethod + + +class ValueInterleave(MultivariateFormattingMethod): + """Formatting method that interleaves values from multiple dimensions.""" + + def __init__(self, verbose: bool = False, **kwargs): + super().__init__('value_interleave', verbose=verbose, **kwargs) + + def format_as_string( + self, X: np.ndarray, digits_per_timestamp=3, separator=',', **kwargs + ) -> str: + """Format array as string with interleaved values.""" + max_digits = max(len(str(abs(int(v)))) for window in X for ts in window for v in ts) + width_used = max(digits_per_timestamp, max_digits) + self.metadata['width_used'] = width_used + result = [ + separator.join( + ''.join(str(int(val)).zfill(width_used)[:width_used] for val in timestamp) + for timestamp in window + ) + + separator + for window in X + ] + return result + + def format_as_integer( + self, + X: list[str], + separator=',', + trunc=None, + digits_per_timestamp=3, + target_column=None, + **kwargs, + ) -> np.ndarray: + """Parse interleaved value strings back to integer arrays for the target dimension. + + Args: + X (list[str]): + list of strings, each string is a concatenation of + num_dims values separated by separator. + separator (str): + separator between values + trunc (int): + Number of values to extract from each sample. If None, all values are extracted. + digits_per_timestamp (int): + Number of digits to extract from each timestamp. + target_column (int): + Which dimension to extract (default 0). Can also be set via config. + + Returns: + np.ndarray: + Array that holds int values for the target column + for each sample in each window. + """ + width_used = self.metadata['width_used'] + if target_column is None: + target_column = self.config.get('target_column', 0) + + def parse_target_column_from_timestamp(timestamp): + arr = [ + int(timestamp[i : i + width_used]) for i in range(0, len(timestamp), width_used) + ] + if target_column < len(arr): + return arr[target_column] + return None + + result = [] + for entry in X: + row = [] + for sample in entry: + parts = sample.lstrip(separator).rstrip(separator).split(separator) + vals = np.array([parse_target_column_from_timestamp(ts) for ts in parts if ts]) + if trunc is not None: + vals = vals[:trunc] + row.append(vals) + result.append(row) + return np.array(result, dtype=object) diff --git a/sigllm/primitives/jsons/sigllm.primitives.forecasting.huggingface.HF.json b/sigllm/primitives/jsons/sigllm.primitives.forecasting.huggingface.HF.json index 50d762b..f0b70c3 100644 --- a/sigllm/primitives/jsons/sigllm.primitives.forecasting.huggingface.HF.json +++ b/sigllm/primitives/jsons/sigllm.primitives.forecasting.huggingface.HF.json @@ -63,6 +63,10 @@ "padding": { "type": "int", "default": 0 + }, + "multivariate_allowed_symbols": { + "type": "list", + "default": [] } } } diff --git a/sigllm/primitives/jsons/sigllm.primitives.forecasting.long_short_term_context.format_as_string_with_LST_prompt.json b/sigllm/primitives/jsons/sigllm.primitives.forecasting.long_short_term_context.format_as_string_with_LST_prompt.json new file mode 100644 index 0000000..66fd4d6 --- /dev/null +++ b/sigllm/primitives/jsons/sigllm.primitives.forecasting.long_short_term_context.format_as_string_with_LST_prompt.json @@ -0,0 +1,47 @@ +{ + "name": "sigllm.primitives.forecasting.long_short_term_context.format_as_string_with_LST_prompt", + "contributors": [ + "Allen Baranov " + ], + "description": "Format X to string(s) with LST prompt. Formats long-term and short-term context separately and combines them with a prompt.", + "classifiers": { + "type": "preprocessor", + "subtype": "transformer" + }, + "modalities": [], + "primitive": "sigllm.primitives.forecasting.long_short_term_context.format_as_string_with_LST_prompt", + "produce": { + "args": [ + { + "name": "X", + "type": "ndarray" + } + ], + "output": [ + { + "name": "X", + "type": "ndarray" + } + ] + }, + "hyperparameters": { + "fixed": { + "L": { + "type": "int", + "default": 5 + }, + "sep": { + "type": "string", + "default": "," + }, + "space": { + "type": "bool", + "default": false + }, + "single": { + "type": "bool", + "default": false + } + } + } +} diff --git a/sigllm/primitives/jsons/sigllm.primitives.forecasting.long_short_term_context.long_short_term_context.json b/sigllm/primitives/jsons/sigllm.primitives.forecasting.long_short_term_context.long_short_term_context.json new file mode 100644 index 0000000..66e3731 --- /dev/null +++ b/sigllm/primitives/jsons/sigllm.primitives.forecasting.long_short_term_context.long_short_term_context.json @@ -0,0 +1,73 @@ +{ + "name": "sigllm.primitives.forecasting.long_short_term_context.long_short_term_context", + "contributors": [ + "Allen Baranov " + ], + "description": "Process rolling window sequences to create long-term aggregated and short-term raw context windows", + "classifiers": { + "type": "preprocessor", + "subtype": "feature_extractor" + }, + "modalities": [ + "timeseries" + ], + "primitive": "sigllm.primitives.forecasting.long_short_term_context.long_short_term_context", + "produce": { + "args": [ + { + "name": "X", + "type": "ndarray" + }, + { + "name": "first_index", + "type": "ndarray" + } + ], + "output": [ + { + "name": "X", + "type": "ndarray" + }, + { + "name": "first_index", + "type": "ndarray" + }, + { + "name": "window_size", + "type": "int" + }, + { + "name": "step_size", + "type": "int" + }, + { + "name": "dim", + "type": "int" + } + ] + }, + "hyperparameters": { + "fixed": { + "L": { + "type": "int", + "default": 5 + }, + "W": { + "type": "int", + "default": null + }, + "S": { + "type": "int", + "default": null + }, + "step_size": { + "type": "int", + "default": 1 + }, + "aggregation_method": { + "type": "str", + "default": "mean" + } + } + } +} diff --git a/sigllm/primitives/jsons/sigllm.primitives.formatting.digit_interleave.format_as_integer.json b/sigllm/primitives/jsons/sigllm.primitives.formatting.digit_interleave.format_as_integer.json new file mode 100644 index 0000000..16f2e33 --- /dev/null +++ b/sigllm/primitives/jsons/sigllm.primitives.formatting.digit_interleave.format_as_integer.json @@ -0,0 +1,58 @@ +{ + "name": "sigllm.primitives.formatting.digit_interleave.format_as_integer", + "contributors": [ + "Allen Baranov " + ], + "description": "Parse digit-interleaved strings back to integer arrays for the target dimension.", + "classifiers": { + "type": "preprocessor", + "subtype": "transformer" + }, + "modalities": [], + "primitive": "sigllm.primitives.formatting.digit_interleave.DigitInterleave", + "produce": { + "method": "format_as_integer", + "args": [ + { + "name": "X", + "type": "list" + }, + { + "name": "trunc", + "type": "int", + "default": null + }, + { + "name": "target_column", + "type": "int", + "default": null + } + ], + "output": [ + { + "name": "X", + "type": "ndarray" + } + ] + }, + "hyperparameters": { + "fixed": { + "separator": { + "type": "string", + "default": "," + }, + "trunc": { + "type": "int", + "default": null + }, + "digits_per_timestamp": { + "type": "int", + "default": 3 + }, + "target_column": { + "type": "int", + "default": 0 + } + } + } +} diff --git a/sigllm/primitives/jsons/sigllm.primitives.formatting.digit_interleave.format_as_string.json b/sigllm/primitives/jsons/sigllm.primitives.formatting.digit_interleave.format_as_string.json new file mode 100644 index 0000000..ec44656 --- /dev/null +++ b/sigllm/primitives/jsons/sigllm.primitives.formatting.digit_interleave.format_as_string.json @@ -0,0 +1,40 @@ +{ + "name": "sigllm.primitives.formatting.digit_interleave.format_as_string", + "contributors": [ + "Allen Baranov " + ], + "description": "Transform an ndarray of multivariate values to strings with digit-interleaved encoding per timestamp.", + "classifiers": { + "type": "preprocessor", + "subtype": "transformer" + }, + "modalities": [], + "primitive": "sigllm.primitives.formatting.digit_interleave.DigitInterleave", + "produce": { + "method": "format_as_string", + "args": [ + { + "name": "X", + "type": "ndarray" + } + ], + "output": [ + { + "name": "X", + "type": "list" + } + ] + }, + "hyperparameters": { + "fixed": { + "digits_per_timestamp": { + "type": "int", + "default": 3 + }, + "separator": { + "type": "string", + "default": "," + } + } + } +} diff --git a/sigllm/primitives/jsons/sigllm.primitives.formatting.json_format.format_as_integer.json b/sigllm/primitives/jsons/sigllm.primitives.formatting.json_format.format_as_integer.json new file mode 100644 index 0000000..fde0f70 --- /dev/null +++ b/sigllm/primitives/jsons/sigllm.primitives.formatting.json_format.format_as_integer.json @@ -0,0 +1,59 @@ +{ + "name": "sigllm.primitives.formatting.json_format.format_as_integer", + "contributors": [ + "Allen Baranov " + ], + "description": "Transform an ndarray of JSON-formatted string values to an ndarray of integers for multivariate data.", + "classifiers": { + "type": "preprocessor", + "subtype": "transformer" + }, + "modalities": [], + "primitive": "sigllm.primitives.formatting.json_format.JSONFormat", + "produce": { + "method": "format_as_integer", + "args": [ + { + "name": "X", + "type": "list" + }, + { + "name": "trunc", + "type": "int", + "default": null + }, + { + "name": "steps_ahead", + "type": "list", + "default": null + }, + { + "name": "target_column", + "type": "int", + "default": null + } + ], + "output": [ + { + "name": "X", + "type": "ndarray" + } + ] + }, + "hyperparameters": { + "fixed": { + "trunc": { + "type": "int", + "default": null + }, + "steps_ahead": { + "type": "list", + "default": null + }, + "target_column": { + "type": "int", + "default": 0 + } + } + } +} diff --git a/sigllm/primitives/jsons/sigllm.primitives.formatting.json_format.format_as_string.json b/sigllm/primitives/jsons/sigllm.primitives.formatting.json_format.format_as_string.json new file mode 100644 index 0000000..2a7fca6 --- /dev/null +++ b/sigllm/primitives/jsons/sigllm.primitives.formatting.json_format.format_as_string.json @@ -0,0 +1,36 @@ +{ + "name": "sigllm.primitives.formatting.json_format.format_as_string", + "contributors": [ + "Allen Baranov " + ], + "description": "Transform an ndarray of integer values to an ndarray of JSON-formatted strings for multivariate data.", + "classifiers": { + "type": "preprocessor", + "subtype": "transformer" + }, + "modalities": [], + "primitive": "sigllm.primitives.formatting.json_format.JSONFormat", + "produce": { + "method": "format_as_string", + "args": [ + { + "name": "X", + "type": "ndarray" + } + ], + "output": [ + { + "name": "X", + "type": "list" + } + ] + }, + "hyperparameters": { + "fixed": { + "separator": { + "type": "string", + "default": "," + } + } + } +} diff --git a/sigllm/primitives/jsons/sigllm.primitives.formatting.persistence_control.format_as_integer.json b/sigllm/primitives/jsons/sigllm.primitives.formatting.persistence_control.format_as_integer.json new file mode 100644 index 0000000..7cebf21 --- /dev/null +++ b/sigllm/primitives/jsons/sigllm.primitives.formatting.persistence_control.format_as_integer.json @@ -0,0 +1,45 @@ +{ + "name": "sigllm.primitives.formatting.persistence_control.format_as_integer", + "contributors": [ + "Allen Baranov " + ], + "description": "Parse persistence strings back to integer array (last value per window only).", + "classifiers": { + "type": "preprocessor", + "subtype": "transformer" + }, + "modalities": [], + "primitive": "sigllm.primitives.formatting.persistence_control.PersistenceControl", + "produce": { + "method": "format_as_integer", + "args": [ + { + "name": "X", + "type": "list" + }, + { + "name": "target_column", + "type": "int", + "default": null + } + ], + "output": [ + { + "name": "X", + "type": "ndarray" + } + ] + }, + "hyperparameters": { + "fixed": { + "separator": { + "type": "string", + "default": "," + }, + "target_column": { + "type": "int", + "default": 0 + } + } + } +} diff --git a/sigllm/primitives/jsons/sigllm.primitives.formatting.persistence_control.format_as_string.json b/sigllm/primitives/jsons/sigllm.primitives.formatting.persistence_control.format_as_string.json new file mode 100644 index 0000000..d5df2ab --- /dev/null +++ b/sigllm/primitives/jsons/sigllm.primitives.formatting.persistence_control.format_as_string.json @@ -0,0 +1,40 @@ +{ + "name": "sigllm.primitives.formatting.persistence_control.format_as_string", + "contributors": [ + "Allen Baranov " + ], + "description": "Transform an ndarray to strings containing only the target dimension (persistence control).", + "classifiers": { + "type": "preprocessor", + "subtype": "transformer" + }, + "modalities": [], + "primitive": "sigllm.primitives.formatting.persistence_control.PersistenceControl", + "produce": { + "method": "format_as_string", + "args": [ + { + "name": "X", + "type": "ndarray" + } + ], + "output": [ + { + "name": "X", + "type": "list" + } + ] + }, + "hyperparameters": { + "fixed": { + "separator": { + "type": "string", + "default": "," + }, + "target_column": { + "type": "int", + "default": 0 + } + } + } +} diff --git a/sigllm/primitives/jsons/sigllm.primitives.formatting.univariate_control.format_as_integer.json b/sigllm/primitives/jsons/sigllm.primitives.formatting.univariate_control.format_as_integer.json new file mode 100644 index 0000000..869143c --- /dev/null +++ b/sigllm/primitives/jsons/sigllm.primitives.formatting.univariate_control.format_as_integer.json @@ -0,0 +1,54 @@ +{ + "name": "sigllm.primitives.formatting.univariate_control.format_as_integer", + "contributors": [ + "Allen Baranov " + ], + "description": "Parse univariate value strings back to integer arrays.", + "classifiers": { + "type": "preprocessor", + "subtype": "transformer" + }, + "modalities": [], + "primitive": "sigllm.primitives.formatting.univariate_control.UnivariateControl", + "produce": { + "method": "format_as_integer", + "args": [ + { + "name": "X", + "type": "list" + }, + { + "name": "trunc", + "type": "int", + "default": null + }, + { + "name": "target_column", + "type": "int", + "default": null + } + ], + "output": [ + { + "name": "X", + "type": "ndarray" + } + ] + }, + "hyperparameters": { + "fixed": { + "separator": { + "type": "string", + "default": "," + }, + "trunc": { + "type": "int", + "default": null + }, + "target_column": { + "type": "int", + "default": 0 + } + } + } +} diff --git a/sigllm/primitives/jsons/sigllm.primitives.formatting.univariate_control.format_as_string.json b/sigllm/primitives/jsons/sigllm.primitives.formatting.univariate_control.format_as_string.json new file mode 100644 index 0000000..b4582a9 --- /dev/null +++ b/sigllm/primitives/jsons/sigllm.primitives.formatting.univariate_control.format_as_string.json @@ -0,0 +1,40 @@ +{ + "name": "sigllm.primitives.formatting.univariate_control.format_as_string", + "contributors": [ + "Allen Baranov " + ], + "description": "Transform an ndarray to strings containing only the target dimension (univariate control).", + "classifiers": { + "type": "preprocessor", + "subtype": "transformer" + }, + "modalities": [], + "primitive": "sigllm.primitives.formatting.univariate_control.UnivariateControl", + "produce": { + "method": "format_as_string", + "args": [ + { + "name": "X", + "type": "ndarray" + } + ], + "output": [ + { + "name": "X", + "type": "list" + } + ] + }, + "hyperparameters": { + "fixed": { + "separator": { + "type": "string", + "default": "," + }, + "target_column": { + "type": "int", + "default": 0 + } + } + } +} diff --git a/sigllm/primitives/jsons/sigllm.primitives.formatting.value_concatenation.format_as_integer.json b/sigllm/primitives/jsons/sigllm.primitives.formatting.value_concatenation.format_as_integer.json new file mode 100644 index 0000000..23d5122 --- /dev/null +++ b/sigllm/primitives/jsons/sigllm.primitives.formatting.value_concatenation.format_as_integer.json @@ -0,0 +1,63 @@ +{ + "name": "sigllm.primitives.formatting.value_concatenation.format_as_integer", + "contributors": [ + "Allen Baranov " + ], + "description": "Parse concatenated value strings back to integer arrays for the target dimension.", + "classifiers": { + "type": "preprocessor", + "subtype": "transformer" + }, + "modalities": [], + "primitive": "sigllm.primitives.formatting.value_concatenation.ValueConcatenation", + "produce": { + "method": "format_as_integer", + "args": [ + { + "name": "X", + "type": "list" + }, + { + "name": "trunc", + "type": "int", + "default": null + }, + { + "name": "num_dims", + "type": "int", + "default": null + }, + { + "name": "target_column", + "type": "int", + "default": null + } + ], + "output": [ + { + "name": "X", + "type": "ndarray" + } + ] + }, + "hyperparameters": { + "fixed": { + "separator": { + "type": "string", + "default": "," + }, + "trunc": { + "type": "int", + "default": null + }, + "num_dims": { + "type": "int", + "default": null + }, + "target_column": { + "type": "int", + "default": 0 + } + } + } +} diff --git a/sigllm/primitives/jsons/sigllm.primitives.formatting.value_concatenation.format_as_string.json b/sigllm/primitives/jsons/sigllm.primitives.formatting.value_concatenation.format_as_string.json new file mode 100644 index 0000000..efce7be --- /dev/null +++ b/sigllm/primitives/jsons/sigllm.primitives.formatting.value_concatenation.format_as_string.json @@ -0,0 +1,36 @@ +{ + "name": "sigllm.primitives.formatting.value_concatenation.format_as_string", + "contributors": [ + "Allen Baranov " + ], + "description": "Transform an ndarray of multivariate values to strings with concatenated values per window.", + "classifiers": { + "type": "preprocessor", + "subtype": "transformer" + }, + "modalities": [], + "primitive": "sigllm.primitives.formatting.value_concatenation.ValueConcatenation", + "produce": { + "method": "format_as_string", + "args": [ + { + "name": "X", + "type": "ndarray" + } + ], + "output": [ + { + "name": "X", + "type": "list" + } + ] + }, + "hyperparameters": { + "fixed": { + "separator": { + "type": "string", + "default": "," + } + } + } +} diff --git a/sigllm/primitives/jsons/sigllm.primitives.formatting.value_interleave.format_as_integer.json b/sigllm/primitives/jsons/sigllm.primitives.formatting.value_interleave.format_as_integer.json new file mode 100644 index 0000000..a8bc204 --- /dev/null +++ b/sigllm/primitives/jsons/sigllm.primitives.formatting.value_interleave.format_as_integer.json @@ -0,0 +1,54 @@ +{ + "name": "sigllm.primitives.formatting.value_interleave.format_as_integer", + "contributors": [ + "Allen Baranov " + ], + "description": "Parse interleaved value strings back to integer arrays for the target dimension.", + "classifiers": { + "type": "preprocessor", + "subtype": "transformer" + }, + "modalities": [], + "primitive": "sigllm.primitives.formatting.value_interleave.ValueInterleave", + "produce": { + "method": "format_as_integer", + "args": [ + { + "name": "X", + "type": "list" + }, + { + "name": "trunc", + "type": "int", + "default": null + }, + { + "name": "target_column", + "type": "int", + "default": null + } + ], + "output": [ + { + "name": "X", + "type": "ndarray" + } + ] + }, + "hyperparameters": { + "fixed": { + "separator": { + "type": "string", + "default": "," + }, + "trunc": { + "type": "int", + "default": null + }, + "target_column": { + "type": "int", + "default": 0 + } + } + } +} diff --git a/sigllm/primitives/jsons/sigllm.primitives.formatting.value_interleave.format_as_string.json b/sigllm/primitives/jsons/sigllm.primitives.formatting.value_interleave.format_as_string.json new file mode 100644 index 0000000..7c19510 --- /dev/null +++ b/sigllm/primitives/jsons/sigllm.primitives.formatting.value_interleave.format_as_string.json @@ -0,0 +1,40 @@ +{ + "name": "sigllm.primitives.formatting.value_interleave.format_as_string", + "contributors": [ + "Allen Baranov " + ], + "description": "Transform an ndarray of multivariate values to strings with interleaved fixed-width values per timestamp.", + "classifiers": { + "type": "preprocessor", + "subtype": "transformer" + }, + "modalities": [], + "primitive": "sigllm.primitives.formatting.value_interleave.ValueInterleave", + "produce": { + "method": "format_as_string", + "args": [ + { + "name": "X", + "type": "ndarray" + } + ], + "output": [ + { + "name": "X", + "type": "list" + } + ] + }, + "hyperparameters": { + "fixed": { + "digits_per_timestamp": { + "type": "int", + "default": 3 + }, + "separator": { + "type": "string", + "default": "," + } + } + } +} diff --git a/tests/primitives/formatting/test_digit_interleave.py b/tests/primitives/formatting/test_digit_interleave.py new file mode 100644 index 0000000..cb6cc13 --- /dev/null +++ b/tests/primitives/formatting/test_digit_interleave.py @@ -0,0 +1,165 @@ +import unittest + +import numpy as np + +from sigllm.primitives.formatting.digit_interleave import DigitInterleave + + +class DigitInterleaveFormatAsStringTest(unittest.TestCase): + """Tests for DigitInterleave.format_as_string.""" + + def setUp(self): + self.formatter = DigitInterleave() + + def test_single_window_single_timestamp_one_value_to_string(self): + X = np.array([[[5]]]) + out = self.formatter.format_as_string(X) + self.assertEqual(out, ['005,']) + self.assertEqual(self.formatter.metadata['width_used'], 3) + + def test_single_window_single_timestamp_two_values_to_string(self): + X = np.array([[[1, 23]]]) + out = self.formatter.format_as_string(X) + self.assertEqual(out, ['000213,']) + self.assertEqual(self.formatter.metadata['width_used'], 3) + + def test_single_window_multiple_timestamps_to_string(self): + X = np.array([[[100, 2], [3, 4]]]) + out = self.formatter.format_as_string(X) + self.assertEqual(out, ['100002,000034,']) + self.assertEqual(self.formatter.metadata['width_used'], 3) + + def test_multiple_windows_to_string(self): + X = np.array([[[1, 2]], [[3, 4]]]) + out = self.formatter.format_as_string(X) + self.assertEqual(len(out), 2) + self.assertEqual(out[0], '000012,') + self.assertEqual(out[1], '000034,') + + def test_digits_per_timestamp_wider_than_values(self): + X = np.array([[[7]]]) + out = self.formatter.format_as_string(X, digits_per_timestamp=3) + self.assertEqual(out, ['007,']) + self.assertEqual(self.formatter.metadata['width_used'], 3) + + def test_values_wider_than_digits_per_timestamp(self): + X = np.array([[[1234, 500], [101, 500]], [[30, 10], [32, 14]]]) + out = self.formatter.format_as_string(X, digits_per_timestamp=2) + self.assertEqual(out, ['10253040,00150010,', '00003100,00003124,']) + self.assertEqual(self.formatter.metadata['width_used'], 4) + + def test_custom_separator(self): + X = np.array([[[1], [2]]]) + out = self.formatter.format_as_string(X, separator=';') + self.assertEqual(out, ['001;002;']) + + def test_custom_digits_per_timestamp(self): + X = np.array([[[1], [2]]]) + out = self.formatter.format_as_string(X, digits_per_timestamp=2) + self.assertEqual(out, ['01,02,']) + self.assertEqual(self.formatter.metadata['width_used'], 2) + + +class DigitInterleaveFormatAsIntegerTest(unittest.TestCase): + """Tests for DigitInterleave.format_as_integer (requires width_used in metadata).""" + + def setUp(self): + self.formatter = DigitInterleave() + self.formatter.metadata['width_used'] = 3 + + def test_single_timestamp_single_value_to_integer(self): + X = [['005,']] + out = self.formatter.format_as_integer(X) + self.assertEqual(len(out), 1) + self.assertEqual(len(out[0]), 1) + np.testing.assert_array_equal(out[0][0], np.array([5])) + + def test_single_timestamp_two_values_to_integer(self): + X = [['000213,']] + out = self.formatter.format_as_integer(X) + self.assertEqual(len(out), 1) + self.assertEqual(len(out[0]), 1) + np.testing.assert_array_equal(out[0][0], np.array([1])) + + def test_multiple_timestamps_in_one_sample_to_integer(self): + X = [['000012,000034,']] + out = self.formatter.format_as_integer(X) + self.assertEqual(len(out), 1) + self.assertEqual(len(out[0]), 2) + np.testing.assert_array_equal(out[0][0], np.array([1])) + np.testing.assert_array_equal(out[0][1], np.array([3])) + + def test_multiple_entries_to_integer(self): + X = [['005,'], ['012,']] + out = self.formatter.format_as_integer(X) + self.assertEqual(len(out), 2) + np.testing.assert_array_equal(out[0][0], np.array([5])) + np.testing.assert_array_equal(out[1][0], np.array([12])) + + def test_trunc_limits_timestamps(self): + X = [['000012,000034,000056,']] + out = self.formatter.format_as_integer(X, trunc=2) + self.assertEqual(len(out[0]), 2) + np.testing.assert_array_equal(out[0][0], np.array([1])) + np.testing.assert_array_equal(out[0][1], np.array([3])) + + def test_trunc_limits_values_per_timestamp(self): + X = [['000000123,']] + out = self.formatter.format_as_integer(X, trunc=2) + self.assertEqual(len(out[0]), 1) + np.testing.assert_array_equal(out[0][0], np.array([1])) + + def test_custom_separator(self): + X = [['001;002;']] + out = self.formatter.format_as_integer(X, separator=';') + np.testing.assert_array_equal(out[0][0], np.array([1])) + np.testing.assert_array_equal(out[0][1], np.array([2])) + + def test_target_column_one(self): + X = [['000012,000034,']] + out = self.formatter.format_as_integer(X, target_column=1) + self.assertEqual(len(out), 1) + self.assertEqual(len(out[0]), 2) + np.testing.assert_array_equal(out[0][0], np.array([2])) + np.testing.assert_array_equal(out[0][1], np.array([4])) + + def test_target_column_with_trunc(self): + X = [['000012,000034,000056,']] + out = self.formatter.format_as_integer(X, target_column=1, trunc=2) + self.assertEqual(len(out[0]), 2) + np.testing.assert_array_equal(out[0][0], np.array([2])) + np.testing.assert_array_equal(out[0][1], np.array([4])) + + def test_target_column_from_config(self): + formatter = DigitInterleave(target_column=1) + formatter.metadata['width_used'] = 3 + X = [['000012,000034,']] + out = formatter.format_as_integer(X) + np.testing.assert_array_equal(out[0][0], np.array([2])) + np.testing.assert_array_equal(out[0][1], np.array([4])) + + +class DigitInterleaveRoundTripTest(unittest.TestCase): + """Round-trip: format_as_string then format_as_integer.""" + + def setUp(self): + self.formatter = DigitInterleave() + + def test_round_trip_single_window(self): + X = np.array([[[1, 23], [45, 6]]]) + strings = self.formatter.format_as_string(X) + X_in = [[s] for s in strings] + out = self.formatter.format_as_integer(X_in) + self.assertEqual(len(out), 1) + self.assertEqual(len(out[0]), 2) + np.testing.assert_array_equal(out[0][0], np.array([1])) + np.testing.assert_array_equal(out[0][1], np.array([45])) + + def test_round_trip_multiple_windows(self): + X = np.array([[[10, 20]], [[30, 40]]]) + strings = self.formatter.format_as_string(X) + X_in = [[s] for s in strings] + out = self.formatter.format_as_integer(X_in) + self.assertEqual(len(out), 2) + np.testing.assert_array_equal(out[0][0], np.array([10])) + np.testing.assert_array_equal(out[1][0], np.array([30])) diff --git a/tests/primitives/formatting/test_json_format.py b/tests/primitives/formatting/test_json_format.py new file mode 100644 index 0000000..497ad00 --- /dev/null +++ b/tests/primitives/formatting/test_json_format.py @@ -0,0 +1,166 @@ +import unittest + +import numpy as np + +from sigllm.primitives.formatting.json_format import JSONFormat + + +class JSONFormatFormatAsStringTest(unittest.TestCase): + """Tests for JSONFormat.format_as_string.""" + + def setUp(self): + self.formatter = JSONFormat(trunc=5) + + def test_single_window_single_row_to_string(self): + X = np.array([[[1, 2, 3]]]) + out = self.formatter.format_as_string(X) + self.assertEqual(out, ['d0:1,d1:2,d2:3']) + + def test_single_window_multiple_rows_to_string(self): + X = np.array([[[1, 2], [3, 4]]]) + out = self.formatter.format_as_string(X) + self.assertEqual(out, ['d0:1,d1:2,d0:3,d1:4']) + + def test_multiple_windows_to_string(self): + X = np.array([[[10, 20]], [[30, 40]]]) + out = self.formatter.format_as_string(X) + self.assertEqual(out, ['d0:10,d1:20', 'd0:30,d1:40']) + + def test_multiple_windows_multiple_rows_to_string(self): + X = np.array([ + [[1, 2], [3, 4]], + [[5, 6], [7, 8]], + ]) + out = self.formatter.format_as_string(X) + self.assertEqual(out, ['d0:1,d1:2,d0:3,d1:4', 'd0:5,d1:6,d0:7,d1:8']) + + def test_separator_kwarg_accepted(self): + X = np.array([[[1, 2]]]) + out = self.formatter.format_as_string(X, separator=';') + self.assertEqual(out, ['d0:1,d1:2']) + + +class JSONFormatFormatAsIntegerLegacyTest(unittest.TestCase): + """Tests for JSONFormat.format_as_integer with trunc (legacy).""" + + def setUp(self): + self.formatter = JSONFormat(trunc=2) + + def test_trunc_none_single_sample_to_integer(self): + X = np.array([['d0:1,d1:2,d0:3,d1:4']]) + out = self.formatter.format_as_integer(X, trunc=None) + self.assertEqual(out.shape, (1, 1, 2)) + np.testing.assert_array_equal(out[0, 0], [1, 3]) + + def test_trunc_none_multiple_samples_to_integer(self): + X = np.array([['d0:10,d1:11,d0:12', 'd0:20,d1:21']]) + out = self.formatter.format_as_integer(X, trunc=None) + self.assertEqual(out.shape, (1, 2, 2)) + np.testing.assert_array_equal(out[0, 0], [10, 12]) + np.testing.assert_array_equal(out[0, 1], [20, None]) + + def test_trunc_int_single_window_to_integer(self): + X = np.array([['d0:1,d1:2,d0:3,d1:4,d0:5']]) + out = self.formatter.format_as_integer(X, trunc=3) + np.testing.assert_array_equal(out, np.array([[[1, 3, 5]]])) + + def test_trunc_int_multiple_windows_to_integer(self): + X = np.array([ + ['d0:1,d0:2,d0:3'], + ['d0:4,d0:5,d0:6'], + ]) + out = self.formatter.format_as_integer(X, trunc=2) + expected = np.array([[[1, 2]], [[4, 5]]]) + np.testing.assert_array_equal(out, expected) + + def test_trunc_larger_than_values_fills_with_none(self): + X = np.array([['d0:7,d1:8,d0:9']]) + out = self.formatter.format_as_integer(X, trunc=5) + np.testing.assert_array_equal(out[0, 0], [7, 9, None, None, None]) + + +class JSONFormatFormatAsIntegerStepsAheadTest(unittest.TestCase): + """Tests for JSONFormat.format_as_integer with steps_ahead.""" + + def setUp(self): + self.formatter = JSONFormat(trunc=5) + + def test_steps_ahead_single_step(self): + X = np.array([['d0:10,d1:11,d0:20,d1:21,d0:30']]) + out = self.formatter.format_as_integer(X, steps_ahead=[1, 2, 3]) + self.assertIn(1, out) + self.assertIn(2, out) + self.assertIn(3, out) + np.testing.assert_array_equal(out[1], np.array([[10]])) + np.testing.assert_array_equal(out[2], np.array([[20]])) + np.testing.assert_array_equal(out[3], np.array([[30]])) + + def test_steps_ahead_missing_step_is_none(self): + X = np.array([['d0:10,d1:11,d0:20']]) + out = self.formatter.format_as_integer(X, steps_ahead=[1, 2, 5]) + self.assertEqual(out[1][0, 0], 10) + self.assertEqual(out[2][0, 0], 20) + self.assertIsNone(out[5][0, 0]) + + def test_steps_ahead_multiple_samples(self): + X = np.array([['d0:1,d0:2,d0:3', 'd0:4,d0:5']]) + out = self.formatter.format_as_integer(X, steps_ahead=[2]) + np.testing.assert_array_equal(out[2], np.array([[2, 5]])) + + def test_steps_ahead_from_config(self): + formatter = JSONFormat(trunc=1, steps_ahead=[1, 2]) + X = np.array([['d0:100,d0:200']]) + out = formatter.format_as_integer(X) + np.testing.assert_array_equal(out[1], np.array([[100]])) + np.testing.assert_array_equal(out[2], np.array([[200]])) + + +class JSONFormatExtractD0ValuesTest(unittest.TestCase): + """Tests for d0 extraction (via format_as_integer).""" + + def setUp(self): + self.formatter = JSONFormat(trunc=2) + + def test_only_d0_extracted(self): + X = np.array([['d1:5,d0:1,d2:3,d0:2,d1:9']]) + out = self.formatter.format_as_integer(X) + np.testing.assert_array_equal(out[0, 0], [1, 2]) + + def test_no_d0_fills_with_none(self): + X = np.array([['d1:1,d2:2,d1:3']]) + out = self.formatter.format_as_integer(X) + np.testing.assert_array_equal(out[0, 0], [None, None]) + + def test_no_d0_steps_ahead_returns_none(self): + X = np.array([['d1:1,d2:2']]) + out = self.formatter.format_as_integer(X, steps_ahead=[1]) + np.testing.assert_array_equal(out[1][0, 0], [None]) + + +class JSONFormatTargetDimTest(unittest.TestCase): + """Tests for target_column parameter in format_as_integer.""" + + def setUp(self): + self.formatter = JSONFormat() + + def test_target_column_one(self): + X = np.array([['d0:1,d1:10,d0:2,d1:20']]) + out = self.formatter.format_as_integer(X, trunc=None, target_column=1) + np.testing.assert_array_equal(out[0, 0], [10, 20]) + + def test_target_column_with_trunc(self): + X = np.array([['d0:1,d1:10,d0:2,d1:20,d0:3,d1:30']]) + out = self.formatter.format_as_integer(X, trunc=2, target_column=1) + np.testing.assert_array_equal(out[0, 0], [10, 20]) + + def test_target_column_with_steps_ahead(self): + X = np.array([['d0:1,d1:10,d0:2,d1:20,d0:3,d1:30']]) + out = self.formatter.format_as_integer(X, steps_ahead=[1, 2], target_column=1) + self.assertEqual(out[1][0, 0], 10) + self.assertEqual(out[2][0, 0], 20) + + def test_target_column_from_config(self): + formatter = JSONFormat(target_column=1) + X = np.array([['d0:1,d1:10,d0:2,d1:20']]) + out = formatter.format_as_integer(X, trunc=None) + np.testing.assert_array_equal(out[0, 0], [10, 20]) diff --git a/tests/primitives/formatting/test_persistence_control.py b/tests/primitives/formatting/test_persistence_control.py new file mode 100644 index 0000000..d695d2b --- /dev/null +++ b/tests/primitives/formatting/test_persistence_control.py @@ -0,0 +1,103 @@ +import unittest + +import numpy as np + +from sigllm.primitives.formatting.persistence_control import PersistenceControl + + +class PersistenceControlFormatAsStringTest(unittest.TestCase): + """Tests for PersistenceControl.format_as_string.""" + + def setUp(self): + self.formatter = PersistenceControl() + + def test_single_window_single_row_to_string(self): + X = np.array([[[10]]]) + out = self.formatter.format_as_string(X) + self.assertEqual(out, ['10']) + + def test_single_window_multiple_rows_to_string(self): + X = np.array([[[1], [2], [3]]]) + out = self.formatter.format_as_string(X) + self.assertEqual(out, ['1,2,3']) + + def test_multiple_windows_to_string(self): + # X: (2 windows, 2 rows each, 1 dim) + X = np.array([[[1], [2]], [[3], [4]]]) + out = self.formatter.format_as_string(X) + self.assertEqual(out, ['1,2', '3,4']) + + def test_custom_separator(self): + X = np.array([[[1], [2], [3]]]) + out = self.formatter.format_as_string(X, separator=';') + self.assertEqual(out, ['1;2;3']) + + def test_uses_first_dimension_only(self): + X = np.array([[[100, 200, 300], [400, 500, 600]]]) + out = self.formatter.format_as_string(X) + self.assertEqual(out, ['100,400']) + + +class PersistenceControlFormatAsIntegerTest(unittest.TestCase): + """Tests for PersistenceControl.format_as_integer (last value only).""" + + def setUp(self): + self.formatter = PersistenceControl() + + def test_single_entry_takes_last_to_integer(self): + X = ['1,2,3'] + out = self.formatter.format_as_integer(X) + self.assertEqual(out.shape, (1, 1, 1)) + np.testing.assert_array_equal(out[0, 0], np.array([3])) + + def test_multiple_entries_to_integer(self): + X = ['1,2,3', '4,5', '99'] + out = self.formatter.format_as_integer(X) + self.assertEqual(out.shape, (3, 1, 1)) + np.testing.assert_array_equal(out[0, 0], np.array([3])) + np.testing.assert_array_equal(out[1, 0], np.array([5])) + np.testing.assert_array_equal(out[2, 0], np.array([99])) + + def test_single_value(self): + X = ['42'] + out = self.formatter.format_as_integer(X) + np.testing.assert_array_equal(out[0, 0], np.array([42])) + + def test_custom_separator(self): + X = ['10;20;30'] + out = self.formatter.format_as_integer(X, separator=';') + np.testing.assert_array_equal(out[0, 0], np.array([30])) + + def test_leading_separator_stripped(self): + X = [',7,8,9'] + out = self.formatter.format_as_integer(X) + np.testing.assert_array_equal(out[0, 0], np.array([9])) + + def test_empty_after_split_filtered(self): + X = ['1,2,'] + out = self.formatter.format_as_integer(X) + np.testing.assert_array_equal(out[0, 0], np.array([2])) + + +class PersistenceControlTargetDimTest(unittest.TestCase): + """Tests for target_column parameter in format_as_string.""" + + def setUp(self): + self.formatter = PersistenceControl() + + def test_target_column_one(self): + X = np.array([[[100, 200], [300, 400]]]) + out = self.formatter.format_as_string(X, target_column=1) + self.assertEqual(out, ['200,400']) + + def test_target_column_from_config(self): + formatter = PersistenceControl(target_column=1) + X = np.array([[[100, 200], [300, 400]]]) + out = formatter.format_as_string(X) + self.assertEqual(out, ['200,400']) + + def test_round_trip_target_column_one(self): + X = np.array([[[100, 200], [300, 400]]]) + strings = self.formatter.format_as_string(X, target_column=1) + out = self.formatter.format_as_integer(strings) + np.testing.assert_array_equal(out[0, 0], np.array([400])) diff --git a/tests/primitives/formatting/test_univariate_control.py b/tests/primitives/formatting/test_univariate_control.py new file mode 100644 index 0000000..acb452d --- /dev/null +++ b/tests/primitives/formatting/test_univariate_control.py @@ -0,0 +1,107 @@ +import unittest + +import numpy as np + +from sigllm.primitives.formatting.univariate_control import UnivariateControl + + +class UnivariateControlFormatAsStringTest(unittest.TestCase): + """Tests for UnivariateControl.format_as_string.""" + + def setUp(self): + self.formatter = UnivariateControl() + + def test_single_window_single_row_to_string(self): + X = np.array([[[10, 20]]]) + out = self.formatter.format_as_string(X) + self.assertEqual(out, ['10']) + + def test_single_window_multiple_rows_to_string(self): + X = np.array([[[1, 100], [2, 200], [3, 300]]]) + out = self.formatter.format_as_string(X) + self.assertEqual(out, ['1,2,3']) + + def test_multiple_windows_to_string(self): + X = np.array([[[1, 10], [2, 20]], [[3, 30], [4, 40]]]) + out = self.formatter.format_as_string(X) + self.assertEqual(out, ['1,2', '3,4']) + + def test_custom_separator(self): + X = np.array([[[1, 10], [2, 20], [3, 30]]]) + out = self.formatter.format_as_string(X, separator=';') + self.assertEqual(out, ['1;2;3']) + + def test_target_column_one(self): + X = np.array([[[1, 10], [2, 20]]]) + out = self.formatter.format_as_string(X, target_column=1) + self.assertEqual(out, ['10,20']) + + def test_target_column_from_config(self): + formatter = UnivariateControl(target_column=1) + X = np.array([[[1, 10], [2, 20]]]) + out = formatter.format_as_string(X) + self.assertEqual(out, ['10,20']) + + +class UnivariateControlFormatAsIntegerTest(unittest.TestCase): + """Tests for UnivariateControl.format_as_integer.""" + + def setUp(self): + self.formatter = UnivariateControl() + + def test_single_entry_single_value_to_integer(self): + X = [['42']] + out = self.formatter.format_as_integer(X) + np.testing.assert_array_equal(out[0, 0], np.array([42])) + + def test_single_entry_multiple_values_to_integer(self): + X = [['1,2,3']] + out = self.formatter.format_as_integer(X) + np.testing.assert_array_equal(out[0, 0], np.array([1, 2, 3])) + + def test_multiple_entries_to_integer(self): + X = [['1,2,3', '4,5,6'], ['11,12,13', '14,15,16']] + out = self.formatter.format_as_integer(X) + np.testing.assert_array_equal(out[0, 0], np.array([1, 2, 3])) + np.testing.assert_array_equal(out[0, 1], np.array([4, 5, 6])) + np.testing.assert_array_equal(out[1, 0], np.array([11, 12, 13])) + np.testing.assert_array_equal(out[1, 1], np.array([14, 15, 16])) + + def test_trunc_limits_values(self): + X = [['10,20,30,40']] + out = self.formatter.format_as_integer(X, trunc=2) + np.testing.assert_array_equal(out[0, 0], np.array([10, 20])) + + def test_custom_separator(self): + X = [['10;20;30']] + out = self.formatter.format_as_integer(X, separator=';') + np.testing.assert_array_equal(out[0, 0], np.array([10, 20, 30])) + + +class UnivariateControlRoundTripTest(unittest.TestCase): + """Round-trip: format_as_string then format_as_integer.""" + + def setUp(self): + self.formatter = UnivariateControl() + + def test_round_trip_default_target_column(self): + X = np.array([[[1, 10], [2, 20]]]) + strings = self.formatter.format_as_string(X) + X_in = [[s] for s in strings] + out = self.formatter.format_as_integer(X_in) + np.testing.assert_array_equal(out[0][0], np.array([1, 2])) + + def test_round_trip_target_column_one(self): + X = np.array([[[1, 10], [2, 20]]]) + strings = self.formatter.format_as_string(X, target_column=1) + X_in = [[s] for s in strings] + out = self.formatter.format_as_integer(X_in) + np.testing.assert_array_equal(out[0][0], np.array([10, 20])) + + def test_round_trip_multiple_windows(self): + X = np.array([[[1, 10], [2, 20]], [[3, 30], [4, 40]]]) + strings = self.formatter.format_as_string(X, target_column=1) + X_in = [[s] for s in strings] + out = self.formatter.format_as_integer(X_in) + np.testing.assert_array_equal(out[0][0], np.array([10, 20])) + np.testing.assert_array_equal(out[1][0], np.array([30, 40])) diff --git a/tests/primitives/formatting/test_value_concatenation.py b/tests/primitives/formatting/test_value_concatenation.py new file mode 100644 index 0000000..a295823 --- /dev/null +++ b/tests/primitives/formatting/test_value_concatenation.py @@ -0,0 +1,117 @@ +import unittest + +import numpy as np + +from sigllm.primitives.formatting.value_concatenation import ValueConcatenation + + +class ValueConcatenationFormatAsStringTest(unittest.TestCase): + """Tests for ValueConcatenation.format_as_string (value concatenation).""" + + def setUp(self): + self.formatter = ValueConcatenation() + + def test_single_window_single_row_to_string(self): + X = np.array([[[10]]]) + out = self.formatter.format_as_string(X) + self.assertEqual(out, ['10']) + + def test_single_window_multiple_rows_to_string(self): + X = np.array([[[1], [2], [3]]]) + out = self.formatter.format_as_string(X) + self.assertEqual(out, ['1,2,3']) + + def test_single_window_multiple_rows_multiple_dims_to_string(self): + X = np.array([[[1, 2], [3, 4]]]) + out = self.formatter.format_as_string(X) + self.assertEqual(out, ['1,2,3,4']) + + def test_multiple_windows_to_string(self): + X = np.array([[[1], [2]], [[3], [4]]]) + out = self.formatter.format_as_string(X) + self.assertEqual(out, ['1,2', '3,4']) + + def test_custom_separator(self): + X = np.array([[[1], [2], [3]]]) + out = self.formatter.format_as_string(X, separator=';') + self.assertEqual(out, ['1;2;3']) + + def test_flattens_all_values_in_window(self): + X = np.array([[[100, 200], [300, 400]]]) + out = self.formatter.format_as_string(X) + self.assertEqual(out, ['100,200,300,400']) + + +class ValueConcatenationFormatAsIntegerTest(unittest.TestCase): + """Tests for ValueConcatenation.format_as_integer (parsing concatenated values).""" + + def setUp(self): + self.formatter = ValueConcatenation(num_dims=1) + + def test_single_entry_single_value_to_integer(self): + X = [['42']] + out = self.formatter.format_as_integer(X) + self.assertEqual(out.shape, (1, 1, 1)) + np.testing.assert_array_equal(out[0, 0], np.array([42])) + + def test_single_entry_multiple_values_to_integer(self): + X = [['1,2,3']] + out = self.formatter.format_as_integer(X) + self.assertEqual(out.shape, (1, 1, 3)) + np.testing.assert_array_equal(out[0, 0], np.array([1, 2, 3])) + + def test_multiple_entries_equal_length_to_integer(self): + X = [['1,2,3', '4,5,6'], ['11,12,13', '14,15,16']] + out = self.formatter.format_as_integer(X) + self.assertEqual(out.shape, (2, 2, 3)) + np.testing.assert_array_equal(out[0, 0], np.array([1, 2, 3])) + np.testing.assert_array_equal(out[0, 1], np.array([4, 5, 6])) + np.testing.assert_array_equal(out[1, 0], np.array([11, 12, 13])) + np.testing.assert_array_equal(out[1, 1], np.array([14, 15, 16])) + + def test_multiple_entries_unequal_length_to_integer(self): + X = [['1,2,3', '4,5'], ['6', '7,8']] + out = self.formatter.format_as_integer(X) + self.assertEqual(out.shape, (2, 2)) + np.testing.assert_array_equal(out[0, 0], np.array([1, 2, 3])) + np.testing.assert_array_equal(out[0, 1], np.array([4, 5])) + np.testing.assert_array_equal(out[1, 0], np.array([6])) + np.testing.assert_array_equal(out[1, 1], np.array([7, 8])) + + def test_trunc_limits_values(self): + X = [['10,20,30,40']] + out = self.formatter.format_as_integer(X, trunc=2) + self.assertEqual(out.shape, (1, 1, 2)) + np.testing.assert_array_equal(out[0, 0], np.array([10, 20])) + + def test_trunc_none_keeps_all(self): + X = [['7,8,9']] + out = self.formatter.format_as_integer(X, trunc=None) + np.testing.assert_array_equal(out[0, 0], np.array([7, 8, 9])) + + def test_custom_separator(self): + X = [['10;20;30']] + out = self.formatter.format_as_integer(X, separator=';') + np.testing.assert_array_equal(out[0, 0], np.array([10, 20, 30])) + + def test_leading_separator_stripped(self): + X = [[',7,8,9']] + out = self.formatter.format_as_integer(X) + np.testing.assert_array_equal(out[0, 0], np.array([7, 8, 9])) + + def test_empty_after_split_filtered(self): + X = [['1,2,']] + out = self.formatter.format_as_integer(X) + np.testing.assert_array_equal(out[0, 0], np.array([1, 2])) + + def test_multiple_dimensions_to_integer(self): + X = [['10,20,30,40']] + out = self.formatter.format_as_integer(X, num_dims=2) + self.assertEqual(out.shape, (1, 1, 2)) + np.testing.assert_array_equal(out[0, 0], np.array([10, 30])) + + def test_target_column_one(self): + X = [['10,20,30,40']] + out = self.formatter.format_as_integer(X, num_dims=2, target_column=1) + self.assertEqual(out.shape, (1, 1, 2)) + np.testing.assert_array_equal(out[0, 0], np.array([20, 40])) diff --git a/tests/primitives/formatting/test_value_interleave.py b/tests/primitives/formatting/test_value_interleave.py new file mode 100644 index 0000000..9af4931 --- /dev/null +++ b/tests/primitives/formatting/test_value_interleave.py @@ -0,0 +1,162 @@ +import unittest + +import numpy as np + +from sigllm.primitives.formatting.value_interleave import ValueInterleave + + +class ValueInterleaveFormatAsStringTest(unittest.TestCase): + """Tests for ValueInterleave.format_as_string.""" + + def setUp(self): + self.formatter = ValueInterleave() + + def test_single_window_single_timestamp_one_value_to_string(self): + X = np.array([[[512]]]) + out = self.formatter.format_as_string(X) + self.assertEqual(out, ['512,']) + self.assertEqual(self.formatter.metadata['width_used'], 3) + + def test_single_window_single_timestamp_two_values_to_string(self): + X = np.array([[[1, 23]]]) + out = self.formatter.format_as_string(X) + self.assertEqual(out, ['001023,']) + self.assertEqual(self.formatter.metadata['width_used'], 3) + + def test_single_window_multiple_timestamps_to_string(self): + X = np.array([[[101, 22], [35, 4]]]) + out = self.formatter.format_as_string(X) + self.assertEqual(out, ['101022,035004,']) + self.assertEqual(self.formatter.metadata['width_used'], 3) + + def test_multiple_windows_to_string(self): + X = np.array([[[144, 254]], [[321, 456]]]) + out = self.formatter.format_as_string(X) + self.assertEqual(len(out), 2) + self.assertEqual(out[0], '144254,') + self.assertEqual(out[1], '321456,') + + def test_digits_per_timestamp_wider_than_values(self): + X = np.array([[[7]]]) + out = self.formatter.format_as_string(X, digits_per_timestamp=3) + self.assertEqual(out, ['007,']) + self.assertEqual(self.formatter.metadata['width_used'], 3) + + def test_values_wider_than_digits_per_timestamp(self): + X = np.array([[[1234, 500], [101, 500]], [[30, 10], [32, 14]]]) + out = self.formatter.format_as_string(X, digits_per_timestamp=2) + self.assertEqual(out, ['12340500,01010500,', '00300010,00320014,']) + self.assertEqual(self.formatter.metadata['width_used'], 4) + + def test_custom_separator(self): + X = np.array([[[1], [2]]]) + out = self.formatter.format_as_string(X, separator=';') + self.assertEqual(out, ['001;002;']) + + def test_custom_digits_per_timestamp(self): + X = np.array([[[1, 11, 40], [21, 50, 10]]]) + out = self.formatter.format_as_string(X, digits_per_timestamp=2) + self.assertEqual(out, ['011140,215010,']) + self.assertEqual(self.formatter.metadata['width_used'], 2) + + def test_multiple_windows_multiple_timestamps_to_string(self): + X = np.array([[[144, 254], [104, 200]], [[321, 456], [101, 202]]]) + out = self.formatter.format_as_string(X) + self.assertEqual(out, ['144254,104200,', '321456,101202,']) + + +class ValueInterleaveFormatAsIntegerTest(unittest.TestCase): + """Tests for ValueInterleave.format_as_integer (requires width_used in metadata).""" + + def setUp(self): + self.formatter = ValueInterleave() + self.formatter.metadata['width_used'] = 3 + + def test_single_timestamp_single_value_to_integer(self): + X = [['005,']] + out = self.formatter.format_as_integer(X) + self.assertEqual(len(out), 1) + self.assertEqual(len(out[0]), 1) + np.testing.assert_array_equal(out[0][0], np.array([5])) + + def test_single_timestamp_two_values_to_integer(self): + X = [['001023,']] + out = self.formatter.format_as_integer(X) + self.assertEqual(len(out), 1) + self.assertEqual(len(out[0]), 1) + np.testing.assert_array_equal(out[0][0], np.array([1])) + + def test_multiple_timestamps_in_one_sample_to_integer(self): + X = [['001002,003004,']] + out = self.formatter.format_as_integer(X) + self.assertEqual(len(out), 1) + self.assertEqual(len(out[0]), 1) + np.testing.assert_array_equal(out[0][0], np.array([1, 3])) + + def test_multiple_entries_to_integer(self): + X = [['005,'], ['012,']] + out = self.formatter.format_as_integer(X) + self.assertEqual(len(out), 2) + np.testing.assert_array_equal(out[0][0], np.array([5])) + np.testing.assert_array_equal(out[1][0], np.array([12])) + + def test_trunc_limits_timestamps(self): + X = [['001002,003004,005006,']] + out = self.formatter.format_as_integer(X, trunc=2) + self.assertEqual(out.shape, (1, 1, 2)) + np.testing.assert_array_equal(out[0][0], np.array([1, 3])) + + def test_trunc_limits_values_per_timestamp(self): + X = [['001002,003004,005006,']] + out = self.formatter.format_as_integer(X, trunc=2) + self.assertEqual(out.shape, (1, 1, 2)) + np.testing.assert_array_equal(out[0][0], np.array([1, 3])) + + def test_custom_separator(self): + X = [['001;002;']] + out = self.formatter.format_as_integer(X, separator=';') + np.testing.assert_array_equal(out[0][0], np.array([1, 2])) + + def test_target_column_one(self): + X = [['001023,045006,']] + out = self.formatter.format_as_integer(X, target_column=1) + self.assertEqual(len(out), 1) + self.assertEqual(len(out[0]), 1) + np.testing.assert_array_equal(out[0][0], np.array([23, 6])) + + def test_target_column_with_trunc(self): + X = [['001023,045006,007008,']] + out = self.formatter.format_as_integer(X, target_column=1, trunc=2) + np.testing.assert_array_equal(out[0][0], np.array([23, 6])) + + def test_target_column_from_config(self): + formatter = ValueInterleave(target_column=1) + formatter.metadata['width_used'] = 3 + X = [['001023,045006,']] + out = formatter.format_as_integer(X) + np.testing.assert_array_equal(out[0][0], np.array([23, 6])) + + +class ValueInterleaveRoundTripTest(unittest.TestCase): + """Round-trip: format_as_string then format_as_integer.""" + + def setUp(self): + self.formatter = ValueInterleave() + + def test_round_trip_single_window(self): + X = np.array([[[1, 23], [45, 6]]]) + strings = self.formatter.format_as_string(X) + X_in = [[s] for s in strings] + out = self.formatter.format_as_integer(X_in) + self.assertEqual(len(out), 1) + self.assertEqual(len(out[0]), 1) + np.testing.assert_array_equal(out[0][0], np.array([1, 45])) + + def test_round_trip_multiple_windows(self): + X = np.array([[[10, 20]], [[30, 40]]]) + strings = self.formatter.format_as_string(X) + X_in = [[s] for s in strings] + out = self.formatter.format_as_integer(X_in) + self.assertEqual(len(out), 2) + np.testing.assert_array_equal(out[0][0], np.array([10])) + np.testing.assert_array_equal(out[1][0], np.array([30])) diff --git a/tutorials/pipelines/multivariate-detector-pipeline.ipynb b/tutorials/pipelines/multivariate-detector-pipeline.ipynb new file mode 100644 index 0000000..5937932 --- /dev/null +++ b/tutorials/pipelines/multivariate-detector-pipeline.ipynb @@ -0,0 +1,628 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Multivariate Pipeline Tutorial\n", + "\n", + "This notebook demonstrates how to use the multivariate detector pipeline with different formatting methods for multidimensional time series data.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from sigllm.primitives.formatting import (\n", + " JSONFormat,\n", + " UnivariateControl,\n", + " PersistenceControl,\n", + " ValueConcatenation,\n", + " ValueInterleave,\n", + " DigitInterleave,\n", + " utils\n", + ")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create Sample Multivariate Data\n", + "\n", + "First, let's create some sample multivariate time series data with 3 dimensions.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create sample data with 3 dimensions\n", + "N = 25\n", + "raw_data = utils.create_test_data()\n", + "print(raw_data.head())\n", + "\n", + "raw_data = raw_data.to_numpy()[:, 1:]\n", + "windowed_data = np.array([raw_data[i:i+15,:] for i in range(0, len(raw_data)-15, 1)])\n", + "data = (1000 * windowed_data).astype(int)\n", + "\n", + "print(\"Sample data shape:\", data.shape)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Available Formatting Methods\n", + "\n", + "The multivariate pipeline supports several formatting methods to convert multi-dimensional data into string representations for LLM processing:\n", + "\n", + "1. **JSONFormat**: Formats as d0:val,d1:val,... per timestamp\n", + "2. **ValueConcatenation**: Flattens all dimensions per timestamp\n", + "3. **ValueInterleave**: Interleaves values with zero-padding\n", + "4. **DigitInterleave**: Interleaves individual digits\n", + "5. **UnivariateControl**: Uses only first dimension (baseline)\n", + "6. **PersistenceControl**: Returns last value (naive baseline)\n", + "\n", + "\n", + "For example, given timesteps $t_0$ = [50, 30, 100] and $t_1$ = [55, 28, 104]:\n", + "* Value Concatenation - Simply flatten the values across time: 50,30,100,55,28,104\n", + "* Value Interleave - Pad values to equal digit length and concatenate timestep by timestep: 050030100,055028104\n", + "* Digit Interleave - Interleave digits positionally across dimensions: 001530000,001520584\n", + "* JSON Format - Encode as dimension-labeled key:value pairs: d0:50,d1:30,d2:100,d0:55,d1:28,d2:104\n", + "* Univariate Control - Keep only one dimension (baseline for comparison): 50,55\n", + "* Persistence Control - Bypass the formatting and return last known value: N/A\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Compare string representations from different methods\n", + "methods = {\n", + " 'JSONFormat': JSONFormat(),\n", + " 'ValueConcatenation': ValueConcatenation(),\n", + " 'ValueInterleave': ValueInterleave(),\n", + " 'DigitInterleave': DigitInterleave(),\n", + " 'UnivariateControl': UnivariateControl(),\n", + " 'PersistenceControl': PersistenceControl(),\n", + "}\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Comparison of formatting methods on the same data:\\n\")\n", + "\n", + "\n", + "\n", + "for name, method in methods.items():\n", + " try:\n", + " print(f\"{name}:\")\n", + " output = method.format_as_string(data)\n", + " print(f\"\\t{output[0][:80]}...\\n\")\n", + " except Exception as e:\n", + " print(f\"{name}: Error - {e}\\n\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Deep dive into JSONFormat\n", + "\n", + "In this section, we show an end-to-end use of the multivariate detector pipeline." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step-by-Step Pipeline Execution\n", + "\n", + "MLPipelines are composed of a sequence of primitives. These primitives apply transformation and calculation operations to the data and update the variables within the pipeline. To view the primitives used by the pipeline, we access its `primitives` attribute.\n", + "\n", + "The multivariate mistral detector with JSON format contains 12 primitives. We will observe how the context (which are the variables held within the pipeline) are updated after the execution of each primitive." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from mlblocks import MLPipeline\n", + "\n", + "pipeline = MLPipeline('multivariate_mistral_detector_jsonformat')\n", + "\n", + "hyperparameters = {\n", + " \"mlstars.custom.timeseries_preprocessing.time_segments_aggregate#1\": {\n", + " \"interval\": 21600\n", + " },\n", + " \"mlstars.custom.timeseries_preprocessing.rolling_window_sequences#1\": {\n", + " \"target_column\": 0,\n", + " \"window_size\": 25,\n", + " \"target_size\": 1,\n", + " \"step_size\": 1\n", + " },\n", + " \"sigllm.primitives.forecasting.huggingface.HF#1\": {\n", + " \"samples\": 1,\n", + " },\n", + " \"sigllm.primitives.formatting.json_format.format_as_integer#1\": {\n", + " \"trunc\": 1\n", + " }\n", + "}\n", + "pipeline.set_hyperparameters(hyperparameters)\n", + "\n", + "pipeline.primitives" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load Sample Data\n", + "\n", + "For this tutorial, we'll use a sample signal from the Orion dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from orion.data import load_signal\n", + "import matplotlib.pyplot as plt\n", + "\n", + "signal_data = load_signal('multivariate/E-2')\n", + "\n", + "start = 900\n", + "end = start + 200\n", + "signal_data = signal_data.iloc[start:end]\n", + "\n", + "plt.figure(figsize=(10, 4))\n", + "plt.plot(signal_data['timestamp'], signal_data['0'])\n", + "plt.title('Sample Time Series Data')\n", + "plt.xlabel('Timestamp')\n", + "plt.ylabel('Value')\n", + "plt.show()\n", + "\n", + "print(f\"Data shape: {signal_data.shape}\")\n", + "print(signal_data.head())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 0: Time Segments Aggregate\n", + "\n", + "This primitive creates an equi-spaced time series by aggregating values over fixed specified interval.\n", + "\n", + "* **input**: `X` which is an n-dimensional sequence of values.\n", + "* **output**:\n", + " * `X` sequence of aggregated values, one column for each aggregation method.\n", + " * `index` sequence of index values (first index of each aggregated segment)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "step = 0\n", + "context = pipeline.fit(signal_data, start_=step, output_=step)\n", + "print(f\"Context keys: {list(context.keys())}\")\n", + "print(f\"X shape: {context['X'].shape}\")\n", + "\n", + "for i, x in list(zip(context['index'], context['X']))[:5]:\n", + " print(f\"entry at {i} has value {x}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 1: SimpleImputer\n", + "\n", + "This primitive is an imputation transformer for filling missing values.\n", + "\n", + "* **input**: `X` which is an n-dimensional sequence of values.\n", + "* **output**: `y` which is a transformed version of 'X'." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "step = 1\n", + "context = pipeline.fit(**context, start_=step, output_=step)\n", + "print(f\"Context keys: {list(context.keys())}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 2: Float2Scalar\n", + "\n", + "This primitive converts float values into scalar up to certain decimal points.\n", + "\n", + "* **input**: `y` which is an n-dimensional sequence of values in float type.\n", + "* **output**: `X` which is a transformed version of 'y' in scalar." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "step = 2\n", + "context = pipeline.fit(**context, start_=step, output_=step)\n", + "print(f\"Context keys: {list(context.keys())}\")\n", + "\n", + "for i, x in list(zip(context['index'], context['X']))[:5]:\n", + " print(f\"entry at {i} has value {x}\")\n", + "\n", + "print(f\"\\nMinimum value stored: {context['minimum']}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 3: Rolling Window Sequences\n", + "\n", + "This primitive generates many sub-sequences of the original sequence using a rolling window approach.\n", + "\n", + "* **input**:\n", + " * `X` n-dimensional sequence to iterate over.\n", + " * `y` 1-dimensional target sequence.\n", + " * `index` array containing the index values of X.\n", + "* **output**:\n", + " * `X` input sequences.\n", + " * `y` target sequences.\n", + " * `index` first index value of each input sequence.\n", + " * `target_index` first index value of each target sequence." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "step = 3\n", + "context = pipeline.fit(**context, start_=step, output_=step)\n", + "print(f\"Context keys: {list(context.keys())}\")\n", + "\n", + "# After slicing X into multiple sub-sequences we obtain a 3D matrix\n", + "print(f\"\\nX shape = {context['X'].shape}\")\n", + "print(f\"y shape = {context['y'].shape}\")\n", + "print(f\"X index shape = {context['index'].shape}\")\n", + "print(f\"y index shape = {context['target_index'].shape}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 4: JSON Format as String\n", + "\n", + "This primitive converts each sequence of scalar values into a JSON-like string format (`d0:val,d1:val,...`). This is the key step that formats multivariate data for the LLM.\n", + "\n", + "* **input**: `X` which is an n-dimensional sequence of values.\n", + "* **output**: `X` which is a string representation version of X in JSON format." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "step = 4\n", + "context = pipeline.fit(**context, start_=step, output_=step)\n", + "print(f\"Context keys: {list(context.keys())}\")\n", + "\n", + "# Inspect the JSON-formatted string\n", + "if len(context['X']) > 0:\n", + " sample_string = context['X'][0]\n", + " print(f\"\\nSample JSON-formatted string (first 200 chars):\")\n", + " print(sample_string[:200] + \"...\")\n", + "else:\n", + " print(f\"\\nWarning: X is empty! Length: {len(context['X'])}\")\n", + " print(\"This may happen if the data segment is too small for the window size.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 5: HuggingFace Model (HF)\n", + "\n", + "This primitive prompts a HuggingFace model (Mistral) to forecast the next steps. The model is configured with `multivariate_allowed_symbols` to allow the JSON format tokens (`d`, `:`, `,`).\n", + "\n", + "* **input**: `X` input sequence (JSON-formatted strings).\n", + "* **output**: `y_hat` predicted sequence.\n", + "\n", + "
\n", + "Warning: This step is time consuming depending on the number of windows.\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "step = 5\n", + "context = pipeline.fit(**context, start_=step, output_=step)\n", + "print(f\"Context keys: {list(context.keys())}\")\n", + "\n", + "y_hat = context.get('y_hat', [])\n", + "if len(y_hat) > 0:\n", + " print(f\"\\ny_hat shape: {np.array(y_hat).shape}\")\n", + " print(f\"\\nSample predictions (first 3 windows):\")\n", + " for i, pred in enumerate(y_hat[:3]):\n", + " print(f\" Window {i}: {pred}\")\n", + "else:\n", + " print(\"\\nWarning: y_hat is empty!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 6: JSON Format as Integer\n", + "\n", + "This primitive parses the model output and extracts values from the JSON-like format back to integers.\n", + "\n", + "* **input**: `y_hat` which is a sequence of JSON-formatted string values.\n", + "* **output**: `y_hat` which is an integer representation version of y_hat." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "step = 6\n", + "context = pipeline.fit(**context, start_=step, output_=step)\n", + "print(f\"Context keys: {list(context.keys())}\")\n", + "\n", + "y_hat = np.array(context['y_hat'])\n", + "print(f\"\\ny_hat shape: {y_hat.shape}\")\n", + "print(f\"y_hat dtype: {y_hat.dtype}\")\n", + "print(f\"\\nSample parsed values (first 5):\")\n", + "print(y_hat[:5])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Steps 7 and 8: Scalar2Float\n", + "\n", + "This primitive converts integer values back into float and adds the minimum value that was stored earlier.\n", + "\n", + "* **input**:\n", + " * `y_hat` sequence of integer values.\n", + " * `minimum` value to add to shift by.\n", + "* **output**: `y_hat` which is a transformed version of `y_hat` in float." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "start_step = 7\n", + "output_step = 8\n", + "\n", + "context = pipeline.fit(**context, start_=start_step, output_=output_step)\n", + "print(f\"Context keys: {list(context.keys())}\")\n", + "\n", + "print(f\"\\ny_hat shape: {context['y_hat'].shape}\")\n", + "print(f\"\\nSample float values (first 5):\")\n", + "print(context['y_hat'][:5])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 9: Aggregate Rolling Window\n", + "\n", + "This primitive aggregates multiple horizon predictions into a single representation.\n", + "\n", + "* **input**:\n", + " * `y_hat` n-dimensional sequence of forecasted values.\n", + " * `agg` aggregation method, \"median\" by default.\n", + "* **output**: `y_hat` one-dimensional output sequence depicting the aggregated value of forecasts." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "step = 9\n", + "context = pipeline.fit(**context, start_=step, output_=step)\n", + "print(f\"Context keys: {list(context.keys())}\")\n", + "\n", + "print(f\"\\ny_hat shape: {context['y_hat'].shape}\")\n", + "print(f\"y_hat: {context['y_hat']}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 10: Reshape\n", + "\n", + "Reshape `y_hat` sequences to the expected format.\n", + "\n", + "* **input**: `y_hat` forecasted and aggregated sequences.\n", + "* **output**: `y_hat` reshaped sequences." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "step = 10\n", + "context = pipeline.fit(**context, start_=step, output_=step)\n", + "print(f\"Context keys: {list(context.keys())}\")\n", + "\n", + "print(f\"\\ny_hat shape: {context['y_hat'].shape}\")\n", + "\n", + "print(f\"y_hat: {context['y_hat']}\")\n", + "block = pipeline.blocks['sigllm.primitives.formatting.json_format.format_as_integer#1']\n", + "print(f\"format_as_integer hyperparameters: {block.get_hyperparameters()}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 11: Regression Errors\n", + "\n", + "This primitive computes the point-wise difference between `y` and `y_hat`.\n", + "\n", + "* **input**:\n", + " * `y` target sequences.\n", + " * `y_hat` forecasted sequences.\n", + "* **output**: `errors` computed errors." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "step = 11\n", + "context = pipeline.fit(**context, start_=step, output_=step)\n", + "print(f\"Context keys: {list(context.keys())}\")\n", + "\n", + "print(f\"\\nerrors shape: {context['errors'].shape}\")\n", + "print(f\"y_hat shape: {context['y_hat'].shape}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 12: Find Anomalies\n", + "\n", + "This primitive extracts anomalies from sequences of errors following the approach explained in the [related paper](https://arxiv.org/pdf/1802.04431.pdf).\n", + "\n", + "* **input**:\n", + " * `errors` array of errors.\n", + " * `target_index` array of indices of errors.\n", + "* **output**: `anomalies` array containing start-index, end-index, score for each anomalous sequence that was found." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "step = 12\n", + "context = pipeline.fit(**context, start_=step, output_=step)\n", + "print(f\"Context keys: {list(context.keys())}\")\n", + "\n", + "# Display detected anomalies\n", + "anomalies = context.get('anomalies', [])\n", + "if len(anomalies) > 0:\n", + " anomalies_df = pd.DataFrame(anomalies, columns=['start', 'end', 'score'])\n", + " print(\"\\nDetected anomalies:\")\n", + " display(anomalies_df)\n", + "else:\n", + " print(\"\\nNo anomalies detected.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Visualization\n", + "\n", + "Plot the original signal, forecasts, errors, and detected anomalies." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "index = context.get('target_index', [])\n", + "y = context.get('y', [])\n", + "yhat = context.get('y_hat', [])\n", + "errors = context.get('errors', [])\n", + "anomalies = context.get('anomalies', [])\n", + "\n", + "plt.figure(figsize=(12, 6))\n", + "plt.plot(index, y, label='Original', alpha=0.7)\n", + "if len(yhat) > 0:\n", + " plt.plot(index, yhat, label='Forecast', alpha=0.7)\n", + "\n", + "handles, labels = plt.gca().get_legend_handles_labels()\n", + "by_label = dict(zip(labels, handles))\n", + "plt.legend(by_label.values(), by_label.keys())\n", + "\n", + "plt.title('Multivariate Detector Pipeline Results')\n", + "plt.xlabel('Timestamp')\n", + "plt.ylabel('Value')\n", + "plt.tight_layout()\n", + "plt.show()`" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "orion310", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.18" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}