Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
132 changes: 132 additions & 0 deletions sigllm/pipelines/detector/mistral_detector_lst_context.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
{
"primitives": [
"mlstars.custom.timeseries_preprocessing.time_segments_aggregate",
"sklearn.impute.SimpleImputer",
"sigllm.primitives.transformation.Float2Scalar",
"mlstars.custom.timeseries_preprocessing.rolling_window_sequences",
"sigllm.primitives.prompting.timeseries_preprocessing.long_short_term_context",
"sigllm.primitives.transformation.format_as_string",
"sigllm.primitives.forecasting.huggingface.HF",
"sigllm.primitives.transformation.format_as_integer",
"sigllm.primitives.transformation.Scalar2Float",
"sigllm.primitives.transformation.Scalar2Float",
"sigllm.primitives.postprocessing.aggregate_rolling_window",
"numpy.reshape",
"orion.primitives.timeseries_errors.regression_errors",
"orion.primitives.timeseries_anomalies.find_anomalies"
],
"init_params": {
"mlstars.custom.timeseries_preprocessing.time_segments_aggregate#1": {
"time_column": "timestamp",
"interval": 21600,
"method": "mean"
},
"sigllm.primitives.transformation.Float2Scalar#1": {
"decimal": 2,
"rescale": true
},
"mlstars.custom.timeseries_preprocessing.rolling_window_sequences#1": {
"target_column": 0,
"window_size": 13,
"target_size": 1
},
"sigllm.primitives.prompting.timeseries_preprocessing.long_short_term_context#1": {
"L": 5,
"aggregation_method": "mean"
},
"sigllm.primitives.transformation.format_as_string#1": {
"space": false
},
"sigllm.primitives.forecasting.huggingface.HF#1": {
"name": "mistralai/Mistral-7B-Instruct-v0.2",
"steps": 5
},
"sigllm.primitives.transformation.format_as_integer#1": {
"trunc": 1,
"errors": "coerce"
},
"sigllm.primitives.postprocessing.aggregate_rolling_window#1": {
"agg": "median"
},
"orion.primitives.timeseries_anomalies.find_anomalies#1": {
"window_size_portion": 0.3,
"window_step_size_portion": 0.1,
"fixed_threshold": true
}
},
"input_names": {
"sigllm.primitives.transformation.Float2Scalar#1": {
"X": "y"
},
"mlstars.custom.timeseries_preprocessing.rolling_window_sequences#1": {
"X": "X",
"y": "y",
"index": "index"
},
"sigllm.primitives.prompting.timeseries_preprocessing.long_short_term_context#1": {
"X": "X",
"first_index": "index"
},
"sigllm.primitives.transformation.format_as_integer#1": {
"X": "y_hat"
},
"sigllm.primitives.transformation.Scalar2Float#1": {
"X": "y_hat",
"minimum": "minimum",
"decimal": "decimal"
},
"sigllm.primitives.transformation.Scalar2Float#2": {
"X": "y",
"minimum": "minimum",
"decimal": "decimal"
},
"sigllm.primitives.postprocessing.aggregate_rolling_window#1": {
"y": "y_hat"
},
"numpy.reshape#1": {
"X": "y_hat"
},
"orion.primitives.timeseries_anomalies.find_anomalies#1": {
"index": "target_index"
}
},
"output_names": {
"sklearn.impute.SimpleImputer#1": {
"X": "y"
},
"mlstars.custom.timeseries_preprocessing.rolling_window_sequences#1": {
"X": "X",
"y": "y",
"index": "index",
"target_index": "target_index"
},
"sigllm.primitives.prompting.timeseries_preprocessing.long_short_term_context#1": {
"X": "X",
"first_index": "first_index",
"window_size": "window_size",
"step_size": "step_size",
"dim": "dim"
},
"sigllm.primitives.forecasting.huggingface.HF#1": {
"y": "y_hat"
},
"sigllm.primitives.transformation.format_as_integer#1": {
"X": "y_hat"
},
"sigllm.primitives.transformation.Scalar2Float#1": {
"X": "y_hat"
},
"sigllm.primitives.transformation.Scalar2Float#2": {
"X": "y"
},
"sigllm.primitives.postprocessing.aggregate_rolling_window#1": {
"y": "y_hat"
},
"numpy.reshape#1": {
"X": "y_hat"
},
"orion.primitives.timeseries_anomalies.find_anomalies#1": {
"y": "anomalies"
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
{
"name": "sigllm.primitives.prompting.timeseries_preprocessing.long_short_term_context",
"contributors": [
"Allen Baranov <baranov@mit.edu>"
],
"description": "Process rolling window sequences to create long-term aggregated and short-term raw context windows",
"classifiers": {
"type": "preprocessor",
"subtype": "feature_extractor"
},
"modalities": [
"timeseries"
],
"primitive": "sigllm.primitives.prompting.timeseries_preprocessing.long_short_term_context",
"produce": {
"args": [
{
"name": "X",
"type": "ndarray"
},
{
"name": "first_index",
"type": "ndarray"
}
],
"output": [
{
"name": "X",
"type": "ndarray"
},
{
"name": "first_index",
"type": "ndarray"
},
{
"name": "window_size",
"type": "int"
},
{
"name": "step_size",
"type": "int"
},
{
"name": "dim",
"type": "int"
}
]
},
"hyperparameters": {
"fixed": {
"L": {
"type": "int",
"default": 5
},
"W": {
"type": "int",
"default": null
},
"S": {
"type": "int",
"default": null
},
"step_size": {
"type": "int",
"default": 1
},
"aggregation_method": {
"type": "str",
"default": "mean"
}
}
}
}
138 changes: 138 additions & 0 deletions sigllm/primitives/prompting/timeseries_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,141 @@ def rolling_window_sequences(X, window_size=500, step_size=100):
X_index.append(index[start])
start = start + step_size
return np.asarray(out_X), np.asarray(X_index), window_size, step_size, dim


def long_short_term_context(
    X,
    first_index,
    step_size,
    L,
    S=None,
    W=None,
    aggregation_method='mean',
    window_size=None,
    dim=None,
):
    """Compress rolling windows into long-term aggregates plus short-term raw values.

    Each input window is reduced to ``L + S`` points: the first ``L`` points are the
    means of ``L`` consecutive, non-overlapping segments of ``W`` values each (rounded
    to the nearest integer with ``numpy.round``), and the last ``S`` points are the
    final ``S`` values of the window cast to integers. When the window is longer than
    ``W * L + S``, the surplus values at the start of the window are dropped.

    Args:
        X (ndarray):
            Input windows with shape (num_windows, window_size) or
            (num_windows, window_size, dim).
        first_index (ndarray):
            First index value of each input sequence. Returned unchanged.
        step_size (int):
            Step size used to build the windows. Returned unchanged.
        L (int):
            Number of long-term segments to aggregate. Must be positive.
        S (int, optional):
            Number of short-term values to keep (last S values). If None, set to L.
            Defaults to None.
        W (int, optional):
            Size of each long-term segment. If None, computed as
            ``(window_size - S) // L``. Defaults to None.
        aggregation_method (str):
            Aggregation method for long-term segments. Currently only 'mean' is
            supported. Defaults to 'mean'.
        window_size (int, optional):
            Number of leading values of each window to consider. If None, taken
            from ``X.shape[1]``.
        dim (int, optional):
            Dimensionality of the data. If None, 1 for 2D input and ``X.shape[2]``
            for 3D input. Only reported back to the caller.

    Returns:
        tuple:
            * processed_windows_array (ndarray): Processed windows with shape
              (num_windows, S + L) for 2D input or (num_windows, S + L, dim)
              for 3D input.
            * first_index (ndarray): First index array (passed through).
            * new_window_size (int): New window size (S + L).
            * step_size (int): Step size (passed through).
            * dim (int): Dimensionality.

    Raises:
        ValueError:
            If ``aggregation_method`` is not 'mean', if ``X`` is not 2D or 3D, if
            ``L``, ``S`` or ``W`` are out of range, or if ``window_size`` cannot
            accommodate ``W * L + S`` values.
    """
    if aggregation_method != 'mean':
        raise ValueError(
            f"Aggregation method '{aggregation_method}' not yet supported. "
            "Only 'mean' is currently supported."
        )

    X = np.asarray(X)
    if X.ndim not in (2, 3):
        raise ValueError(f'X must be a 2D or 3D array, got {X.ndim} dimension(s).')

    if window_size is None:
        window_size = X.shape[1]
    elif window_size > X.shape[1]:
        # Slicing past the end would silently produce windows shorter than S + L.
        raise ValueError(
            f'window_size ({window_size}) exceeds the length of the '
            f'input windows ({X.shape[1]}).'
        )

    if dim is None:
        dim = 1 if X.ndim == 2 else X.shape[2]

    if L <= 0:
        raise ValueError(f'L ({L}) must be a positive integer.')

    if S is None:
        S = L
    elif S < 0:
        raise ValueError(f'S ({S}) must be a non-negative integer.')

    if W is None:
        if window_size < S:
            raise ValueError(f'window_size ({window_size}) must be at least S ({S})')
        W = (window_size - S) // L
        if W <= 0:
            raise ValueError(
                f'Cannot compute W: window_size ({window_size}) must be '
                f'at least S ({S}) + L ({L}).'
            )
    elif W <= 0:
        # An empty segment has no mean; fail early instead of emitting NaN-derived ints.
        raise ValueError(f'W ({W}) must be a positive integer.')

    long_span = W * L
    required_size = long_span + S
    remainder = window_size - required_size

    if remainder < 0:
        raise ValueError(
            f'window_size ({window_size}) is too small. '
            f'Need at least W*L + S = {W}*{L} + {S} = {required_size}.'
        )

    # Drop the surplus leading values so every window holds exactly W*L + S points.
    trimmed = X[:, remainder:window_size]

    # Split the long-term span into L segments of W values and average each segment;
    # any trailing feature axis of a 3D input is kept as-is.
    segments = trimmed[:, :long_span].reshape((X.shape[0], L, W) + X.shape[2:])
    long_term = np.round(segments.mean(axis=2)).astype(int)

    # Short-term values are cast (truncated, not rounded) to integers, as before.
    short_term = trimmed[:, long_span:]
    if short_term.dtype != np.int64:
        short_term = short_term.astype(int)

    processed_windows_array = np.concatenate([long_term, short_term], axis=1)
    new_window_size = S + L

    return processed_windows_array, first_index, new_window_size, step_size, dim
Loading