Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
{
"primitives": [
"mlstars.custom.timeseries_preprocessing.time_segments_aggregate",
"sklearn.impute.SimpleImputer",
"sigllm.primitives.transformation.Float2Scalar",
"mlstars.custom.timeseries_preprocessing.rolling_window_sequences",
"sigllm.primitives.formatting.json_format.format_as_string",
"sigllm.primitives.forecasting.huggingface.HF",
"sigllm.primitives.formatting.json_format.format_as_integer",
"sigllm.primitives.transformation.Scalar2Float",
"sigllm.primitives.transformation.Scalar2Float",
"sigllm.primitives.postprocessing.aggregate_rolling_window",
"numpy.reshape",
"orion.primitives.timeseries_errors.regression_errors",
"orion.primitives.timeseries_anomalies.find_anomalies"
],
"init_params": {
"mlstars.custom.timeseries_preprocessing.time_segments_aggregate#1": {
"time_column": "timestamp",
"interval": 21600,
"method": "mean"
},
"sigllm.primitives.transformation.Float2Scalar#1": {
"decimal": 2,
"rescale": true
},
"mlstars.custom.timeseries_preprocessing.rolling_window_sequences#1": {
"target_column": 0,
"window_size": 140,
"target_size": 1,
"step_size": 1
},
"sigllm.primitives.forecasting.huggingface.HF#1": {
"name": "mistralai/Mistral-7B-Instruct-v0.2",
"steps": 5,
"multivariate_allowed_symbols": [
"d",
":",
","
]
},
"sigllm.primitives.formatting.json_format.format_as_integer#1": {
"trunc": 1,
"target_column": 0
},
"sigllm.primitives.postprocessing.aggregate_rolling_window#1": {
"agg": "median"
},
"orion.primitives.timeseries_anomalies.find_anomalies#1": {
"window_size_portion": 0.3,
"window_step_size_portion": 0.1,
"fixed_threshold": true
}
},
"input_names": {
"sigllm.primitives.transformation.Float2Scalar#1": {
"X": "y"
},
"mlstars.custom.timeseries_preprocessing.rolling_window_sequences#1": {
"X": "y_scaled"
},
"sigllm.primitives.formatting.json_format.format_as_integer#1": {
"X": "y_hat"
},
"sigllm.primitives.transformation.Scalar2Float#1": {
"X": "y_hat",
"minimum": "minimum",
"decimal": "decimal"
},
"sigllm.primitives.transformation.Scalar2Float#2": {
"X": "y",
"minimum": "minimum",
"decimal": "decimal"
},
"sigllm.primitives.postprocessing.aggregate_rolling_window#1": {
"y": "y_hat"
},
"numpy.reshape#1": {
"X": "y_hat"
},
"orion.primitives.timeseries_anomalies.find_anomalies#1": {
"index": "target_index"
}
},
"output_names": {
"sklearn.impute.SimpleImputer#1": {
"X": "y"
},
"sigllm.primitives.transformation.Float2Scalar#1": {
"X": "y_scaled",
"minimum": "minimum",
"decimal": "decimal"
},
"sigllm.primitives.forecasting.huggingface.HF#1": {
"y": "y_hat"
},
"sigllm.primitives.formatting.json_format.format_as_integer#1": {
"X": "y_hat"
},
"sigllm.primitives.transformation.Scalar2Float#1": {
"X": "y_hat"
},
"sigllm.primitives.transformation.Scalar2Float#2": {
"X": "y"
},
"sigllm.primitives.postprocessing.aggregate_rolling_window#1": {
"y": "y_hat"
},
"numpy.reshape#1": {
"X": "y_hat"
},
"orion.primitives.timeseries_anomalies.find_anomalies#1": {
"y": "anomalies"
}
}
}
11 changes: 10 additions & 1 deletion sigllm/primitives/forecasting/huggingface.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
DEFAULT_PAD_TOKEN = '<pad>'

VALID_NUMBERS = list('0123456789')
VALID_MULTIVARIATE_SYMBOLS = []

DEFAULT_MODEL = 'mistralai/Mistral-7B-Instruct-v0.2'

Expand Down Expand Up @@ -41,6 +42,9 @@ class HF:
padding (int):
Additional padding token to forecast to reduce short horizon predictions.
Default to `0`.
multivariate_allowed_symbols (list):
List of token strings to allow in addition to digits when generating.
Default to `[]`.
"""

def __init__(
Expand All @@ -53,6 +57,7 @@ def __init__(
raw=False,
samples=1,
padding=0,
multivariate_allowed_symbols=[],
):
self.name = name
self.sep = sep
Expand All @@ -62,6 +67,7 @@ def __init__(
self.raw = raw
self.samples = samples
self.padding = padding
self.multivariate_allowed_symbols = multivariate_allowed_symbols

self.tokenizer = AutoTokenizer.from_pretrained(self.name, use_fast=False)

Expand All @@ -85,6 +91,9 @@ def __init__(
token = self.tokenizer.convert_tokens_to_ids(number)
valid_tokens.append(token)

for symbol in self.multivariate_allowed_symbols:
valid_tokens.append(self.tokenizer.convert_tokens_to_ids(symbol))

valid_tokens.append(self.tokenizer.convert_tokens_to_ids(self.sep))
self.invalid_tokens = [
[i] for i in range(len(self.tokenizer) - 1) if i not in valid_tokens
Expand Down Expand Up @@ -116,7 +125,7 @@ def forecast(self, X, **kwargs):
tokenized_input = self.tokenizer([text], return_tensors='pt').to('cuda')

input_length = tokenized_input['input_ids'].shape[1]
average_length = input_length / len(text.split(','))
average_length = input_length / len(text.split(self.sep))
max_tokens = (average_length + self.padding) * self.steps

generate_ids = self.model.generate(
Expand Down
19 changes: 19 additions & 0 deletions sigllm/primitives/formatting/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
"""Multivariate formatting methods for time series data."""

from sigllm.primitives.formatting.multivariate_formatting import MultivariateFormattingMethod
from sigllm.primitives.formatting.json_format import JSONFormat
from sigllm.primitives.formatting.univariate_control import UnivariateControl
from sigllm.primitives.formatting.persistence_control import PersistenceControl
from sigllm.primitives.formatting.value_concatenation import ValueConcatenation
from sigllm.primitives.formatting.value_interleave import ValueInterleave
from sigllm.primitives.formatting.digit_interleave import DigitInterleave

__all__ = [
'MultivariateFormattingMethod',
'JSONFormat',
'UnivariateControl',
'PersistenceControl',
'ValueConcatenation',
'ValueInterleave',
'DigitInterleave',
]
103 changes: 103 additions & 0 deletions sigllm/primitives/formatting/digit_interleave.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
import numpy as np

from sigllm.primitives.formatting.multivariate_formatting import MultivariateFormattingMethod


class DigitInterleave(MultivariateFormattingMethod):
    """Formatting method that interleaves digits from multiple values.

    Every value of a timestamp is zero-padded to a common width and the
    padded strings are then written out digit position by digit position.
    For example, the values ``12`` and ``345`` at width 3 are padded to
    ``"012"`` and ``"345"`` and emitted as ``"031425"``.
    """

    def __init__(self, verbose: bool = False, **kwargs):
        super().__init__('digit_interleave', verbose=verbose, **kwargs)

    def format_as_string(
        self, X: np.ndarray, digits_per_timestamp=3, separator=',', **kwargs
    ) -> list[str]:
        """Format each window of ``X`` as a string of interleaved digits.

        The padding width that was actually used is stored in
        ``self.metadata['width_used']`` so that ``format_as_integer`` can
        undo the interleaving later.

        Args:
            X (np.ndarray):
                Iterable of windows; each window is an iterable of timestamps
                and each timestamp an iterable of values convertible with
                ``int``.
            digits_per_timestamp (int):
                Minimum number of digits each value is padded to. The width is
                widened when a value has more digits than this.
            separator (str):
                String placed after every timestamp, including the last one.

        Returns:
            list[str]:
                One string per window.
        """
        # ``default=0`` keeps an empty ``X`` from raising ``ValueError``; the
        # width then simply falls back to ``digits_per_timestamp``.
        widest_value = max(
            (len(str(abs(int(v)))) for window in X for ts in window for v in ts),
            default=0,
        )
        width_used = max(digits_per_timestamp, widest_value)
        self.metadata['width_used'] = width_used

        def interleave_digits(timestamp):
            # NOTE: ``str.zfill`` counts a leading sign as part of the width,
            # while ``widest_value`` is computed from absolute values. Only the
            # first ``width_used`` characters of each padded value are emitted.
            padded_values = [str(int(val)).zfill(width_used) for val in timestamp]
            return ''.join(
                padded_val[digit_pos]
                for digit_pos in range(width_used)
                for padded_val in padded_values
            )

        return [
            separator.join(interleave_digits(timestamp) for timestamp in window) + separator
            for window in X
        ]

    def format_as_integer(
        self,
        X: list[str],
        separator=',',
        trunc=None,
        digits_per_timestamp=3,
        target_column=None,
        **kwargs,
    ) -> np.ndarray:
        """Parse interleaved digit strings back to integer arrays for the target column.

        Args:
            X (list[str]):
                One entry per window. Each entry is iterated as a collection
                of sample strings, and each sample string is a concatenation
                of interleaved digit values separated by ``separator``.
            separator (str):
                Separator between values.
            trunc (int):
                Number of timestamps to extract from each sample.
                If None, all timestamps are extracted.
            digits_per_timestamp (int):
                Number of digits per value. Only used when no width was
                recorded in ``self.metadata['width_used']`` by a previous
                call to ``format_as_string``.
            target_column (int):
                Which column to extract (default 0). Can also be set via config.

        Returns:
            np.ndarray:
                Object array with one row per entry of ``X``. Every parsed
                timestamp contributes a one-element array that holds the int
                value of the target column, or ``None`` when the timestamp is
                too short to contain that column.
        """
        # Prefer the width recorded while formatting; fall back to the caller
        # supplied width instead of failing when formatting never ran.
        try:
            width_used = self.metadata['width_used']
        except KeyError:
            width_used = digits_per_timestamp

        if target_column is None:
            target_column = self.config.get('target_column', 0)

        def deinterleave_timestamp_target_column(interleaved_str):
            """Convert interleaved digits back to original values and extract target dimension."""
            total_digits = len(interleaved_str)
            num_values = total_digits // width_used

            if target_column >= num_values:
                return np.array([None])

            # The digits of one value sit ``num_values`` characters apart,
            # starting at the offset of the target column.
            positions = (
                digit_pos * num_values + target_column for digit_pos in range(width_used)
            )
            value_digits = [interleaved_str[pos] for pos in positions if pos < total_digits]

            if value_digits:
                return np.array([int(''.join(value_digits))])
            return np.array([None])

        parsed = [
            [
                deinterleave_timestamp_target_column(timestamp)
                for sample in entry
                for timestamp in sample.strip(separator).split(separator)[:trunc]
                if timestamp.strip()
            ]
            for entry in X
        ]
        return np.array(parsed, dtype=object)
Loading