Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
101 changes: 101 additions & 0 deletions sigllm/pipelines/detector/mistral_detector_kmeans.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
{
"primitives": [
"mlstars.custom.timeseries_preprocessing.time_segments_aggregate",
"sklearn.impute.SimpleImputer",
"sigllm.primitives.transformation.Scalar2Cluster",
"mlstars.custom.timeseries_preprocessing.rolling_window_sequences",
"sigllm.primitives.transformation.format_as_string",
"sigllm.primitives.forecasting.huggingface.HF",
"sigllm.primitives.transformation.format_as_integer",
"sigllm.primitives.transformation.Cluster2Scalar",
"sigllm.primitives.transformation.Cluster2Scalar",
"sigllm.primitives.postprocessing.aggregate_rolling_window",
"numpy.reshape",
"orion.primitives.timeseries_errors.regression_errors",
"orion.primitives.timeseries_anomalies.find_anomalies"
],
"init_params": {
"mlstars.custom.timeseries_preprocessing.time_segments_aggregate#1": {
"time_column": "timestamp",
"interval": 21600,
"method": "mean"
},
"sigllm.primitives.transformation.Scalar2Cluster#1": {
"n_clusters": 100,
"fit_fraction": 1.0
},
"mlstars.custom.timeseries_preprocessing.rolling_window_sequences#1": {
"target_column": 0,
"window_size": 140,
"target_size": 1
},
"sigllm.primitives.transformation.format_as_string#1": {
"space": false
},
"sigllm.primitives.forecasting.huggingface.HF#1": {
"name": "mistralai/Mistral-7B-Instruct-v0.2",
"steps": 5
},
"sigllm.primitives.transformation.format_as_integer#1": {
"trunc": 1,
"errors": "coerce"
},
"sigllm.primitives.postprocessing.aggregate_rolling_window#1": {
"agg": "median"
},
"orion.primitives.timeseries_anomalies.find_anomalies#1": {
"window_size_portion": 0.3,
"window_step_size_portion": 0.1,
"fixed_threshold": true
}
},
"input_names": {
"sigllm.primitives.transformation.Scalar2Cluster#1": {
"X": "y"
},
"sigllm.primitives.transformation.Cluster2Scalar#1": {
"X": "y"
},
"sigllm.primitives.transformation.format_as_integer#1": {
"X": "y_hat"
},
"sigllm.primitives.transformation.Cluster2Scalar#2": {
"X": "y_hat"
},
"sigllm.primitives.postprocessing.aggregate_rolling_window#1": {
"y": "y_hat"
},
"numpy.reshape#1": {
"X": "y_hat"
},
"orion.primitives.timeseries_anomalies.find_anomalies#1": {
"index": "target_index"
}
},
"output_names": {
"sklearn.impute.SimpleImputer#1": {
"X": "y"
},
"sigllm.primitives.forecasting.huggingface.HF#1": {
"y": "y_hat"
},
"sigllm.primitives.transformation.format_as_integer#1": {
"X": "y_hat"
},
"sigllm.primitives.transformation.Cluster2Scalar#1": {
"X": "y"
},
"sigllm.primitives.transformation.Cluster2Scalar#2": {
"X": "y_hat"
},
"sigllm.primitives.postprocessing.aggregate_rolling_window#1": {
"y": "y_hat"
},
"numpy.reshape#1": {
"X": "y_hat"
},
"orion.primitives.timeseries_anomalies.find_anomalies#1": {
"y": "anomalies"
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
{
"name": "sigllm.primitives.transformation.Cluster2Scalar",
"contributors": [
"Allen Baranov <baranov@mit.edu>"
],
"description": "Transform cluster indices back to float values using centroids.",
"classifiers": {
"type": "preprocessor",
"subtype": "transformer"
},
"modalities": [],
"primitive": "sigllm.primitives.transformation.Cluster2Scalar",
"produce": {
"method": "transform",
"args": [
{
"name": "X",
"type": "ndarray"
},
{
"name": "centroids",
"type": "list"
}
],
"output": [
{
"name": "X",
"type": "ndarray"
}
]
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
{
"name": "sigllm.primitives.transformation.Scalar2Cluster",
"contributors": [
"Allen Baranov <baranov@mit.edu>"
],
"description": "Transform float values into cluster indices using K-means.",
"classifiers": {
"type": "preprocessor",
"subtype": "transformer"
},
"modalities": [],
"primitive": "sigllm.primitives.transformation.Scalar2Cluster",
"fit": {
"method": "fit",
"args": [
{
"name": "X",
"type": "ndarray"
}
]
},
"produce": {
"method": "transform",
"args": [
{
"name": "X",
"type": "ndarray"
}
],
"output": [
{
"name": "X",
"type": "ndarray"
},
{
"name": "centroids",
"type": "list"
}
]
},
"hyperparameters": {
"fixed": {
"n_clusters": {
"type": "int",
"default": 100
},
"fit_fraction": {
"type": "float",
"default": 1.0
}
}
}
}
102 changes: 101 additions & 1 deletion sigllm/primitives/transformation.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import re

import numpy as np
from sklearn.cluster import KMeans


def format_as_string(X, sep=',', space=False, single=False):
Expand Down Expand Up @@ -96,7 +97,7 @@ def format_as_integer(X, sep=',', trunc=None, errors='ignore'):
to `'ignore'`.
- If 'ignore', then invalid values will be ignored in the result.
- If 'filter', then invalid values will be filtered out of the string.
- If 'raise', then encountering invalud values will raise an exception.
- If 'raise', then encountering invalid values will raise an exception.
- If 'coerce', then invalid values will be set as NaN.

Returns:
Expand Down Expand Up @@ -188,3 +189,102 @@ def transform(self, X, minimum=0, decimal=2):
values = X * 10 ** (-decimal)

return values + minimum


class Scalar2Cluster:
"""Convert an array of float values to cluster indices using K-means.

Fits K-means on the input data and maps each value to the index of
its nearest centroid. Centroids are sorted in ascending order so that
cluster index 0 corresponds to the smallest centroid value.

Args:
n_clusters (int):
Number of K-means clusters. Default to ``100``.
fit_fraction (float):
Fraction of data to use for fitting K-means (0 < fit_fraction <= 1).
If less than 1, only the first fit_fraction of rows are used for fitting.
Default to ``1.0`` (use all data).
"""

def __init__(self, n_clusters=100, fit_fraction=1.0):
self.n_clusters = n_clusters
self.fit_fraction = fit_fraction
self.centroids = None

def fit(self, X):
"""Fit K-means on the data and store sorted centroids.

Args:
X (ndarray):
2-D array of shape (n_samples, n_features)

Returns:
No output. The method stores the fitted centroids in the
class instance instead.
"""
n_samples = X.shape[0]
n_fit = max(1, int(n_samples * self.fit_fraction))
X_fit = X[:n_fit]

centroids_list = []
for col in X_fit.T:
n_unique = len(np.unique(col))
if self.n_clusters >= n_unique:
centroids = np.sort(np.unique(col))
else:
kmeans = KMeans(n_clusters=self.n_clusters, random_state=0, n_init=10)
kmeans.fit(col.reshape(-1, 1))
centroids = np.sort(kmeans.cluster_centers_.ravel())
centroids_list.append(centroids)

self.centroids = centroids_list

def transform(self, X):
"""Map each value to its nearest centroid index.

Args:
X (ndarray):
2-D array of shape ``(n_samples, n_features)``.

Returns:
X (ndarray):
Integer cluster labels with the same shape as input.
centroids (list of ndarray):
Sorted centroid arrays, one per column.
"""
labels_list = []
for i, col in enumerate(X.T):
centroids = self.centroids[i]
col_labels = np.argmin(np.abs(col[:, None] - centroids[None, :]), axis=1)
labels_list.append(col_labels)

labels = (
np.column_stack(labels_list) if len(labels_list) > 1 else labels_list[0].reshape(-1, 1)
)
return labels, self.centroids


class Cluster2Scalar:
"""Convert cluster indices back to float values using centroids.

Maps an array of integer cluster indices to the corresponding
centroid values produced by Scalar2Cluster.
"""

def transform(self, X, centroids):
"""Convert cluster indices to centroid float values.

Args:
X (ndarray):
Integer cluster labels.
centroids (list of ndarray):
Sorted centroid arrays from Scalar2Cluster.

Returns:
ndarray:
Float values corresponding to the centroid of each label.
"""
base_centroids = np.asarray(centroids[0])
idx = np.clip(X.astype(int), 0, len(base_centroids) - 1)
return np.take(base_centroids, idx)
Loading
Loading