sintel-dev · abaranov25 · Dec 8, 2025 · Dec 8, 2025 · Feb 11, 2026 · Feb 16, 2026
diff --git a/sigllm/pipelines/detector/mistral_detector_kmeans.json b/sigllm/pipelines/detector/mistral_detector_kmeans.json
@@ -0,0 +1,101 @@
+{
+    "primitives": [
+        "mlstars.custom.timeseries_preprocessing.time_segments_aggregate",
+        "sklearn.impute.SimpleImputer",
+        "sigllm.primitives.transformation.Scalar2Cluster",
+        "mlstars.custom.timeseries_preprocessing.rolling_window_sequences",
+        "sigllm.primitives.transformation.format_as_string",
+        "sigllm.primitives.forecasting.huggingface.HF",
+        "sigllm.primitives.transformation.format_as_integer",
+        "sigllm.primitives.transformation.Cluster2Scalar",
+        "sigllm.primitives.transformation.Cluster2Scalar",
+        "sigllm.primitives.postprocessing.aggregate_rolling_window",
+        "numpy.reshape",
+        "orion.primitives.timeseries_errors.regression_errors",
+        "orion.primitives.timeseries_anomalies.find_anomalies"
+    ],
+    "init_params": {
+        "mlstars.custom.timeseries_preprocessing.time_segments_aggregate#1": {
+            "time_column": "timestamp",
+            "interval": 21600,
+            "method": "mean"
+        },
+        "sigllm.primitives.transformation.Scalar2Cluster#1": {
+            "n_clusters": 100,
+            "fit_fraction": 1.0
+        },
+        "mlstars.custom.timeseries_preprocessing.rolling_window_sequences#1": {
+            "target_column": 0,
+            "window_size": 140,
+            "target_size": 1
+        },
+        "sigllm.primitives.transformation.format_as_string#1": {
+            "space": false
+        },
+        "sigllm.primitives.forecasting.huggingface.HF#1": {
+            "name": "mistralai/Mistral-7B-Instruct-v0.2",
+            "steps": 5
+        },
+        "sigllm.primitives.transformation.format_as_integer#1": {
+            "trunc": 1,
+            "errors": "coerce"
+        },
+        "sigllm.primitives.postprocessing.aggregate_rolling_window#1": {
+            "agg": "median"
+        },
+        "orion.primitives.timeseries_anomalies.find_anomalies#1": {
+            "window_size_portion": 0.3,
+            "window_step_size_portion": 0.1,
+            "fixed_threshold": true
+        }
+    },
+    "input_names": {
+        "sigllm.primitives.transformation.Scalar2Cluster#1": {
+            "X": "y"
+        },
+        "sigllm.primitives.transformation.Cluster2Scalar#1": {
+            "X": "y"
+        },
+        "sigllm.primitives.transformation.format_as_integer#1": {
+            "X": "y_hat"
+        },
+        "sigllm.primitives.transformation.Cluster2Scalar#2": {
+            "X": "y_hat"
+        },
+        "sigllm.primitives.postprocessing.aggregate_rolling_window#1": {
+            "y": "y_hat"
+        },
+        "numpy.reshape#1": {
+            "X": "y_hat"
+        },
+        "orion.primitives.timeseries_anomalies.find_anomalies#1": {
+            "index": "target_index"
+        }
+    },
+    "output_names": {
+        "sklearn.impute.SimpleImputer#1": {
+            "X": "y"
+        },
+        "sigllm.primitives.forecasting.huggingface.HF#1": {
+            "y": "y_hat"
+        },
+        "sigllm.primitives.transformation.format_as_integer#1": {
+            "X": "y_hat"
+        },
+        "sigllm.primitives.transformation.Cluster2Scalar#1": {
+            "X": "y"
+        },
+        "sigllm.primitives.transformation.Cluster2Scalar#2": {
+            "X": "y_hat"
+        },
+        "sigllm.primitives.postprocessing.aggregate_rolling_window#1": {
+            "y": "y_hat"
+        },
+        "numpy.reshape#1": {
+            "X": "y_hat"
+        },
+        "orion.primitives.timeseries_anomalies.find_anomalies#1": {
+            "y": "anomalies"
+        }
+    }
+}
diff --git a/sigllm/primitives/jsons/sigllm.primitives.transformation.Cluster2Scalar.json b/sigllm/primitives/jsons/sigllm.primitives.transformation.Cluster2Scalar.json
@@ -0,0 +1,32 @@
+{
+    "name": "sigllm.primitives.transformation.Cluster2Scalar",
+    "contributors": [
+        "Allen Baranov <baranov@mit.edu>"
+    ],
+    "description": "Transform cluster indices back to float values using centroids.",
+    "classifiers": {
+        "type": "preprocessor",
+        "subtype": "transformer"
+    },
+    "modalities": [],
+    "primitive": "sigllm.primitives.transformation.Cluster2Scalar",
+    "produce": {
+        "method": "transform",
+        "args": [
+            {
+                "name": "X",
+                "type": "ndarray"
+            },
+            {
+                "name": "centroids",
+                "type": "list"
+            }
+        ],
+        "output": [
+            {
+                "name": "X",
+                "type": "ndarray"
+            }
+        ]
+    }
+}
diff --git a/sigllm/primitives/jsons/sigllm.primitives.transformation.Scalar2Cluster.json b/sigllm/primitives/jsons/sigllm.primitives.transformation.Scalar2Cluster.json
@@ -0,0 +1,53 @@
+{
+    "name": "sigllm.primitives.transformation.Scalar2Cluster",
+    "contributors": [
+        "Allen Baranov <baranov@mit.edu>"
+    ],
+    "description": "Transform float values into cluster indices using K-means.",
+    "classifiers": {
+        "type": "preprocessor",
+        "subtype": "transformer"
+    },
+    "modalities": [],
+    "primitive": "sigllm.primitives.transformation.Scalar2Cluster",
+    "fit": {
+        "method": "fit",
+        "args": [
+            {
+                "name": "X",
+                "type": "ndarray"
+            }
+        ]
+    },
+    "produce": {
+        "method": "transform",
+        "args": [
+            {
+                "name": "X",
+                "type": "ndarray"
+            }
+        ],
+        "output": [
+            {
+                "name": "X",
+                "type": "ndarray"
+            },
+            {
+                "name": "centroids",
+                "type": "list"
+            }
+        ]
+    },
+    "hyperparameters": {
+        "fixed": {
+            "n_clusters": {
+                "type": "int",
+                "default": 100
+            },
+            "fit_fraction": {
+                "type": "float",
+                "default": 1.0
+            }
+        }
+    }
+}
diff --git a/sigllm/primitives/transformation.py b/sigllm/primitives/transformation.py
@@ -4,6 +4,7 @@
 import re
 
 import numpy as np
+from sklearn.cluster import KMeans
 
 
 def format_as_string(X, sep=',', space=False, single=False):
@@ -96,7 +97,7 @@ def format_as_integer(X, sep=',', trunc=None, errors='ignore'):
             to `'ignore'`.
             - If 'ignore', then invalid values will be ignored in the result.
             - If 'filter', then invalid values will be filtered out of the string.
-            - If 'raise', then encountering invalud values will raise an exception.
+            - If 'raise', then encountering invalid values will raise an exception.
             - If 'coerce', then invalid values will be set as NaN.
 
     Returns:
@@ -188,3 +189,102 @@ def transform(self, X, minimum=0, decimal=2):
         values = X * 10 ** (-decimal)
 
         return values + minimum
+
+
+class Scalar2Cluster:
+    """Convert an array of float values to cluster indices using K-means.
+
+    One set of centroids is fitted per column of the input. Centroids are
+    sorted in ascending order so that cluster index 0 corresponds to the
+    smallest centroid value, and each value is mapped to the index of its
+    nearest centroid.
+
+    Args:
+        n_clusters (int):
+            Number of K-means clusters. Defaults to ``100``.
+        fit_fraction (float):
+            Fraction of data to use for fitting K-means (0 < fit_fraction <= 1).
+            If less than 1, only the first fit_fraction of rows are used for fitting.
+            Defaults to ``1.0`` (use all data).
+    """
+
+    def __init__(self, n_clusters=100, fit_fraction=1.0):
+        self.n_clusters = n_clusters
+        self.fit_fraction = fit_fraction
+        self.centroids = None
+
+    def fit(self, X):
+        """Fit K-means on each column of the data and store sorted centroids.
+
+        When a column has no more distinct values than ``n_clusters``, its
+        distinct values are used directly as centroids and K-means is skipped.
+
+        Args:
+            X (ndarray):
+                2-D array of shape ``(n_samples, n_features)``.
+
+        Returns:
+            No output. The method stores the fitted centroids in the
+            class instance instead.
+        """
+        n_samples = X.shape[0]
+        # Use at least one row even when the fraction rounds down to zero.
+        n_fit = max(1, int(n_samples * self.fit_fraction))
+        X_fit = X[:n_fit]
+
+        centroids_list = []
+        for col in X_fit.T:
+            # np.unique already returns the distinct values in ascending order.
+            unique_values = np.unique(col)
+            if self.n_clusters >= len(unique_values):
+                centroids = unique_values
+            else:
+                kmeans = KMeans(n_clusters=self.n_clusters, random_state=0, n_init=10)
+                kmeans.fit(col.reshape(-1, 1))
+                centroids = np.sort(kmeans.cluster_centers_.ravel())
+            centroids_list.append(centroids)
+
+        self.centroids = centroids_list
+
+    def transform(self, X):
+        """Map each value to its nearest centroid index.
+
+        Ties between two equally distant centroids resolve to the lower index.
+
+        Args:
+            X (ndarray):
+                2-D array of shape ``(n_samples, n_features)``.
+
+        Returns:
+            X (ndarray):
+                Integer cluster labels with the same shape as input.
+            centroids (list of ndarray):
+                Sorted centroid arrays, one per column.
+
+        Raises:
+            ValueError:
+                If ``fit`` has not been called yet.
+        """
+        if self.centroids is None:
+            raise ValueError('Scalar2Cluster must be fitted before calling transform.')
+
+        labels_list = []
+        for i, col in enumerate(X.T):
+            centroids = self.centroids[i]
+            # Pairwise |value - centroid| table of shape (n_samples, n_centroids).
+            distances = np.abs(col[:, None] - centroids[None, :])
+            labels_list.append(np.argmin(distances, axis=1))
+
+        # column_stack turns every 1-D label vector into one output column,
+        # so a single feature still yields shape (n_samples, 1).
+        labels = np.column_stack(labels_list)
+        return labels, self.centroids
+
+
+class Cluster2Scalar:
+    """Convert cluster indices back to float values using centroids.
+
+    Maps an array of integer cluster indices to the corresponding
+    centroid values produced by Scalar2Cluster.
+    """
+
+    def transform(self, X, centroids):
+        """Convert cluster indices to centroid float values.
+
+        Only the first centroid array (``centroids[0]``) is used for the lookup.
+        Labels outside the valid range are clipped to the first or last
+        centroid, and NaN labels are returned as NaN instead of being cast
+        to an arbitrary integer.
+
+        Args:
+            X (ndarray):
+                Cluster labels. May contain NaN for missing values.
+            centroids (list of ndarray):
+                Sorted centroid arrays from Scalar2Cluster.
+
+        Returns:
+            ndarray:
+                Array with the same shape as ``X`` holding the centroid value
+                of each label, with NaN wherever the label was NaN.
+        """
+        base_centroids = np.asarray(centroids[0])
+        labels = np.asarray(X, dtype=float)
+
+        # Casting NaN to int is undefined, so mask missing labels before indexing.
+        missing = np.isnan(labels)
+        # Clip before the integer cast so huge or infinite labels cannot overflow.
+        idx = np.clip(np.where(missing, 0, labels), 0, len(base_centroids) - 1).astype(int)
+        values = np.take(base_centroids, idx)
+
+        if missing.any():
+            values = np.where(missing, np.nan, values)
+        return values