From 2ba851e4c991963a5ba34136bee385e266f12a64 Mon Sep 17 00:00:00 2001 From: Salim Cherkaoui Date: Mon, 31 Mar 2025 16:33:33 -0400 Subject: [PATCH 01/27] Core Changed to get normal behavior in pipeline --- sigllm/core.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/sigllm/core.py b/sigllm/core.py index 3e55407..c860115 100644 --- a/sigllm/core.py +++ b/sigllm/core.py @@ -100,8 +100,8 @@ def __repr__(self): return ('SigLLM:\n{}\nhyperparameters:\n{}\n').format(pipeline, hyperparameters) - def detect(self, data: pd.DataFrame, visualization: bool = False, **kwargs) -> pd.DataFrame: - """Detect anomalies in the given data.. + def detect(self, data: pd.DataFrame, normal: pd.DataFrame = None, visualization: bool = False, **kwargs) -> pd.DataFrame: + """Detect anomalies in the given data. If ``visualization=True``, also return the visualization outputs from the MLPipeline object. @@ -110,6 +110,10 @@ def detect(self, data: pd.DataFrame, visualization: bool = False, **kwargs) -> p data (DataFrame): Input data, passed as a ``pandas.DataFrame`` containing exactly two columns: timestamp and value. + normal (DataFrame, optional): + Normal reference data for one-shot learning, passed as a ``pandas.DataFrame`` + containing exactly two columns: timestamp and value. If None, zero-shot + learning is used. Default to None. visualization (bool): If ``True``, also capture the ``visualization`` named output from the ``MLPipeline`` and return it as a second @@ -125,6 +129,9 @@ def detect(self, data: pd.DataFrame, visualization: bool = False, **kwargs) -> p if not self._fitted: self._mlpipeline = self._get_mlpipeline() + if normal is not None: + kwargs['normal'] = normal + result = self._detect(self._mlpipeline.fit, data, visualization, **kwargs) self._fitted = True From bc285a1f1a0695f7529483b6ca2698cc01bdaf08 Mon Sep 17 00:00:00 2001 From: Salim Cherkaoui Date: Mon, 31 Mar 2025 16:37:08 -0400 Subject: [PATCH 02/27] Transformation changed --- sigllm/primitives/transformation.py | 119 +++++++++++++++++++++++++--- 1 file changed, 106 insertions(+), 13 deletions(-) diff --git a/sigllm/primitives/transformation.py b/sigllm/primitives/transformation.py index 41a98fc..1813d58 100644 --- a/sigllm/primitives/transformation.py +++ b/sigllm/primitives/transformation.py @@ -6,34 +6,44 @@ import numpy as np -def format_as_string(X, sep=',', space=False): +def format_as_string(X, sep=',', space=False, normal=False): """Format X to a list of string. - Transform a 2-D array of integers to a list of strings, - seperated by the indicated seperator and space. + Transform an array of integers to string(s), separated by the indicated separator and space. + Handles two cases: + - If normal=True, treats X as a single time series (window_size, 1) + - If normal=False, treats X as multiple windows (num_windows, window_size, 1) Args: sep (str): String to separate each element in X. Default to `','`. space (bool): Whether to add space between each digit in the result. Default to `False`. + normal (bool): + Whether to treat X as a normal time series. If True, expects (window_size, 1) + and returns a single string. If False, expects (num_windows, window_size, 1) + and returns a list of strings. Default to `False`. Returns: - ndarray: - A list of string representation of each row. + ndarray or str: + If normal=True, returns a single string representation. + If normal=False, returns a list of string representations for each window. """ + def _as_string(x): text = sep.join(list(map(str, x.flatten()))) - if space: text = ' '.join(text) - return text - results = list(map(_as_string, X)) - - return np.array(results) + if normal: + # Handle as single time series (window_size, 1) + return _as_string(X) + else: + # Handle as multiple windows (num_windows, window_size, 1) + results = list(map(_as_string, X)) + return np.array(results) def _from_string_to_integer(text, sep=',', trunc=None, errors='ignore'): @@ -74,6 +84,7 @@ def format_as_integer(X, sep=',', trunc=None, errors='ignore'): Transforms a list of list of string input as 3-D array of integers, seperated by the indicated seperator and truncated based on `trunc`. + Handles empty strings by returning empty arrays. Args: sep (str): @@ -91,7 +102,7 @@ def format_as_integer(X, sep=',', trunc=None, errors='ignore'): Returns: ndarray: - An array of digits values. + An array of digits values. Empty arrays for empty strings. """ result = list() for string_list in X: @@ -100,8 +111,11 @@ def format_as_integer(X, sep=',', trunc=None, errors='ignore'): raise ValueError('Input is not a list of lists.') for text in string_list: - scalar = _from_string_to_integer(text, sep, trunc, errors) - sample.append(scalar) + if not text: # Handle empty string + sample.append(np.array([], dtype=float)) + else: + scalar = _from_string_to_integer(text, sep, trunc, errors) + sample.append(scalar) result.append(sample) @@ -147,6 +161,7 @@ def transform(self, X): values = sign * (values * 10**self.decimal).astype(int) + return values, self.minimum, self.decimal @@ -171,3 +186,81 @@ def transform(self, X, minimum=0, decimal=2): values = X * 10 ** (-decimal) return values + minimum + +from typing import List +import re + +def parse_anomaly_response(X): + """ + Parse a list of lists of LLM responses to extract anomaly values and format them as strings. + + Args: + X (List[List[str]]): List of lists of response texts from the LLM in the format + "Answer: no anomalies" or "Answer: [val1, val2, ..., valN]" + + Returns: + List[List[str]]: List of lists of parsed responses where each element is either + "val1,val2,...,valN" if anomalies are found, + or empty string if no anomalies are present + """ + + def parse_single_response(text: str) -> str: + # Clean the input text + text = text.strip().lower() + + # Check for "no anomalies" case + if "no anomalies" in text or "no anomaly" in text: + return "" + + # Try to extract the values using regex + # Match anything inside square brackets that consists of digits and commas + pattern = r'\[([\d\s,]+)\]' + match = re.search(pattern, text) + + if match: + # Extract the content inside brackets and clean it + values = match.group(1) + # Split by comma, strip whitespace, and filter out empty strings + values = [val.strip() for val in values.split(',') if val.strip()] + # Join the values with commas + return ','.join(values) + + # Return empty string if no valid format is found + return "" + + # Process each list of responses in the input + result = [] + for response_list in X: + # Process each response in the inner list + parsed_list = [parse_single_response(response) for response in response_list] + result.append(parsed_list) + + #return np.array(result, dtype=object) + return result + +def format_as_single_string(X, sep=',', space=False): + """Format a single time series to a string. + + Transform a 1-D array of integers to a single string, + separated by the indicated separator and space. + + Args: + sep (str): + String to separate each element in X. Default to `','`. + space (bool): + Whether to add space between each digit in the result. Default to `False`. + + Returns: + str: + A string representation of the time series. + """ + # Ensure X is 1D + if X.ndim > 1: + X = X.flatten() + + text = sep.join(list(map(str, X))) + + if space: + text = ' '.join(text) + + return text \ No newline at end of file From 502603cc09fcecd777635d0d1b2587cdfcc96f17 Mon Sep 17 00:00:00 2001 From: Salim Cherkaoui Date: Mon, 31 Mar 2025 16:38:28 -0400 Subject: [PATCH 03/27] anomalies.py changed --- sigllm/primitives/prompting/anomalies.py | 14 +++++++++++--- sigllm/primitives/transformation.py | 2 +- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/sigllm/primitives/prompting/anomalies.py b/sigllm/primitives/prompting/anomalies.py index d70164d..c628237 100644 --- a/sigllm/primitives/prompting/anomalies.py +++ b/sigllm/primitives/prompting/anomalies.py @@ -35,6 +35,8 @@ def val2idx(y, X): idx_win_list.append(indices) idx_list.append(idx_win_list) idx_list = np.array(idx_list, dtype=object) + + return idx_list @@ -57,7 +59,6 @@ def find_anomalies_in_windows(y, alpha=0.5): idx_list = [] for samples in y: min_vote = np.ceil(alpha * len(samples)) - # print(type(samples.tolist())) flattened_res = np.concatenate(samples.tolist()) @@ -67,6 +68,7 @@ def find_anomalies_in_windows(y, alpha=0.5): idx_list.append(final_list) idx_list = np.array(idx_list, dtype=object) + return idx_list @@ -112,7 +114,7 @@ def format_anomalies(y, timestamp, padding_size=50): Args: y (ndarray): - A 1-dimensional array of indices. + A 1-dimensional array of indices. Can be empty if no anomalies are found. timestamp (ndarray): List of full timestamp of the signal. padding_size (int): @@ -120,8 +122,13 @@ def format_anomalies(y, timestamp, padding_size=50): Returns: List[Tuple]: - List of intervals (start, end, score). + List of intervals (start, end, score). Empty list if no anomalies are found. """ + + # Handle empty array case + if len(y) == 0: + return [] + y = timestamp[y] # Convert list of indices into list of timestamps start, end = timestamp[0], timestamp[-1] interval = timestamp[1] - timestamp[0] @@ -151,4 +158,5 @@ def format_anomalies(y, timestamp, padding_size=50): merged_intervals.append(current_interval) # Append the current interval if no overlap merged_intervals = [(interval[0], interval[1], 0) for interval in merged_intervals] + return merged_intervals diff --git a/sigllm/primitives/transformation.py b/sigllm/primitives/transformation.py index 1813d58..d16632e 100644 --- a/sigllm/primitives/transformation.py +++ b/sigllm/primitives/transformation.py @@ -27,7 +27,7 @@ def format_as_string(X, sep=',', space=False, normal=False): Returns: ndarray or str: If normal=True, returns a single string representation. - If normal=False, returns a list of string representations for each window. + If normal=False, returns a list of string representations for each wprintindow. """ From 0769e90c178708a0e849cceff098158152f2c096 Mon Sep 17 00:00:00 2001 From: Salim Cherkaoui Date: Mon, 31 Mar 2025 16:40:13 -0400 Subject: [PATCH 04/27] Hugginface.py changed : no restrictions token, and also has normal as input if 1-shot --- sigllm/primitives/prompting/huggingface.py | 90 ++++++++++++++++------ 1 file changed, 65 insertions(+), 25 deletions(-) diff --git a/sigllm/primitives/prompting/huggingface.py b/sigllm/primitives/prompting/huggingface.py index ac33874..d744392 100644 --- a/sigllm/primitives/prompting/huggingface.py +++ b/sigllm/primitives/prompting/huggingface.py @@ -47,6 +47,8 @@ class HF: padding (int): Additional padding token to forecast to reduce short horizon predictions. Default to `0`. + restrict_tokens (bool): + Whether to restrict tokens or not. Default to `True`. """ def __init__( @@ -59,6 +61,7 @@ def __init__( raw=False, samples=10, padding=0, + restrict_tokens=False, ): self.name = name self.sep = sep @@ -68,6 +71,7 @@ def __init__( self.raw = raw self.samples = samples self.padding = padding + self.restrict_tokens = restrict_tokens self.tokenizer = AutoTokenizer.from_pretrained(self.name, use_fast=False) @@ -85,16 +89,19 @@ def __init__( self.tokenizer.add_special_tokens(special_tokens_dict) self.tokenizer.pad_token = self.tokenizer.eos_token # indicate the end of the time series - # invalid tokens - valid_tokens = [] - for number in VALID_NUMBERS: - token = self.tokenizer.convert_tokens_to_ids(number) - valid_tokens.append(token) + # Only set up invalid tokens if restriction is enabled + if self.restrict_tokens: + valid_tokens = [] + for number in VALID_NUMBERS: + token = self.tokenizer.convert_tokens_to_ids(number) + valid_tokens.append(token) - valid_tokens.append(self.tokenizer.convert_tokens_to_ids(self.sep)) - self.invalid_tokens = [ - [i] for i in range(len(self.tokenizer) - 1) if i not in valid_tokens - ] + valid_tokens.append(self.tokenizer.convert_tokens_to_ids(self.sep)) + self.invalid_tokens = [ + [i] for i in range(len(self.tokenizer) - 1) if i not in valid_tokens + ] + else: + self.invalid_tokens = None self.model = AutoModelForCausalLM.from_pretrained( self.name, @@ -104,12 +111,15 @@ def __init__( self.model.eval() - def detect(self, X, **kwargs): + def detect(self, X, normal=None, **kwargs): """Use HF to detect anomalies of a signal. Args: X (ndarray): Input sequences of strings containing signal values + normal (str, optional): + A normal reference sequence for one-shot learning. If None, + zero-shot learning is used. Default to None. Returns: list, list: @@ -120,31 +130,61 @@ def detect(self, X, **kwargs): max_tokens = input_length * float(self.anomalous_percent) all_responses, all_generate_ids = [], [] + # Prepare the one-shot example if provided + one_shot_message = "" + if normal is not None: + one_shot_message = PROMPTS['one_shot_prefix'] + normal + "\n\n" + for text in tqdm(X): + system_message = PROMPTS['system_message'] user_message = PROMPTS['user_message'] - message = ' '.join([system_message, user_message, text, '[RESPONSE]']) - - input_length = len(self.tokenizer.encode(message[0])) + + # Combine messages with one-shot example if provided + message = ' '.join([ + system_message, + one_shot_message, + user_message, + text, + '[RESPONSE]' + ]) + + input_length = len(self.tokenizer.encode(message)) tokenized_input = self.tokenizer(message, return_tensors='pt').to('cuda') - generate_ids = self.model.generate( + generate_kwargs = { **tokenized_input, - do_sample=True, - max_new_tokens=max_tokens, - temperature=self.temp, - top_p=self.top_p, - bad_words_ids=self.invalid_tokens, - renormalize_logits=True, - num_return_sequences=self.samples, - ) - - responses = self.tokenizer.batch_decode( - generate_ids[:, input_length:], + 'do_sample': True, + 'max_new_tokens': max_tokens, + 'temperature': self.temp, + 'top_p': self.top_p, + 'renormalize_logits': True, + 'num_return_sequences': self.samples, + } + + # Only add bad_words_ids if token restriction is enabled + # if self.restrict_tokens: + # generate_kwargs['bad_words_ids'] = self.invalid_tokens + + generate_ids = self.model.generate(**generate_kwargs) + + # Get the full generated text + full_responses = self.tokenizer.batch_decode( + generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False, ) + + # Extract only the part after [RESPONSE] + responses = [] + for full_response in full_responses: + try: + response = full_response.split('[RESPONSE]')[1].strip() + responses.append(response) + except IndexError: + responses.append("") # If no [RESPONSE] found, return empty string + all_responses.append(responses) all_generate_ids.append(generate_ids) From 2929f8b3f475f255309bd5b075b4d6412d3caa25 Mon Sep 17 00:00:00 2001 From: Salim Cherkaoui Date: Mon, 31 Mar 2025 16:41:05 -0400 Subject: [PATCH 05/27] Timeseries preprocessing.py --- sigllm/primitives/prompting/timeseries_preprocessing.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sigllm/primitives/prompting/timeseries_preprocessing.py b/sigllm/primitives/prompting/timeseries_preprocessing.py index e5d3644..fa193a9 100644 --- a/sigllm/primitives/prompting/timeseries_preprocessing.py +++ b/sigllm/primitives/prompting/timeseries_preprocessing.py @@ -8,7 +8,7 @@ import numpy as np -def rolling_window_sequences(X, window_size=500, step_size=100): +def rolling_window_sequences(X, window_size=50, step_size=10): """Create rolling window sequences out of time series data. This function creates an array of sequences by rolling over the input sequence. @@ -37,5 +37,4 @@ def rolling_window_sequences(X, window_size=500, step_size=100): out_X.append(X[start:end]) X_index.append(index[start]) start = start + step_size - return np.asarray(out_X), np.asarray(X_index), window_size, step_size From 785f96d6f4f6d6055458594e09631ea8ad8b83fc Mon Sep 17 00:00:00 2001 From: Salim Cherkaoui Date: Mon, 31 Mar 2025 16:45:51 -0400 Subject: [PATCH 06/27] jsons files added for primitives --- ...m.primitives.prompting.huggingface.HF.json | 5 ++++ ...reprocessing.rolling_window_sequences.json | 4 +-- ...tives.transformation.format_as_string.json | 6 ++++- ...transformation.parse_anomaly_response.json | 25 +++++++++++++++++++ 4 files changed, 37 insertions(+), 3 deletions(-) create mode 100644 sigllm/primitives/jsons/sigllm.primitives.transformation.parse_anomaly_response.json diff --git a/sigllm/primitives/jsons/sigllm.primitives.prompting.huggingface.HF.json b/sigllm/primitives/jsons/sigllm.primitives.prompting.huggingface.HF.json index 91d0530..a5f6df8 100644 --- a/sigllm/primitives/jsons/sigllm.primitives.prompting.huggingface.HF.json +++ b/sigllm/primitives/jsons/sigllm.primitives.prompting.huggingface.HF.json @@ -16,6 +16,11 @@ { "name": "X", "type": "ndarray" + }, + { + "name": "normal", + "type": "ndarray", + "default": null } ], "output": [ diff --git a/sigllm/primitives/jsons/sigllm.primitives.prompting.timeseries_preprocessing.rolling_window_sequences.json b/sigllm/primitives/jsons/sigllm.primitives.prompting.timeseries_preprocessing.rolling_window_sequences.json index 23658e8..c5f2bd8 100644 --- a/sigllm/primitives/jsons/sigllm.primitives.prompting.timeseries_preprocessing.rolling_window_sequences.json +++ b/sigllm/primitives/jsons/sigllm.primitives.prompting.timeseries_preprocessing.rolling_window_sequences.json @@ -43,11 +43,11 @@ "fixed": { "window_size": { "type": "int", - "default": 500 + "default": 50 }, "step_size": { "type": "int", - "default": 100 + "default": 50 } } } diff --git a/sigllm/primitives/jsons/sigllm.primitives.transformation.format_as_string.json b/sigllm/primitives/jsons/sigllm.primitives.transformation.format_as_string.json index 89d18f5..8f0b115 100644 --- a/sigllm/primitives/jsons/sigllm.primitives.transformation.format_as_string.json +++ b/sigllm/primitives/jsons/sigllm.primitives.transformation.format_as_string.json @@ -4,7 +4,7 @@ "Sarah Alnegheimish ", "Linh Nguyen " ], - "description": "Transform an ndarray of scalar values to an ndarray of string.", + "description": "Format X to string(s). Handles both normal time series (single string) and multiple windows (list of strings).", "classifiers": { "type": "preprocessor", "subtype": "tranformer" @@ -34,6 +34,10 @@ "space": { "type": "bool", "default": false + }, + "normal": { + "type": "bool", + "default": false } } } diff --git a/sigllm/primitives/jsons/sigllm.primitives.transformation.parse_anomaly_response.json b/sigllm/primitives/jsons/sigllm.primitives.transformation.parse_anomaly_response.json new file mode 100644 index 0000000..9914ffd --- /dev/null +++ b/sigllm/primitives/jsons/sigllm.primitives.transformation.parse_anomaly_response.json @@ -0,0 +1,25 @@ +{ + "name": "sigllm.primitives.transformation.parse_anomaly_response", + "contributors": ["Salim Cherkaoui"], + "documentation": "https://github.com/salimch2/sigllm", + "description": "Parse LLM responses to extract anomaly values from text format.", + "classifiers": { + "type": "transformer", + "subtype": "parser" + }, + "modalities": ["text"], + "primitive": "sigllm.primitives.transformation.parse_anomaly_response", + "produce": { + "args": [ + { + "name": "X", + "type": "ndarray" } + ], + "output": [ + { + "name": "X", + "type": "ndarray" + } + ] + } +} \ No newline at end of file From 16bb0a5b9500c79de59681b5e25a4efdc59c5090 Mon Sep 17 00:00:00 2001 From: Salim Cherkaoui Date: Mon, 31 Mar 2025 16:46:55 -0400 Subject: [PATCH 07/27] jsons files added for primitives --- .../sigllm.primitives.transformation.parse_anomaly_response.json | 1 - 1 file changed, 1 deletion(-) diff --git a/sigllm/primitives/jsons/sigllm.primitives.transformation.parse_anomaly_response.json b/sigllm/primitives/jsons/sigllm.primitives.transformation.parse_anomaly_response.json index 9914ffd..93669e4 100644 --- a/sigllm/primitives/jsons/sigllm.primitives.transformation.parse_anomaly_response.json +++ b/sigllm/primitives/jsons/sigllm.primitives.transformation.parse_anomaly_response.json @@ -1,7 +1,6 @@ { "name": "sigllm.primitives.transformation.parse_anomaly_response", "contributors": ["Salim Cherkaoui"], - "documentation": "https://github.com/salimch2/sigllm", "description": "Parse LLM responses to extract anomaly values from text format.", "classifiers": { "type": "transformer", From 4ffaffaba515466be0a7331ad367eab7697de4eb Mon Sep 17 00:00:00 2001 From: Salim Cherkaoui Date: Mon, 31 Mar 2025 16:50:24 -0400 Subject: [PATCH 08/27] pipelines 0shot and 1shot added --- sigllm/pipelines/prompter/gpt_prompter.json | 65 ------- ...mpter.json => mistral_prompter_0shot.json} | 19 +- .../prompter/mistral_prompter_1shot.json | 174 ++++++++++++++++++ 3 files changed, 188 insertions(+), 70 deletions(-) delete mode 100644 sigllm/pipelines/prompter/gpt_prompter.json rename sigllm/pipelines/prompter/{mistral_prompter.json => mistral_prompter_0shot.json} (75%) create mode 100644 sigllm/pipelines/prompter/mistral_prompter_1shot.json diff --git a/sigllm/pipelines/prompter/gpt_prompter.json b/sigllm/pipelines/prompter/gpt_prompter.json deleted file mode 100644 index 381dd5b..0000000 --- a/sigllm/pipelines/prompter/gpt_prompter.json +++ /dev/null @@ -1,65 +0,0 @@ -{ - "primitives": [ - "mlstars.custom.timeseries_preprocessing.time_segments_aggregate", - "sklearn.impute.SimpleImputer", - "sigllm.primitives.transformation.Float2Scalar", - "sigllm.primitives.prompting.timeseries_preprocessing.rolling_window_sequences", - "sigllm.primitives.transformation.format_as_string", - "sigllm.primitives.prompting.gpt.GPT", - "sigllm.primitives.transformation.format_as_integer", - "sigllm.primitives.prompting.anomalies.val2idx", - "sigllm.primitives.prompting.anomalies.find_anomalies_in_windows", - "sigllm.primitives.prompting.anomalies.merge_anomalous_sequences", - "sigllm.primitives.prompting.anomalies.format_anomalies" - ], - "init_params": { - "mlstars.custom.timeseries_preprocessing.time_segments_aggregate#1": { - "time_column": "timestamp", - "interval": 21600, - "method": "mean" - }, - "sigllm.primitives.transformation.Float2Scalar#1": { - "decimal": 2, - "rescale": true - }, - "sigllm.primitives.prompting.timeseries_preprocessing.rolling_window_sequences#1": { - "window_size": 200, - "step_size": 40 - }, - "sigllm.primitives.transformation.format_as_string#1": { - "space": true - }, - "sigllm.primitives.prompting.gpt.GPT#1": { - "name": "gpt-3.5-turbo", - "samples": 10 - }, - "sigllm.primitives.prompting.anomalies.find_anomalies_in_windows#1": { - "alpha": 0.4 - }, - "sigllm.primitives.prompting.anomalies.merge_anomalous_sequences#1": { - "beta": 0.5 - } - }, - "input_names": { - "sigllm.primitives.prompting.gpt.GPT#1": { - "X": "X_str" - }, - "sigllm.primitives.transformation.format_as_integer#1":{ - "X": "y_hat" - } - }, - "output_names": { - "mlstars.custom.timeseries_preprocessing.time_segments_aggregate#1": { - "index": "timestamp" - }, - "sigllm.primitives.transformation.format_as_string#1": { - "X": "X_str" - }, - "sigllm.primitives.prompting.gpt.GPT#1": { - "y": "y_hat" - }, - "sigllm.primitives.transformation.format_as_integer#1":{ - "X": "y" - } - } -} \ No newline at end of file diff --git a/sigllm/pipelines/prompter/mistral_prompter.json b/sigllm/pipelines/prompter/mistral_prompter_0shot.json similarity index 75% rename from sigllm/pipelines/prompter/mistral_prompter.json rename to sigllm/pipelines/prompter/mistral_prompter_0shot.json index 0bc3e10..e15751d 100644 --- a/sigllm/pipelines/prompter/mistral_prompter.json +++ b/sigllm/pipelines/prompter/mistral_prompter_0shot.json @@ -5,10 +5,12 @@ "sigllm.primitives.transformation.Float2Scalar", "sigllm.primitives.prompting.timeseries_preprocessing.rolling_window_sequences", "sigllm.primitives.transformation.format_as_string", + "sigllm.primitives.prompting.huggingface.HF", + "sigllm.primitives.transformation.parse_anomaly_response", "sigllm.primitives.transformation.format_as_integer", - "sigllm.primitives.prompting.anomalies.val2idx", - "sigllm.primitives.prompting.anomalies.find_anomalies_in_windows", + "sigllm/primitives/jsons/sigllm.primitives.prompting.anomalies.val2idx.json", + "sigllm/primitives/jsons/sigllm.primitives.prompting.anomalies.find_anomalies_in_windows.json", "sigllm.primitives.prompting.anomalies.merge_anomalous_sequences", "sigllm.primitives.prompting.anomalies.format_anomalies" ], @@ -31,7 +33,8 @@ }, "sigllm.primitives.prompting.huggingface.HF#1": { "name": "mistralai/Mistral-7B-Instruct-v0.2", - "samples": 10 + "samples": 3, + "temp": 0.01 }, "sigllm.primitives.prompting.anomalies.find_anomalies_in_windows#1": { "alpha": 0.4 @@ -44,8 +47,11 @@ "sigllm.primitives.prompting.huggingface.HF#1": { "X": "X_str" }, - "sigllm.primitives.transformation.format_as_integer#1":{ + "sigllm.primitives.transformation.parse_anomaly_response#1": { "X": "y_hat" + }, + "sigllm.primitives.transformation.format_as_integer#1": { + "X": "y_parsed" } }, "output_names": { @@ -58,7 +64,10 @@ "sigllm.primitives.prompting.huggingface.HF#1": { "y": "y_hat" }, - "sigllm.primitives.transformation.format_as_integer#1":{ + "sigllm.primitives.transformation.parse_anomaly_response#1": { + "X": "y_parsed" + }, + "sigllm.primitives.transformation.format_as_integer#1": { "X": "y" } } diff --git a/sigllm/pipelines/prompter/mistral_prompter_1shot.json b/sigllm/pipelines/prompter/mistral_prompter_1shot.json new file mode 100644 index 0000000..99e2996 --- /dev/null +++ b/sigllm/pipelines/prompter/mistral_prompter_1shot.json @@ -0,0 +1,174 @@ +{ + "primitives": [ + "mlstars.custom.timeseries_preprocessing.time_segments_aggregate", + "sklearn.impute.SimpleImputer", + "sigllm.primitives.transformation.Float2Scalar", + "sigllm.primitives.prompting.timeseries_preprocessing.rolling_window_sequences", + "sigllm.primitives.transformation.format_as_string", + + "mlstars.custom.timeseries_preprocessing.time_segments_aggregate", + "sklearn.impute.SimpleImputer", + "sigllm.primitives.transformation.Float2Scalar", + "sigllm.primitives.transformation.format_as_string", + + "sigllm.primitives.prompting.huggingface", + "sigllm.primitives.transformation.parse_anomaly_response", + "sigllm.primitives.transformation.format_as_integer", + "sigllm.primitives.prompting.anomalies.val2idx", + "sigllm.primitives.prompting.anomalies.find_anomalies_in_windows", + "sigllm.primitives.prompting.anomalies.merge_anomalous_sequences", + "sigllm.primitives.prompting.anomalies.format_anomalies" + ], + "init_params": { + "mlstars.custom.timeseries_preprocessing.time_segments_aggregate#1": { + "time_column": "timestamp", + "interval": 21600, + "method": "mean" + }, + "mlstars.custom.timeseries_preprocessing.time_segments_aggregate#2": { + "time_column": "normal_timestamp", + "interval": 21600, + "method": "mean" + }, + "sigllm.primitives.transformation.Float2Scalar#1": { + "decimal": 2, + "rescale": true + }, + "sigllm.primitives.transformation.Float2Scalar#2": { + "decimal": 2, + "rescale": true + }, + "sigllm.primitives.prompting.timeseries_preprocessing.rolling_window_sequences#1": { + "window_size": 200, + "step_size": 40 + }, + "sigllm.primitives.transformation.format_as_string#1": { + "space": false + }, + "sigllm.primitives.transformation.format_as_string#2": { + "space": false, + "normal": true + }, + "sigllm.primitives.prompting.huggingface#1": { + "name": "mistralai/Mistral-7B-Instruct-v0.2", + "anomalous_percent": 0.5, + "samples": 1, + "temp": 0.01 + }, + "sigllm.primitives.prompting.anomalies.find_anomalies_in_windows#1": { + "alpha": 0.4 + }, + "sigllm.primitives.prompting.anomalies.merge_anomalous_sequences#1": { + "beta": 0.5 + } + }, + "input_names": { + "mlstars.custom.timeseries_preprocessing.time_segments_aggregate#1": { + "X": "X", + "timestamp": "timestamp" + }, + "sklearn.impute.SimpleImputer#1": { + "X": "X_processed" + }, + "sigllm.primitives.transformation.Float2Scalar#1": { + "X": "X_imputed" + }, + "sigllm.primitives.prompting.timeseries_preprocessing.rolling_window_sequences#1": { + "X": "X_scalar" + }, + "sigllm.primitives.transformation.format_as_string#1": { + "X": "X_sequences" + }, + + + "mlstars.custom.timeseries_preprocessing.time_segments_aggregate#2": { + "X": "normal", + "timestamp": "normal_timestamp" + }, + "sklearn.impute.SimpleImputer#2": { + "X": "normal_processed" + }, + "sigllm.primitives.transformation.Float2Scalar#2": { + "X": "normal_imputed" + }, + "sigllm.primitives.transformation.format_as_string#2": { + "X": "normal_scalar" + }, + + + "sigllm/primitives/jsons/sigllm.primitives.prompting.huggingface.HF.json#1": { + "X": "X_str", + "normal": "normal_str" + }, + "sigllm/primitives/jsons/sigllm.primitives.transformation.parse_anomaly_response.json#1": { + "X": "y_hat" + }, + "sigllm/primitives/jsons/sigllm.primitives.transformation.format_as_integer.json#1": { + "X": "y_parsed" + }, + "sigllm/primitives/jsons/sigllm.primitives.prompting.anomalies.val2idx.json#1": { + "y": "y_intermediate", + "X": "X_sequences" + }, + "sigllm/primitives/jsons/sigllm.primitives.prompting.anomalies.find_anomalies_in_windows.json#1": { + "y": "y_idx" + }, + "sigllm/primitives/jsons/sigllm.primitives.prompting.anomalies.merge_anomalous_sequences.json#1": { + "y": "y_windows" + }, + "sigllm/primitives/jsons/sigllm.primitives.prompting.anomalies.format_anomalies.json#1": { + "y": "y_merged" + } + }, + "output_names": { + "mlstars.custom.timeseries_preprocessing.time_segments_aggregate#1": { + "X": "X_processed", + "index": "timestamp" + }, + "sklearn.impute.SimpleImputer#1": { + "X": "X_imputed" + }, + "sigllm.primitives.transformation.Float2Scalar#1": { + "X": "X_scalar" + }, + "sigllm.primitives.prompting.timeseries_preprocessing.rolling_window_sequences#1": { + "X": "X_sequences" + }, + "sigllm.primitives.transformation.format_as_string#1": { + "X": "X_str" + }, + + "mlstars.custom.timeseries_preprocessing.time_segments_aggregate#2": { + "X": "normal_processed", + "index": "normal_timestamp" + }, + "sklearn.impute.SimpleImputer#2": { + "X": "normal_imputed" + }, + "sigllm.primitives.transformation.Float2Scalar#2": { + "X": "normal_scalar" + }, + "sigllm.primitives.transformation.format_as_string#2": { + "X": "normal_str" + }, + + "sigllm.primitives.prompting.huggingface#1": { + "y": "y_hat" + }, + "sigllm.primitives.transformation.parse_anomaly_response#1": { + "X": "y_parsed" + }, + "sigllm.primitives.transformation.format_as_integer#1": { + "X": "y_intermediate" + }, + "sigllm.primitives.prompting.anomalies.val2idx#1": { + "y": "y_idx" + }, + "sigllm.primitives.prompting.anomalies.find_anomalies_in_windows#1": { + "y": "y_windows" + }, + "sigllm.primitives.prompting.anomalies.merge_anomalous_sequences#1": { + "y": "y_merged" + } + } +} \ No newline at end of file From 8f8fc07ee563a793c63d2058843a6d1a23c0ad3d Mon Sep 17 00:00:00 2001 From: Salim Cherkaoui Date: Mon, 31 Mar 2025 16:57:46 -0400 Subject: [PATCH 09/27] add boolean for restrict_tokens in HF --- sigllm/pipelines/prompter/gpt_prompter.json | 65 ++++++++++++++++++ .../pipelines/prompter/mistral_prompter.json | 66 +++++++++++++++++++ ...m.primitives.prompting.huggingface.HF.json | 4 ++ sigllm/primitives/prompting/huggingface.py | 4 +- 4 files changed, 137 insertions(+), 2 deletions(-) create mode 100644 sigllm/pipelines/prompter/gpt_prompter.json create mode 100644 sigllm/pipelines/prompter/mistral_prompter.json diff --git a/sigllm/pipelines/prompter/gpt_prompter.json b/sigllm/pipelines/prompter/gpt_prompter.json new file mode 100644 index 0000000..381dd5b --- /dev/null +++ b/sigllm/pipelines/prompter/gpt_prompter.json @@ -0,0 +1,65 @@ +{ + "primitives": [ + "mlstars.custom.timeseries_preprocessing.time_segments_aggregate", + "sklearn.impute.SimpleImputer", + "sigllm.primitives.transformation.Float2Scalar", + "sigllm.primitives.prompting.timeseries_preprocessing.rolling_window_sequences", + "sigllm.primitives.transformation.format_as_string", + "sigllm.primitives.prompting.gpt.GPT", + "sigllm.primitives.transformation.format_as_integer", + "sigllm.primitives.prompting.anomalies.val2idx", + "sigllm.primitives.prompting.anomalies.find_anomalies_in_windows", + "sigllm.primitives.prompting.anomalies.merge_anomalous_sequences", + "sigllm.primitives.prompting.anomalies.format_anomalies" + ], + "init_params": { + "mlstars.custom.timeseries_preprocessing.time_segments_aggregate#1": { + "time_column": "timestamp", + "interval": 21600, + "method": "mean" + }, + "sigllm.primitives.transformation.Float2Scalar#1": { + "decimal": 2, + "rescale": true + }, + "sigllm.primitives.prompting.timeseries_preprocessing.rolling_window_sequences#1": { + "window_size": 200, + "step_size": 40 + }, + "sigllm.primitives.transformation.format_as_string#1": { + "space": true + }, + "sigllm.primitives.prompting.gpt.GPT#1": { + "name": "gpt-3.5-turbo", + "samples": 10 + }, + "sigllm.primitives.prompting.anomalies.find_anomalies_in_windows#1": { + "alpha": 0.4 + }, + "sigllm.primitives.prompting.anomalies.merge_anomalous_sequences#1": { + "beta": 0.5 + } + }, + "input_names": { + "sigllm.primitives.prompting.gpt.GPT#1": { + "X": "X_str" + }, + "sigllm.primitives.transformation.format_as_integer#1":{ + "X": "y_hat" + } + }, + "output_names": { + "mlstars.custom.timeseries_preprocessing.time_segments_aggregate#1": { + "index": "timestamp" + }, + "sigllm.primitives.transformation.format_as_string#1": { + "X": "X_str" + }, + "sigllm.primitives.prompting.gpt.GPT#1": { + "y": "y_hat" + }, + "sigllm.primitives.transformation.format_as_integer#1":{ + "X": "y" + } + } +} \ No newline at end of file diff --git a/sigllm/pipelines/prompter/mistral_prompter.json b/sigllm/pipelines/prompter/mistral_prompter.json new file mode 100644 index 0000000..a1a5bb7 --- /dev/null +++ b/sigllm/pipelines/prompter/mistral_prompter.json @@ -0,0 +1,66 @@ +{ + "primitives": [ + "mlstars.custom.timeseries_preprocessing.time_segments_aggregate", + "sklearn.impute.SimpleImputer", + "sigllm.primitives.transformation.Float2Scalar", + "sigllm.primitives.prompting.timeseries_preprocessing.rolling_window_sequences", + "sigllm.primitives.transformation.format_as_string", + "sigllm.primitives.prompting.huggingface.HF", + "sigllm.primitives.transformation.format_as_integer", + "sigllm.primitives.prompting.anomalies.val2idx", + "sigllm.primitives.prompting.anomalies.find_anomalies_in_windows", + "sigllm.primitives.prompting.anomalies.merge_anomalous_sequences", + "sigllm.primitives.prompting.anomalies.format_anomalies" + ], + "init_params": { + "mlstars.custom.timeseries_preprocessing.time_segments_aggregate#1": { + "time_column": "timestamp", + "interval": 21600, + "method": "mean" + }, + "sigllm.primitives.transformation.Float2Scalar#1": { + "decimal": 2, + "rescale": true + }, + "sigllm.primitives.prompting.timeseries_preprocessing.rolling_window_sequences#1": { + "window_size": 200, + "step_size": 40 + }, + "sigllm.primitives.transformation.format_as_string#1": { + "space": false + }, + "sigllm.primitives.prompting.huggingface.HF#1": { + "name": "mistralai/Mistral-7B-Instruct-v0.2", + "samples": 10, + "restrict_tokens": true + }, + "sigllm.primitives.prompting.anomalies.find_anomalies_in_windows#1": { + "alpha": 0.4 + }, + "sigllm.primitives.prompting.anomalies.merge_anomalous_sequences#1": { + "beta": 0.5 + } + }, + "input_names": { + "sigllm.primitives.prompting.huggingface.HF#1": { + "X": "X_str" + }, + "sigllm.primitives.transformation.format_as_integer#1":{ + "X": "y_hat" + } + }, + "output_names": { + "mlstars.custom.timeseries_preprocessing.time_segments_aggregate#1": { + "index": "timestamp" + }, + "sigllm.primitives.transformation.format_as_string#1": { + "X": "X_str" + }, + "sigllm.primitives.prompting.huggingface.HF#1": { + "y": "y_hat" + }, + "sigllm.primitives.transformation.format_as_integer#1":{ + "X": "y" + } + } +} \ No newline at end of file diff --git a/sigllm/primitives/jsons/sigllm.primitives.prompting.huggingface.HF.json b/sigllm/primitives/jsons/sigllm.primitives.prompting.huggingface.HF.json index a5f6df8..b78afc8 100644 --- a/sigllm/primitives/jsons/sigllm.primitives.prompting.huggingface.HF.json +++ b/sigllm/primitives/jsons/sigllm.primitives.prompting.huggingface.HF.json @@ -68,6 +68,10 @@ "padding": { "type": "int", "default": 0 + }, + "restrict_tokens": { + "type": "bool", + "default": false } } } diff --git a/sigllm/primitives/prompting/huggingface.py b/sigllm/primitives/prompting/huggingface.py index d744392..822ae36 100644 --- a/sigllm/primitives/prompting/huggingface.py +++ b/sigllm/primitives/prompting/huggingface.py @@ -164,8 +164,8 @@ def detect(self, X, normal=None, **kwargs): } # Only add bad_words_ids if token restriction is enabled - # if self.restrict_tokens: - # generate_kwargs['bad_words_ids'] = self.invalid_tokens + if self.restrict_tokens: + generate_kwargs['bad_words_ids'] = self.invalid_tokens generate_ids = self.model.generate(**generate_kwargs) From bd334c91848e36986bfc96a40ea79773a9d95baf Mon Sep 17 00:00:00 2001 From: Salim Cherkaoui Date: Tue, 1 Apr 2025 11:20:38 -0400 Subject: [PATCH 10/27] good messages.json for prompt --- sigllm/primitives/prompting/huggingface.py | 5 ++++- sigllm/primitives/prompting/huggingface_messages.json | 6 ++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/sigllm/primitives/prompting/huggingface.py b/sigllm/primitives/prompting/huggingface.py index 822ae36..d8f4606 100644 --- a/sigllm/primitives/prompting/huggingface.py +++ b/sigllm/primitives/prompting/huggingface.py @@ -138,7 +138,10 @@ def detect(self, X, normal=None, **kwargs): for text in tqdm(X): system_message = PROMPTS['system_message'] - user_message = PROMPTS['user_message'] + if self.restrict_tokens: + user_message = PROMPTS['user_message'] + else: + user_message = PROMPTS['user_message_2'] # Combine messages with one-shot example if provided message = ' '.join([ diff --git a/sigllm/primitives/prompting/huggingface_messages.json b/sigllm/primitives/prompting/huggingface_messages.json index 3ad1dad..1b57617 100644 --- a/sigllm/primitives/prompting/huggingface_messages.json +++ b/sigllm/primitives/prompting/huggingface_messages.json @@ -1,4 +1,6 @@ { - "system_message": "You are an exceptionally intelligent assistant that detect anomalies in time series data by listing all the anomalies.", - "user_message": "Below is a [SEQUENCE], please return the anomalies in that sequence in [RESPONSE]. Only return the numbers. [SEQUENCE]" + "system_message": "You are an expert in time series analysis. Your task is to detect anomalies in time series data.", + "user_message": "Below is a [SEQUENCE], please return the anomalies in that sequence in [RESPONSE]. Only return the numbers. [SEQUENCE]", + "user_message_2": "Below is a [SEQUENCE], analyze the following time series and identify any anomalies. If you find anomalies, provide their values in the format [first_anomaly, ..., last_anomaly]. If no anomalies are found, respond with 'no anomalies'. Be concise, do not write code, do not permorm any calculations, just give your answers as told.: [SEQUENCE]", + "one_shot_prefix": "Here is a normal reference of the time series: [NORMAL]" } \ No newline at end of file From dbf8ed1bdb11a63dbf22ed067d12c61ebb2a7b7e Mon Sep 17 00:00:00 2001 From: Salim Cherkaoui Date: Tue, 1 Apr 2025 13:16:24 -0400 Subject: [PATCH 11/27] Added load_normal in sigllm.data --- sigllm/data.py | 142 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 142 insertions(+) create mode 100644 sigllm/data.py diff --git a/sigllm/data.py b/sigllm/data.py new file mode 100644 index 0000000..c0561a3 --- /dev/null +++ b/sigllm/data.py @@ -0,0 +1,142 @@ +# -*- coding: utf-8 -*- + +""" +Data Management module. + +This module contains functions that allow downloading demo data from Amazon S3, +as well as load and work with other data stored locally. + +The demo data is a modified version of the NASA data found here: + +https://s3-us-west-2.amazonaws.com/telemanom/data.zip +""" + +import json +import logging +import os + +import numpy as np +import pandas as pd + +LOGGER = logging.getLogger(__name__) + +DATA_PATH = os.path.join( + os.path.dirname(os.path.abspath(__file__)), + 'data' +) +BUCKET = 'sintel-sigllm' +S3_URL = 'https://{}.s3.amazonaws.com/{}' + + +def download_normal(name, test_size=None, data_path=DATA_PATH): + """Load the CSV with the given name from S3. + + If the CSV has never been loaded before, it will be downloaded + from the [d3-ai-orion bucket](https://d3-ai-orion.s3.amazonaws.com) or + the S3 bucket specified following the `s3://{bucket}/path/to/the.csv` format, + and then cached inside the `data` folder, within the `orion` package + directory, and then returned. + + Otherwise, if it has been downloaded and cached before, it will be directly + loaded from the `orion/data` folder without contacting S3. + + If a `test_size` value is given, the data will be split in two parts + without altering its order, making the second one proportionally as + big as the given value. + + Args: + name (str): Name of the CSV to load. + test_size (float): Value between 0 and 1 indicating the proportional + size of the test split. If 0 or None (default), the data is not split. + + Returns: + If no test_size is given, a single pandas.DataFrame is returned containing all + the data. If test_size is given, a tuple containing one pandas.DataFrame for + the train split and another one for the test split is returned. + + Raises: + FileNotFoundError: If the normal file doesn't exist locally and can't be downloaded from S3. + """ + try: + url = None + if name.startswith('s3://'): + parts = name[5:].split('/', 1) + bucket = parts[0] + path = parts[1] + url = S3_URL.format(bucket, path) + filename = os.path.join(data_path, path.split('/')[-1]) + else: + filename = os.path.join(data_path, name + '_normal.csv') + data_path = os.path.join(data_path, os.path.dirname(name)) + + if os.path.exists(filename): + data = pd.read_csv(filename) + return data + + url = url or S3_URL.format(BUCKET, '{}_normal.csv'.format(name)) + LOGGER.info('Downloading CSV %s from %s', name, url) + print("Downloading CSV %s from %s", name, url) + + try: + data = pd.read_csv(url) + os.makedirs(data_path, exist_ok=True) + data.to_csv(filename, index=False) + return data + except Exception as e: + error_msg = f"Could not download or find normal file for {name}. " + error_msg += f"Please ensure the file exists at {filename} or can be downloaded from {url}" + LOGGER.error(error_msg) + raise FileNotFoundError(error_msg) + + except Exception as e: + error_msg = f"Error processing normal file for {name}: {str(e)}" + LOGGER.error(error_msg) + raise FileNotFoundError(error_msg) + + +def format_csv(df, timestamp_column=None, value_columns=None): + timestamp_column_name = df.columns[timestamp_column] if timestamp_column else df.columns[0] + value_column_names = df.columns[value_columns] if value_columns else df.columns[1:] + + data = dict() + data['timestamp'] = df[timestamp_column_name].astype('int64').values + for column in value_column_names: + data[column] = df[column].astype(float).values + + return pd.DataFrame(data) + + +def load_csv(path, timestamp_column=None, value_column=None): + header = None if timestamp_column is not None else 'infer' + data = pd.read_csv(path, header=header) + + if timestamp_column is None: + if value_column is not None: + raise ValueError("If value_column is provided, timestamp_column must be as well") + + return data + + elif value_column is None: + raise ValueError("If timestamp_column is provided, value_column must be as well") + elif timestamp_column == value_column: + raise ValueError("timestamp_column cannot be the same as value_column") + + return format_csv(data, timestamp_column, value_column) + + +def load_normal(normal, test_size=None, timestamp_column=None, value_column=None): + if os.path.isfile(normal): + data = load_csv(normal, timestamp_column, value_column) + else: + data = download_normal(normal) + + data = format_csv(data) + + if test_size is None: + return data + + test_length = round(len(data) * test_size) + train = data.iloc[:-test_length] + test = data.iloc[-test_length:] + + return train, test From 6f08214b105be503b449b0f4664a11a4d0c55e25 Mon Sep 17 00:00:00 2001 From: Salim Cherkaoui Date: Tue, 1 Apr 2025 13:30:47 -0400 Subject: [PATCH 12/27] Fixed load_normal in sigllm.data --- sigllm/data.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/sigllm/data.py b/sigllm/data.py index c0561a3..ce69ea4 100644 --- a/sigllm/data.py +++ b/sigllm/data.py @@ -124,7 +124,7 @@ def load_csv(path, timestamp_column=None, value_column=None): return format_csv(data, timestamp_column, value_column) -def load_normal(normal, test_size=None, timestamp_column=None, value_column=None): +def load_normal(normal, timestamp_column=None, value_column=None): if os.path.isfile(normal): data = load_csv(normal, timestamp_column, value_column) else: @@ -132,11 +132,4 @@ def load_normal(normal, test_size=None, timestamp_column=None, value_column=None data = format_csv(data) - if test_size is None: - return data - - test_length = round(len(data) * test_size) - train = data.iloc[:-test_length] - test = data.iloc[-test_length:] - - return train, test + return data From fbedec1cdd7a6f8014910b9069d43feb8c22771d Mon Sep 17 00:00:00 2001 From: Salim Cherkaoui Date: Tue, 1 Apr 2025 13:55:43 -0400 Subject: [PATCH 13/27] Fixed lint format --- sigllm/core.py | 3 +- sigllm/data.py | 75 +++++++++++++++++----- sigllm/primitives/prompting/anomalies.py | 4 +- sigllm/primitives/prompting/huggingface.py | 6 +- sigllm/primitives/transformation.py | 34 ++++------ 5 files changed, 78 insertions(+), 44 deletions(-) diff --git a/sigllm/core.py b/sigllm/core.py index c860115..b5aaff9 100644 --- a/sigllm/core.py +++ b/sigllm/core.py @@ -100,7 +100,8 @@ def __repr__(self): return ('SigLLM:\n{}\nhyperparameters:\n{}\n').format(pipeline, hyperparameters) - def detect(self, data: pd.DataFrame, normal: pd.DataFrame = None, visualization: bool = False, **kwargs) -> pd.DataFrame: + def detect(self, data: pd.DataFrame, normal: pd.DataFrame = None, + visualization: bool = False, **kwargs) -> pd.DataFrame: """Detect anomalies in the given data. If ``visualization=True``, also return the visualization diff --git a/sigllm/data.py b/sigllm/data.py index ce69ea4..ca7a27e 100644 --- a/sigllm/data.py +++ b/sigllm/data.py @@ -1,7 +1,6 @@ # -*- coding: utf-8 -*- -""" -Data Management module. +"""Data Management module. This module contains functions that allow downloading demo data from Amazon S3, as well as load and work with other data stored locally. @@ -11,11 +10,9 @@ https://s3-us-west-2.amazonaws.com/telemanom/data.zip """ -import json import logging import os -import numpy as np import pandas as pd LOGGER = logging.getLogger(__name__) @@ -55,7 +52,8 @@ def download_normal(name, test_size=None, data_path=DATA_PATH): the train split and another one for the test split is returned. Raises: - FileNotFoundError: If the normal file doesn't exist locally and can't be downloaded from S3. + FileNotFoundError: If the normal file doesn't exist locally and can't + be downloaded from S3. """ try: url = None @@ -75,16 +73,18 @@ def download_normal(name, test_size=None, data_path=DATA_PATH): url = url or S3_URL.format(BUCKET, '{}_normal.csv'.format(name)) LOGGER.info('Downloading CSV %s from %s', name, url) - print("Downloading CSV %s from %s", name, url) - + try: data = pd.read_csv(url) os.makedirs(data_path, exist_ok=True) data.to_csv(filename, index=False) return data - except Exception as e: - error_msg = f"Could not download or find normal file for {name}. " - error_msg += f"Please ensure the file exists at {filename} or can be downloaded from {url}" + except Exception: + error_msg = ( + f"Could not download or find normal file for {name}. " + f"Please ensure the file exists at {filename} or can be " + f"downloaded from {url}" + ) LOGGER.error(error_msg) raise FileNotFoundError(error_msg) @@ -95,8 +95,22 @@ def download_normal(name, test_size=None, data_path=DATA_PATH): def format_csv(df, timestamp_column=None, value_columns=None): - timestamp_column_name = df.columns[timestamp_column] if timestamp_column else df.columns[0] - value_column_names = df.columns[value_columns] if value_columns else df.columns[1:] + """Format CSV data with timestamp and value columns. + + Args: + df (pd.DataFrame): Input DataFrame + timestamp_column: Column index or name for timestamp + value_columns: Column index or name for values + + Returns: + pd.DataFrame: Formatted DataFrame with timestamp and values + """ + timestamp_column_name = ( + df.columns[timestamp_column] if timestamp_column else df.columns[0] + ) + value_column_names = ( + df.columns[value_columns] if value_columns else df.columns[1:] + ) data = dict() data['timestamp'] = df[timestamp_column_name].astype('int64').values @@ -107,29 +121,56 @@ def format_csv(df, timestamp_column=None, value_columns=None): def load_csv(path, timestamp_column=None, value_column=None): + """Load and format CSV file. + + Args: + path (str): Path to CSV file + timestamp_column: Column index or name for timestamp + value_column: Column index or name for values + + Returns: + pd.DataFrame: Loaded and formatted DataFrame + + Raises: + ValueError: If column specifications are invalid + """ header = None if timestamp_column is not None else 'infer' data = pd.read_csv(path, header=header) if timestamp_column is None: if value_column is not None: - raise ValueError("If value_column is provided, timestamp_column must be as well") - + raise ValueError( + "If value_column is provided, timestamp_column must be as well" + ) return data elif value_column is None: - raise ValueError("If timestamp_column is provided, value_column must be as well") + raise ValueError( + "If timestamp_column is provided, value_column must be as well" + ) elif timestamp_column == value_column: - raise ValueError("timestamp_column cannot be the same as value_column") + raise ValueError( + "timestamp_column cannot be the same as value_column" + ) return format_csv(data, timestamp_column, value_column) def load_normal(normal, timestamp_column=None, value_column=None): + """Load normal data from file or download if needed. + + Args: + normal (str): Name or path of the normal data + timestamp_column: Column index or name for timestamp + value_column: Column index or name for values + + Returns: + pd.DataFrame: Loaded and formatted normal data + """ if os.path.isfile(normal): data = load_csv(normal, timestamp_column, value_column) else: data = download_normal(normal) data = format_csv(data) - return data diff --git a/sigllm/primitives/prompting/anomalies.py b/sigllm/primitives/prompting/anomalies.py index c628237..82c462f 100644 --- a/sigllm/primitives/prompting/anomalies.py +++ b/sigllm/primitives/prompting/anomalies.py @@ -36,7 +36,6 @@ def val2idx(y, X): idx_list.append(idx_win_list) idx_list = np.array(idx_list, dtype=object) - return idx_list @@ -124,11 +123,10 @@ def format_anomalies(y, timestamp, padding_size=50): List[Tuple]: List of intervals (start, end, score). Empty list if no anomalies are found. """ - # Handle empty array case if len(y) == 0: return [] - + y = timestamp[y] # Convert list of indices into list of timestamps start, end = timestamp[0], timestamp[-1] interval = timestamp[1] - timestamp[0] diff --git a/sigllm/primitives/prompting/huggingface.py b/sigllm/primitives/prompting/huggingface.py index d8f4606..b19bd15 100644 --- a/sigllm/primitives/prompting/huggingface.py +++ b/sigllm/primitives/prompting/huggingface.py @@ -142,7 +142,7 @@ def detect(self, X, normal=None, **kwargs): user_message = PROMPTS['user_message'] else: user_message = PROMPTS['user_message_2'] - + # Combine messages with one-shot example if provided message = ' '.join([ system_message, @@ -178,7 +178,7 @@ def detect(self, X, normal=None, **kwargs): skip_special_tokens=True, clean_up_tokenization_spaces=False, ) - + # Extract only the part after [RESPONSE] responses = [] for full_response in full_responses: @@ -187,7 +187,7 @@ def detect(self, X, normal=None, **kwargs): responses.append(response) except IndexError: responses.append("") # If no [RESPONSE] found, return empty string - + all_responses.append(responses) all_generate_ids.append(generate_ids) diff --git a/sigllm/primitives/transformation.py b/sigllm/primitives/transformation.py index d16632e..6cd726d 100644 --- a/sigllm/primitives/transformation.py +++ b/sigllm/primitives/transformation.py @@ -29,8 +29,6 @@ def format_as_string(X, sep=',', space=False, normal=False): If normal=True, returns a single string representation. If normal=False, returns a list of string representations for each wprintindow. """ - - def _as_string(x): text = sep.join(list(map(str, x.flatten()))) if space: @@ -161,7 +159,6 @@ def transform(self, X): values = sign * (values * 10**self.decimal).astype(int) - return values, self.minimum, self.decimal @@ -187,36 +184,32 @@ def transform(self, X, minimum=0, decimal=2): return values + minimum -from typing import List -import re def parse_anomaly_response(X): - """ - Parse a list of lists of LLM responses to extract anomaly values and format them as strings. - + """Parse a list of lists of LLM responses to extract anomaly values and format them as strings. + Args: X (List[List[str]]): List of lists of response texts from the LLM in the format "Answer: no anomalies" or "Answer: [val1, val2, ..., valN]" - Returns: List[List[str]]: List of lists of parsed responses where each element is either "val1,val2,...,valN" if anomalies are found, or empty string if no anomalies are present """ - + def parse_single_response(text: str) -> str: # Clean the input text text = text.strip().lower() - + # Check for "no anomalies" case if "no anomalies" in text or "no anomaly" in text: return "" - + # Try to extract the values using regex # Match anything inside square brackets that consists of digits and commas pattern = r'\[([\d\s,]+)\]' match = re.search(pattern, text) - + if match: # Extract the content inside brackets and clean it values = match.group(1) @@ -224,10 +217,10 @@ def parse_single_response(text: str) -> str: values = [val.strip() for val in values.split(',') if val.strip()] # Join the values with commas return ','.join(values) - + # Return empty string if no valid format is found return "" - + # Process each list of responses in the input result = [] for response_list in X: @@ -235,9 +228,10 @@ def parse_single_response(text: str) -> str: parsed_list = [parse_single_response(response) for response in response_list] result.append(parsed_list) - #return np.array(result, dtype=object) + # return np.array(result, dtype=object) return result + def format_as_single_string(X, sep=',', space=False): """Format a single time series to a string. @@ -257,10 +251,10 @@ def format_as_single_string(X, sep=',', space=False): # Ensure X is 1D if X.ndim > 1: X = X.flatten() - + text = sep.join(list(map(str, X))) - + if space: text = ' '.join(text) - - return text \ No newline at end of file + + return text From fa98d60e676ad7ca6a91c4b26f7bd9c5bc19369c Mon Sep 17 00:00:00 2001 From: Salim Cherkaoui Date: Tue, 1 Apr 2025 14:11:57 -0400 Subject: [PATCH 14/27] Fixed lint format Ruff --- sigllm/core.py | 9 ++++-- sigllm/data.py | 33 +++++++--------------- sigllm/primitives/prompting/huggingface.py | 9 +++--- sigllm/primitives/transformation.py | 11 ++++---- 4 files changed, 27 insertions(+), 35 deletions(-) diff --git a/sigllm/core.py b/sigllm/core.py index b5aaff9..cc8566f 100644 --- a/sigllm/core.py +++ b/sigllm/core.py @@ -100,8 +100,13 @@ def __repr__(self): return ('SigLLM:\n{}\nhyperparameters:\n{}\n').format(pipeline, hyperparameters) - def detect(self, data: pd.DataFrame, normal: pd.DataFrame = None, - visualization: bool = False, **kwargs) -> pd.DataFrame: + def detect( + self, + data: pd.DataFrame, + normal: pd.DataFrame = None, + visualization: bool = False, + **kwargs, + ) -> pd.DataFrame: """Detect anomalies in the given data. If ``visualization=True``, also return the visualization diff --git a/sigllm/data.py b/sigllm/data.py index ca7a27e..4fea4d9 100644 --- a/sigllm/data.py +++ b/sigllm/data.py @@ -17,10 +17,7 @@ LOGGER = logging.getLogger(__name__) -DATA_PATH = os.path.join( - os.path.dirname(os.path.abspath(__file__)), - 'data' -) +DATA_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data') BUCKET = 'sintel-sigllm' S3_URL = 'https://{}.s3.amazonaws.com/{}' @@ -81,15 +78,15 @@ def download_normal(name, test_size=None, data_path=DATA_PATH): return data except Exception: error_msg = ( - f"Could not download or find normal file for {name}. " - f"Please ensure the file exists at {filename} or can be " - f"downloaded from {url}" + f'Could not download or find normal file for {name}. ' + f'Please ensure the file exists at {filename} or can be ' + f'downloaded from {url}' ) LOGGER.error(error_msg) raise FileNotFoundError(error_msg) except Exception as e: - error_msg = f"Error processing normal file for {name}: {str(e)}" + error_msg = f'Error processing normal file for {name}: {str(e)}' LOGGER.error(error_msg) raise FileNotFoundError(error_msg) @@ -105,12 +102,8 @@ def format_csv(df, timestamp_column=None, value_columns=None): Returns: pd.DataFrame: Formatted DataFrame with timestamp and values """ - timestamp_column_name = ( - df.columns[timestamp_column] if timestamp_column else df.columns[0] - ) - value_column_names = ( - df.columns[value_columns] if value_columns else df.columns[1:] - ) + timestamp_column_name = df.columns[timestamp_column] if timestamp_column else df.columns[0] + value_column_names = df.columns[value_columns] if value_columns else df.columns[1:] data = dict() data['timestamp'] = df[timestamp_column_name].astype('int64').values @@ -139,19 +132,13 @@ def load_csv(path, timestamp_column=None, value_column=None): if timestamp_column is None: if value_column is not None: - raise ValueError( - "If value_column is provided, timestamp_column must be as well" - ) + raise ValueError('If value_column is provided, timestamp_column must be as well') return data elif value_column is None: - raise ValueError( - "If timestamp_column is provided, value_column must be as well" - ) + raise ValueError('If timestamp_column is provided, value_column must be as well') elif timestamp_column == value_column: - raise ValueError( - "timestamp_column cannot be the same as value_column" - ) + raise ValueError('timestamp_column cannot be the same as value_column') return format_csv(data, timestamp_column, value_column) diff --git a/sigllm/primitives/prompting/huggingface.py b/sigllm/primitives/prompting/huggingface.py index b19bd15..e11efc9 100644 --- a/sigllm/primitives/prompting/huggingface.py +++ b/sigllm/primitives/prompting/huggingface.py @@ -131,12 +131,11 @@ def detect(self, X, normal=None, **kwargs): all_responses, all_generate_ids = [], [] # Prepare the one-shot example if provided - one_shot_message = "" + one_shot_message = '' if normal is not None: - one_shot_message = PROMPTS['one_shot_prefix'] + normal + "\n\n" + one_shot_message = PROMPTS['one_shot_prefix'] + normal + '\n\n' for text in tqdm(X): - system_message = PROMPTS['system_message'] if self.restrict_tokens: user_message = PROMPTS['user_message'] @@ -149,7 +148,7 @@ def detect(self, X, normal=None, **kwargs): one_shot_message, user_message, text, - '[RESPONSE]' + '[RESPONSE]', ]) input_length = len(self.tokenizer.encode(message)) @@ -186,7 +185,7 @@ def detect(self, X, normal=None, **kwargs): response = full_response.split('[RESPONSE]')[1].strip() responses.append(response) except IndexError: - responses.append("") # If no [RESPONSE] found, return empty string + responses.append('') # If no [RESPONSE] found, return empty string all_responses.append(responses) all_generate_ids.append(generate_ids) diff --git a/sigllm/primitives/transformation.py b/sigllm/primitives/transformation.py index 6cd726d..3f22b13 100644 --- a/sigllm/primitives/transformation.py +++ b/sigllm/primitives/transformation.py @@ -26,9 +26,10 @@ def format_as_string(X, sep=',', space=False, normal=False): Returns: ndarray or str: - If normal=True, returns a single string representation. - If normal=False, returns a list of string representations for each wprintindow. + If normal=True, returns a single string representation. If normal=False, + returns a list of string representations for each window. """ + def _as_string(x): text = sep.join(list(map(str, x.flatten()))) if space: @@ -202,8 +203,8 @@ def parse_single_response(text: str) -> str: text = text.strip().lower() # Check for "no anomalies" case - if "no anomalies" in text or "no anomaly" in text: - return "" + if 'no anomalies' in text or 'no anomaly' in text: + return '' # Try to extract the values using regex # Match anything inside square brackets that consists of digits and commas @@ -219,7 +220,7 @@ def parse_single_response(text: str) -> str: return ','.join(values) # Return empty string if no valid format is found - return "" + return '' # Process each list of responses in the input result = [] From 8ea8f97ba6a1b7b7880168f14a2f27a00e8a53d5 Mon Sep 17 00:00:00 2001 From: Salim Cherkaoui Date: Tue, 1 Apr 2025 17:06:34 -0400 Subject: [PATCH 15/27] Fixed from review Sarah --- sigllm/core.py | 4 +- sigllm/data.py | 42 ++++++++++++----- .../prompter/mistral_prompter_0shot.json | 4 +- ...reprocessing.rolling_window_sequences.json | 4 +- ...transformation.parse_anomaly_response.json | 3 +- sigllm/primitives/prompting/huggingface.py | 46 ++++++++++--------- .../prompting/timeseries_preprocessing.py | 2 +- sigllm/primitives/transformation.py | 19 ++++---- 8 files changed, 75 insertions(+), 49 deletions(-) diff --git a/sigllm/core.py b/sigllm/core.py index cc8566f..0008002 100644 --- a/sigllm/core.py +++ b/sigllm/core.py @@ -117,9 +117,9 @@ def detect( Input data, passed as a ``pandas.DataFrame`` containing exactly two columns: timestamp and value. normal (DataFrame, optional): - Normal reference data for one-shot learning, passed as a ``pandas.DataFrame`` + Normal reference data for one-shot prompting, passed as a ``pandas.DataFrame`` containing exactly two columns: timestamp and value. If None, zero-shot - learning is used. Default to None. + prompting is used. Default to None. visualization (bool): If ``True``, also capture the ``visualization`` named output from the ``MLPipeline`` and return it as a second diff --git a/sigllm/data.py b/sigllm/data.py index 4fea4d9..03d0e29 100644 --- a/sigllm/data.py +++ b/sigllm/data.py @@ -4,10 +4,6 @@ This module contains functions that allow downloading demo data from Amazon S3, as well as load and work with other data stored locally. - -The demo data is a modified version of the NASA data found here: - -https://s3-us-west-2.amazonaws.com/telemanom/data.zip """ import logging @@ -143,21 +139,43 @@ def load_csv(path, timestamp_column=None, value_column=None): return format_csv(data, timestamp_column, value_column) -def load_normal(normal, timestamp_column=None, value_column=None): +def load_normal(name, timestamp_column=None, value_column=None, start=None, end=None, use_timestamps=False): """Load normal data from file or download if needed. Args: - normal (str): Name or path of the normal data - timestamp_column: Column index or name for timestamp - value_column: Column index or name for values + name (str): + Name or path of the normal data. + timestamp_column (str or int): + Column index or name for timestamp. + value_column (str or int): + Column index or name for values. + start (int or timestamp): + Optional. If specified, this will be start of the sub-sequence. + end (int or timestamp): + Optional. If specified, this will be end of the sub-sequence. + use_timestamps (bool): + If True, start and end are interpreted as timestamps. + If False, start and end are interpreted as row indices. Returns: - pd.DataFrame: Loaded and formatted normal data + pandas.DataFrame: + Loaded subsequence with `timestamp` and `value` columns. """ - if os.path.isfile(normal): - data = load_csv(normal, timestamp_column, value_column) + if os.path.isfile(name): + data = load_csv(name, timestamp_column, value_column) else: - data = download_normal(normal) + data = download_normal(name) data = format_csv(data) + + # Handle slicing if start or end is specified + if start is not None or end is not None: + if use_timestamps: + # If start and end are timestamps + mask = (data['timestamp'] >= start) & (data['timestamp'] <= end) + data = data[mask] + else: + # If start and end are indices + data = data.iloc[start:end] + return data diff --git a/sigllm/pipelines/prompter/mistral_prompter_0shot.json b/sigllm/pipelines/prompter/mistral_prompter_0shot.json index e15751d..430b6ab 100644 --- a/sigllm/pipelines/prompter/mistral_prompter_0shot.json +++ b/sigllm/pipelines/prompter/mistral_prompter_0shot.json @@ -9,8 +9,8 @@ "sigllm.primitives.prompting.huggingface.HF", "sigllm.primitives.transformation.parse_anomaly_response", "sigllm.primitives.transformation.format_as_integer", - "sigllm/primitives/jsons/sigllm.primitives.prompting.anomalies.val2idx.json", - "sigllm/primitives/jsons/sigllm.primitives.prompting.anomalies.find_anomalies_in_windows.json", + "sigllm.primitives.prompting.anomalies.val2idx", + "sigllm.primitives.prompting.anomalies.find_anomalies_in_windows", "sigllm.primitives.prompting.anomalies.merge_anomalous_sequences", "sigllm.primitives.prompting.anomalies.format_anomalies" ], diff --git a/sigllm/primitives/jsons/sigllm.primitives.prompting.timeseries_preprocessing.rolling_window_sequences.json b/sigllm/primitives/jsons/sigllm.primitives.prompting.timeseries_preprocessing.rolling_window_sequences.json index c5f2bd8..23658e8 100644 --- a/sigllm/primitives/jsons/sigllm.primitives.prompting.timeseries_preprocessing.rolling_window_sequences.json +++ b/sigllm/primitives/jsons/sigllm.primitives.prompting.timeseries_preprocessing.rolling_window_sequences.json @@ -43,11 +43,11 @@ "fixed": { "window_size": { "type": "int", - "default": 50 + "default": 500 }, "step_size": { "type": "int", - "default": 50 + "default": 100 } } } diff --git a/sigllm/primitives/jsons/sigllm.primitives.transformation.parse_anomaly_response.json b/sigllm/primitives/jsons/sigllm.primitives.transformation.parse_anomaly_response.json index 93669e4..a7ff470 100644 --- a/sigllm/primitives/jsons/sigllm.primitives.transformation.parse_anomaly_response.json +++ b/sigllm/primitives/jsons/sigllm.primitives.transformation.parse_anomaly_response.json @@ -12,7 +12,8 @@ "args": [ { "name": "X", - "type": "ndarray" } + "type": "ndarray" + } ], "output": [ { diff --git a/sigllm/primitives/prompting/huggingface.py b/sigllm/primitives/prompting/huggingface.py index e11efc9..8aff62f 100644 --- a/sigllm/primitives/prompting/huggingface.py +++ b/sigllm/primitives/prompting/huggingface.py @@ -48,7 +48,7 @@ class HF: Additional padding token to forecast to reduce short horizon predictions. Default to `0`. restrict_tokens (bool): - Whether to restrict tokens or not. Default to `True`. + Whether to restrict tokens or not. Default to `False`. """ def __init__( @@ -118,8 +118,8 @@ def detect(self, X, normal=None, **kwargs): X (ndarray): Input sequences of strings containing signal values normal (str, optional): - A normal reference sequence for one-shot learning. If None, - zero-shot learning is used. Default to None. + A normal reference sequence for one-shot prompting. If None, + zero-shot prompting is used. Default to None. Returns: list, list: @@ -156,7 +156,6 @@ def detect(self, X, normal=None, **kwargs): tokenized_input = self.tokenizer(message, return_tensors='pt').to('cuda') generate_kwargs = { - **tokenized_input, 'do_sample': True, 'max_new_tokens': max_tokens, 'temperature': self.temp, @@ -169,23 +168,28 @@ def detect(self, X, normal=None, **kwargs): if self.restrict_tokens: generate_kwargs['bad_words_ids'] = self.invalid_tokens - generate_ids = self.model.generate(**generate_kwargs) - - # Get the full generated text - full_responses = self.tokenizer.batch_decode( - generate_ids, - skip_special_tokens=True, - clean_up_tokenization_spaces=False, - ) - - # Extract only the part after [RESPONSE] - responses = [] - for full_response in full_responses: - try: - response = full_response.split('[RESPONSE]')[1].strip() - responses.append(response) - except IndexError: - responses.append('') # If no [RESPONSE] found, return empty string + generate_ids = self.model.generate(**tokenized_input, **generate_kwargs) + + if self.restrict_tokens: + responses = self.tokenizer.batch_decode( + generate_ids[:, input_length:], + skip_special_tokens=True, + clean_up_tokenization_spaces=False, + ) + else: # Extract only the part after [RESPONSE] + # Get the full generated text + full_responses = self.tokenizer.batch_decode( + generate_ids, + skip_special_tokens=True, + clean_up_tokenization_spaces=False, + ) + responses = [] + for full_response in full_responses: + try: + response = full_response.split('[RESPONSE]')[1].strip() + responses.append(response) + except IndexError: + responses.append('') # If no [RESPONSE] found, return empty string all_responses.append(responses) all_generate_ids.append(generate_ids) diff --git a/sigllm/primitives/prompting/timeseries_preprocessing.py b/sigllm/primitives/prompting/timeseries_preprocessing.py index fa193a9..fee3de9 100644 --- a/sigllm/primitives/prompting/timeseries_preprocessing.py +++ b/sigllm/primitives/prompting/timeseries_preprocessing.py @@ -8,7 +8,7 @@ import numpy as np -def rolling_window_sequences(X, window_size=50, step_size=10): +def rolling_window_sequences(X, window_size=500, step_size=100): """Create rolling window sequences out of time series data. This function creates an array of sequences by rolling over the input sequence. diff --git a/sigllm/primitives/transformation.py b/sigllm/primitives/transformation.py index 3f22b13..8ed47aa 100644 --- a/sigllm/primitives/transformation.py +++ b/sigllm/primitives/transformation.py @@ -187,18 +187,21 @@ def transform(self, X, minimum=0, decimal=2): def parse_anomaly_response(X): - """Parse a list of lists of LLM responses to extract anomaly values and format them as strings. + """Parse a list of lists of LLM responses to extract anomaly + values and format them as strings. Args: - X (List[List[str]]): List of lists of response texts from the LLM in the format - "Answer: no anomalies" or "Answer: [val1, val2, ..., valN]" + X (List[List[str]]): + List of lists of response texts from the LLM in the format + "Answer: no anomalies" or "Answer: [val1, val2, ..., valN]" Returns: - List[List[str]]: List of lists of parsed responses where each element is either - "val1,val2,...,valN" if anomalies are found, - or empty string if no anomalies are present + List[List[str]]: + List of lists of parsed responses where each element is either + "val1,val2,...,valN" if anomalies are found, or empty string if + no anomalies are present """ - def parse_single_response(text: str) -> str: + def _parse_single_response(text: str): # Clean the input text text = text.strip().lower() @@ -226,7 +229,7 @@ def parse_single_response(text: str) -> str: result = [] for response_list in X: # Process each response in the inner list - parsed_list = [parse_single_response(response) for response in response_list] + parsed_list = [_parse_single_response(response) for response in response_list] result.append(parsed_list) # return np.array(result, dtype=object) From 293f1cab1fc79411ba0b94da2de71f3cde78f036 Mon Sep 17 00:00:00 2001 From: Salim Cherkaoui Date: Tue, 1 Apr 2025 17:17:10 -0400 Subject: [PATCH 16/27] Fixed lint format after working on Sarah's reviews --- sigllm/data.py | 6 ++++-- sigllm/primitives/prompting/huggingface.py | 4 ++-- sigllm/primitives/transformation.py | 4 ++-- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/sigllm/data.py b/sigllm/data.py index 03d0e29..dc2e161 100644 --- a/sigllm/data.py +++ b/sigllm/data.py @@ -139,13 +139,15 @@ def load_csv(path, timestamp_column=None, value_column=None): return format_csv(data, timestamp_column, value_column) -def load_normal(name, timestamp_column=None, value_column=None, start=None, end=None, use_timestamps=False): +def load_normal( + name, timestamp_column=None, value_column=None, start=None, end=None, use_timestamps=False +): """Load normal data from file or download if needed. Args: name (str): Name or path of the normal data. - timestamp_column (str or int): + timestamp_column (str or int): Column index or name for timestamp. value_column (str or int): Column index or name for values. diff --git a/sigllm/primitives/prompting/huggingface.py b/sigllm/primitives/prompting/huggingface.py index 8aff62f..301253e 100644 --- a/sigllm/primitives/prompting/huggingface.py +++ b/sigllm/primitives/prompting/huggingface.py @@ -175,8 +175,8 @@ def detect(self, X, normal=None, **kwargs): generate_ids[:, input_length:], skip_special_tokens=True, clean_up_tokenization_spaces=False, - ) - else: # Extract only the part after [RESPONSE] + ) + else: # Extract only the part after [RESPONSE] # Get the full generated text full_responses = self.tokenizer.batch_decode( generate_ids, diff --git a/sigllm/primitives/transformation.py b/sigllm/primitives/transformation.py index 8ed47aa..5037131 100644 --- a/sigllm/primitives/transformation.py +++ b/sigllm/primitives/transformation.py @@ -187,13 +187,13 @@ def transform(self, X, minimum=0, decimal=2): def parse_anomaly_response(X): - """Parse a list of lists of LLM responses to extract anomaly - values and format them as strings. + """Parse a list of lists of LLM responses to extract anomaly values and format them as strings. Args: X (List[List[str]]): List of lists of response texts from the LLM in the format "Answer: no anomalies" or "Answer: [val1, val2, ..., valN]" + Returns: List[List[str]]: List of lists of parsed responses where each element is either From 8b6dd6e57e315ece4a47785637d3cbcd4f5e5680 Mon Sep 17 00:00:00 2001 From: Salim Cherkaoui Date: Tue, 1 Apr 2025 19:58:19 -0400 Subject: [PATCH 17/27] Dataset prompter parameters --- .../pipelines/prompter/prompter_artificialwithanomaly.json | 6 ++++++ sigllm/pipelines/prompter/prompter_realadexchange.json | 6 ++++++ sigllm/pipelines/prompter/prompter_realawscloudwatch.json | 6 ++++++ sigllm/pipelines/prompter/prompter_realtraffic.json | 6 ++++++ sigllm/pipelines/prompter/prompter_realtweets.json | 6 ++++++ sigllm/pipelines/prompter/prompter_smap.json | 6 ++++++ 6 files changed, 36 insertions(+) create mode 100644 sigllm/pipelines/prompter/prompter_artificialwithanomaly.json create mode 100644 sigllm/pipelines/prompter/prompter_realadexchange.json create mode 100644 sigllm/pipelines/prompter/prompter_realawscloudwatch.json create mode 100644 sigllm/pipelines/prompter/prompter_realtraffic.json create mode 100644 sigllm/pipelines/prompter/prompter_realtweets.json create mode 100644 sigllm/pipelines/prompter/prompter_smap.json diff --git a/sigllm/pipelines/prompter/prompter_artificialwithanomaly.json b/sigllm/pipelines/prompter/prompter_artificialwithanomaly.json new file mode 100644 index 0000000..eebcc81 --- /dev/null +++ b/sigllm/pipelines/prompter/prompter_artificialwithanomaly.json @@ -0,0 +1,6 @@ +{ + "mlstars.custom.timeseries_preprocessing.time_segments_aggregate#1": { + "time_column": "timestamp", + "interval": 600 + } +} diff --git a/sigllm/pipelines/prompter/prompter_realadexchange.json b/sigllm/pipelines/prompter/prompter_realadexchange.json new file mode 100644 index 0000000..6b8aac0 --- /dev/null +++ b/sigllm/pipelines/prompter/prompter_realadexchange.json @@ -0,0 +1,6 @@ +{ + "mlstars.custom.timeseries_preprocessing.time_segments_aggregate#1": { + "time_column": "timestamp", + "interval": 3600 + } +} diff --git a/sigllm/pipelines/prompter/prompter_realawscloudwatch.json b/sigllm/pipelines/prompter/prompter_realawscloudwatch.json new file mode 100644 index 0000000..eebcc81 --- /dev/null +++ b/sigllm/pipelines/prompter/prompter_realawscloudwatch.json @@ -0,0 +1,6 @@ +{ + "mlstars.custom.timeseries_preprocessing.time_segments_aggregate#1": { + "time_column": "timestamp", + "interval": 600 + } +} diff --git a/sigllm/pipelines/prompter/prompter_realtraffic.json b/sigllm/pipelines/prompter/prompter_realtraffic.json new file mode 100644 index 0000000..eebcc81 --- /dev/null +++ b/sigllm/pipelines/prompter/prompter_realtraffic.json @@ -0,0 +1,6 @@ +{ + "mlstars.custom.timeseries_preprocessing.time_segments_aggregate#1": { + "time_column": "timestamp", + "interval": 600 + } +} diff --git a/sigllm/pipelines/prompter/prompter_realtweets.json b/sigllm/pipelines/prompter/prompter_realtweets.json new file mode 100644 index 0000000..eebcc81 --- /dev/null +++ b/sigllm/pipelines/prompter/prompter_realtweets.json @@ -0,0 +1,6 @@ +{ + "mlstars.custom.timeseries_preprocessing.time_segments_aggregate#1": { + "time_column": "timestamp", + "interval": 600 + } +} diff --git a/sigllm/pipelines/prompter/prompter_smap.json b/sigllm/pipelines/prompter/prompter_smap.json new file mode 100644 index 0000000..e4fe0c1 --- /dev/null +++ b/sigllm/pipelines/prompter/prompter_smap.json @@ -0,0 +1,6 @@ +{ + "mlstars.custom.timeseries_preprocessing.time_segments_aggregate#1": { + "time_column": "timestamp", + "interval": 21600 + } +} From 368991228301b45040e2e98225f3fb1fe2ed6d8d Mon Sep 17 00:00:00 2001 From: Salim Cherkaoui Date: Wed, 2 Apr 2025 08:43:43 -0400 Subject: [PATCH 18/27] .jons removed from input names in 1_shot pipeline.json --- .../prompter/mistral_prompter_0shot.json | 4 ++-- .../prompter/mistral_prompter_1shot.json | 20 +++++++++---------- template/gpt_system_prompt_one_shot.txt | 1 - template/gpt_system_prompt_zero_shot.txt | 1 - 4 files changed, 12 insertions(+), 14 deletions(-) delete mode 100644 template/gpt_system_prompt_one_shot.txt delete mode 100644 template/gpt_system_prompt_zero_shot.txt diff --git a/sigllm/pipelines/prompter/mistral_prompter_0shot.json b/sigllm/pipelines/prompter/mistral_prompter_0shot.json index 430b6ab..40188e0 100644 --- a/sigllm/pipelines/prompter/mistral_prompter_0shot.json +++ b/sigllm/pipelines/prompter/mistral_prompter_0shot.json @@ -25,7 +25,7 @@ "rescale": true }, "sigllm.primitives.prompting.timeseries_preprocessing.rolling_window_sequences#1": { - "window_size": 200, + "window_size": 100, "step_size": 40 }, "sigllm.primitives.transformation.format_as_string#1": { @@ -33,7 +33,7 @@ }, "sigllm.primitives.prompting.huggingface.HF#1": { "name": "mistralai/Mistral-7B-Instruct-v0.2", - "samples": 3, + "samples": 1, "temp": 0.01 }, "sigllm.primitives.prompting.anomalies.find_anomalies_in_windows#1": { diff --git a/sigllm/pipelines/prompter/mistral_prompter_1shot.json b/sigllm/pipelines/prompter/mistral_prompter_1shot.json index 99e2996..f2edcc4 100644 --- a/sigllm/pipelines/prompter/mistral_prompter_1shot.json +++ b/sigllm/pipelines/prompter/mistral_prompter_1shot.json @@ -11,7 +11,7 @@ "sigllm.primitives.transformation.Float2Scalar", "sigllm.primitives.transformation.format_as_string", - "sigllm.primitives.prompting.huggingface", + "sigllm.primitives.prompting.huggingface.HF", "sigllm.primitives.transformation.parse_anomaly_response", "sigllm.primitives.transformation.format_as_integer", "sigllm.primitives.prompting.anomalies.val2idx", @@ -49,7 +49,7 @@ "space": false, "normal": true }, - "sigllm.primitives.prompting.huggingface#1": { + "sigllm.primitives.prompting.huggingface.HF#1": { "name": "mistralai/Mistral-7B-Instruct-v0.2", "anomalous_percent": 0.5, "samples": 1, @@ -96,27 +96,27 @@ }, - "sigllm/primitives/jsons/sigllm.primitives.prompting.huggingface.HF.json#1": { + "sigllm.primitives.prompting.huggingface.HF#1": { "X": "X_str", "normal": "normal_str" }, - "sigllm/primitives/jsons/sigllm.primitives.transformation.parse_anomaly_response.json#1": { + "sigllm.primitives.transformation.parse_anomaly_response#1": { "X": "y_hat" }, - "sigllm/primitives/jsons/sigllm.primitives.transformation.format_as_integer.json#1": { + "sigllm.primitives.transformation.format_as_integer#1": { "X": "y_parsed" }, - "sigllm/primitives/jsons/sigllm.primitives.prompting.anomalies.val2idx.json#1": { + "sigllm.primitives.prompting.anomalies.val2idx#1": { "y": "y_intermediate", "X": "X_sequences" }, - "sigllm/primitives/jsons/sigllm.primitives.prompting.anomalies.find_anomalies_in_windows.json#1": { + "sigllm.primitives.prompting.anomalies.find_anomalies_in_windows#1": { "y": "y_idx" }, - "sigllm/primitives/jsons/sigllm.primitives.prompting.anomalies.merge_anomalous_sequences.json#1": { + "sigllm.primitives.prompting.anomalies.merge_anomalous_sequences#1": { "y": "y_windows" }, - "sigllm/primitives/jsons/sigllm.primitives.prompting.anomalies.format_anomalies.json#1": { + "sigllm.primitives.prompting.anomalies.format_anomalies#1": { "y": "y_merged" } }, @@ -152,7 +152,7 @@ "X": "normal_str" }, - "sigllm.primitives.prompting.huggingface#1": { + "sigllm.primitives.prompting.huggingface.HF#1": { "y": "y_hat" }, "sigllm.primitives.transformation.parse_anomaly_response#1": { diff --git a/template/gpt_system_prompt_one_shot.txt b/template/gpt_system_prompt_one_shot.txt deleted file mode 100644 index d01c882..0000000 --- a/template/gpt_system_prompt_one_shot.txt +++ /dev/null @@ -1 +0,0 @@ -You are a helpful assistant that performs time series anomaly detection. The user will provide an example of a sequence and a list of indices that are anomalous. Then the user will provide sequence and you will be asked to give a list of indices that are anomalous in the sequence. The sequences are represented by decimal strings separated by commas. Please give a list of indices are anomalous in the following sequence without producing any additional text. Do not say anything like 'the anomalous indices in the sequence are', just return the numbers. \ No newline at end of file diff --git a/template/gpt_system_prompt_zero_shot.txt b/template/gpt_system_prompt_zero_shot.txt deleted file mode 100644 index 65b9823..0000000 --- a/template/gpt_system_prompt_zero_shot.txt +++ /dev/null @@ -1 +0,0 @@ -You are a helpful assistant that performs time series anomaly detection. The user will provide sequence and you will be asked to give a list of indices that are anomalous in the sequence. The sequences are represented by decimal strings separated by commas. Please give a list of indices are anomalous in the sequence without producing any additional text. Do not say anything like 'the anomalous indices in the sequence are', just return the numbers. \ No newline at end of file From 42efea0871d5b9f35c4e09ddf6df6b0a4150ad3a Mon Sep 17 00:00:00 2001 From: Salim Cherkaoui Date: Wed, 2 Apr 2025 08:45:43 -0400 Subject: [PATCH 19/27] .jons removed from input names in 1_shot pipeline.json --- template/gpt_system_prompt_one_shot.txt | 1 + template/gpt_system_prompt_zero_shot.txt | 1 + 2 files changed, 2 insertions(+) create mode 100644 template/gpt_system_prompt_one_shot.txt create mode 100644 template/gpt_system_prompt_zero_shot.txt diff --git a/template/gpt_system_prompt_one_shot.txt b/template/gpt_system_prompt_one_shot.txt new file mode 100644 index 0000000..d01c882 --- /dev/null +++ b/template/gpt_system_prompt_one_shot.txt @@ -0,0 +1 @@ +You are a helpful assistant that performs time series anomaly detection. The user will provide an example of a sequence and a list of indices that are anomalous. Then the user will provide sequence and you will be asked to give a list of indices that are anomalous in the sequence. The sequences are represented by decimal strings separated by commas. Please give a list of indices are anomalous in the following sequence without producing any additional text. Do not say anything like 'the anomalous indices in the sequence are', just return the numbers. \ No newline at end of file diff --git a/template/gpt_system_prompt_zero_shot.txt b/template/gpt_system_prompt_zero_shot.txt new file mode 100644 index 0000000..65b9823 --- /dev/null +++ b/template/gpt_system_prompt_zero_shot.txt @@ -0,0 +1 @@ +You are a helpful assistant that performs time series anomaly detection. The user will provide sequence and you will be asked to give a list of indices that are anomalous in the sequence. The sequences are represented by decimal strings separated by commas. Please give a list of indices are anomalous in the sequence without producing any additional text. Do not say anything like 'the anomalous indices in the sequence are', just return the numbers. \ No newline at end of file From 5d99162a0c02c25414ebdfe898042dfa1f9daa4f Mon Sep 17 00:00:00 2001 From: Sarah Alnegheimish Date: Wed, 16 Apr 2025 14:56:22 -0400 Subject: [PATCH 20/27] fix PR issues & add unittests --- sigllm/data.py | 102 ++++-------------- .../prompting/huggingface_messages.json | 2 +- sigllm/primitives/transformation.py | 23 ++-- tests/test_data.py | 74 +++++++++++++ 4 files changed, 103 insertions(+), 98 deletions(-) create mode 100644 tests/test_data.py diff --git a/sigllm/data.py b/sigllm/data.py index dc2e161..ac89b25 100644 --- a/sigllm/data.py +++ b/sigllm/data.py @@ -10,6 +10,7 @@ import os import pandas as pd +from orion.data import format_csv, load_csv LOGGER = logging.getLogger(__name__) @@ -18,31 +19,27 @@ S3_URL = 'https://{}.s3.amazonaws.com/{}' -def download_normal(name, test_size=None, data_path=DATA_PATH): +def download_normal(name, data_path=DATA_PATH): """Load the CSV with the given name from S3. If the CSV has never been loaded before, it will be downloaded - from the [d3-ai-orion bucket](https://d3-ai-orion.s3.amazonaws.com) or + from the [sintel-sigllm bucket](https://sintel-sigllm.s3.amazonaws.com) or the S3 bucket specified following the `s3://{bucket}/path/to/the.csv` format, - and then cached inside the `data` folder, within the `orion` package + and then cached inside the `data` folder, within the `sigllm` package directory, and then returned. Otherwise, if it has been downloaded and cached before, it will be directly - loaded from the `orion/data` folder without contacting S3. - - If a `test_size` value is given, the data will be split in two parts - without altering its order, making the second one proportionally as - big as the given value. + loaded from the `sigllm/data` folder without contacting S3. Args: - name (str): Name of the CSV to load. - test_size (float): Value between 0 and 1 indicating the proportional - size of the test split. If 0 or None (default), the data is not split. + name (str): + Name of the CSV to load. + data_path (str): + Path to store data. Returns: - If no test_size is given, a single pandas.DataFrame is returned containing all - the data. If test_size is given, a tuple containing one pandas.DataFrame for - the train split and another one for the test split is returned. + pandas.DataFrame: + A pandas.DataFrame is returned containing all the data. Raises: FileNotFoundError: If the normal file doesn't exist locally and can't @@ -87,61 +84,7 @@ def download_normal(name, test_size=None, data_path=DATA_PATH): raise FileNotFoundError(error_msg) -def format_csv(df, timestamp_column=None, value_columns=None): - """Format CSV data with timestamp and value columns. - - Args: - df (pd.DataFrame): Input DataFrame - timestamp_column: Column index or name for timestamp - value_columns: Column index or name for values - - Returns: - pd.DataFrame: Formatted DataFrame with timestamp and values - """ - timestamp_column_name = df.columns[timestamp_column] if timestamp_column else df.columns[0] - value_column_names = df.columns[value_columns] if value_columns else df.columns[1:] - - data = dict() - data['timestamp'] = df[timestamp_column_name].astype('int64').values - for column in value_column_names: - data[column] = df[column].astype(float).values - - return pd.DataFrame(data) - - -def load_csv(path, timestamp_column=None, value_column=None): - """Load and format CSV file. - - Args: - path (str): Path to CSV file - timestamp_column: Column index or name for timestamp - value_column: Column index or name for values - - Returns: - pd.DataFrame: Loaded and formatted DataFrame - - Raises: - ValueError: If column specifications are invalid - """ - header = None if timestamp_column is not None else 'infer' - data = pd.read_csv(path, header=header) - - if timestamp_column is None: - if value_column is not None: - raise ValueError('If value_column is provided, timestamp_column must be as well') - return data - - elif value_column is None: - raise ValueError('If timestamp_column is provided, value_column must be as well') - elif timestamp_column == value_column: - raise ValueError('timestamp_column cannot be the same as value_column') - - return format_csv(data, timestamp_column, value_column) - - -def load_normal( - name, timestamp_column=None, value_column=None, start=None, end=None, use_timestamps=False -): +def load_normal(name, timestamp_column=None, value_column=None, start=None, end=None): """Load normal data from file or download if needed. Args: @@ -155,9 +98,6 @@ def load_normal( Optional. If specified, this will be start of the sub-sequence. end (int or timestamp): Optional. If specified, this will be end of the sub-sequence. - use_timestamps (bool): - If True, start and end are interpreted as timestamps. - If False, start and end are interpreted as row indices. Returns: pandas.DataFrame: @@ -170,14 +110,16 @@ def load_normal( data = format_csv(data) - # Handle slicing if start or end is specified - if start is not None or end is not None: - if use_timestamps: - # If start and end are timestamps - mask = (data['timestamp'] >= start) & (data['timestamp'] <= end) - data = data[mask] - else: - # If start and end are indices + # handle start or end is specified + if start or end: + if any(data.index.isin([start, end])): data = data.iloc[start:end] + else: + mask = True + if start is not None: + mask &= data[timestamp_column] >= start + if end is not None: + mask &= data[timestamp_column] <= end + data = data[mask] return data diff --git a/sigllm/primitives/prompting/huggingface_messages.json b/sigllm/primitives/prompting/huggingface_messages.json index 1b57617..e329949 100644 --- a/sigllm/primitives/prompting/huggingface_messages.json +++ b/sigllm/primitives/prompting/huggingface_messages.json @@ -1,6 +1,6 @@ { "system_message": "You are an expert in time series analysis. Your task is to detect anomalies in time series data.", "user_message": "Below is a [SEQUENCE], please return the anomalies in that sequence in [RESPONSE]. Only return the numbers. [SEQUENCE]", - "user_message_2": "Below is a [SEQUENCE], analyze the following time series and identify any anomalies. If you find anomalies, provide their values in the format [first_anomaly, ..., last_anomaly]. If no anomalies are found, respond with 'no anomalies'. Be concise, do not write code, do not permorm any calculations, just give your answers as told.: [SEQUENCE]", + "user_message_2": "Below is a [SEQUENCE], analyze the following time series and identify any anomalies. If you find anomalies, provide their values in the format [first_anomaly, ..., last_anomaly]. If no anomalies are found, respond with 'no anomalies'. Be concise, do not write code, do not perform any calculations, just give your answers as told.: [SEQUENCE]", "one_shot_prefix": "Here is a normal reference of the time series: [NORMAL]" } \ No newline at end of file diff --git a/sigllm/primitives/transformation.py b/sigllm/primitives/transformation.py index 5037131..c70ad3b 100644 --- a/sigllm/primitives/transformation.py +++ b/sigllm/primitives/transformation.py @@ -191,48 +191,38 @@ def parse_anomaly_response(X): Args: X (List[List[str]]): - List of lists of response texts from the LLM in the format - "Answer: no anomalies" or "Answer: [val1, val2, ..., valN]" + List of lists of response texts from the LLM in the format + "Answer: no anomalies" or "Answer: [val1, val2, ..., valN]" Returns: List[List[str]]: - List of lists of parsed responses where each element is either - "val1,val2,...,valN" if anomalies are found, or empty string if - no anomalies are present + List of lists of parsed responses where each element is either + "val1,val2,...,valN" if anomalies are found, or empty string if + no anomalies are present """ def _parse_single_response(text: str): - # Clean the input text text = text.strip().lower() - # Check for "no anomalies" case if 'no anomalies' in text or 'no anomaly' in text: return '' - # Try to extract the values using regex - # Match anything inside square brackets that consists of digits and commas + # match anything that consists of digits and commas pattern = r'\[([\d\s,]+)\]' match = re.search(pattern, text) if match: - # Extract the content inside brackets and clean it values = match.group(1) - # Split by comma, strip whitespace, and filter out empty strings values = [val.strip() for val in values.split(',') if val.strip()] - # Join the values with commas return ','.join(values) - # Return empty string if no valid format is found return '' - # Process each list of responses in the input result = [] for response_list in X: - # Process each response in the inner list parsed_list = [_parse_single_response(response) for response in response_list] result.append(parsed_list) - # return np.array(result, dtype=object) return result @@ -252,7 +242,6 @@ def format_as_single_string(X, sep=',', space=False): str: A string representation of the time series. """ - # Ensure X is 1D if X.ndim > 1: X = X.flatten() diff --git a/tests/test_data.py b/tests/test_data.py new file mode 100644 index 0000000..8efe4d0 --- /dev/null +++ b/tests/test_data.py @@ -0,0 +1,74 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +"""Tests for `sigllm.data` module.""" + +from datetime import datetime +from unittest.mock import patch + +import pandas as pd +import pytest + +from sigllm.data import load_normal + + +@pytest.fixture +def sample_data(): + return pd.DataFrame({ + 'timestamp': pd.date_range(start='2023-01-01', periods=10, freq='D'), + 'value': range(10), + }) + + +@patch('sigllm.data.download_normal') +@patch('sigllm.data.format_csv') +def test_load_normal_without_start_end(mock_format_csv, mock_download, sample_data): + mock_format_csv.return_value = sample_data + mock_download.return_value = sample_data + + result = load_normal('test.csv') + mock_download.assert_called_once() + pd.testing.assert_frame_equal(result, sample_data) + + +@patch('sigllm.data.download_normal') +@patch('sigllm.data.format_csv') +def test_load_normal_with_index_based_start_end(mock_format_csv, mock_download, sample_data): + mock_format_csv.return_value = sample_data + mock_download.return_value = sample_data + + result = load_normal('test.csv', start=2, end=5) + expected = sample_data.iloc[2:5] + pd.testing.assert_frame_equal(result, expected) + + result = load_normal('test.csv', start=2) + expected = sample_data.iloc[2:] + pd.testing.assert_frame_equal(result, expected) + + result = load_normal('test.csv', end=5) + expected = sample_data.iloc[:5] + pd.testing.assert_frame_equal(result, expected) + + +@patch('sigllm.data.download_normal') +@patch('sigllm.data.format_csv') +def test_load_normal_with_timestamp_based_start_end(mock_format_csv, mock_download, sample_data): + mock_format_csv.return_value = sample_data + mock_download.return_value = sample_data + + start_date = datetime(2023, 1, 3) + end_date = datetime(2023, 1, 6) + result = load_normal('test.csv', timestamp_column='timestamp', start=start_date, end=end_date) + + expected = sample_data[ + (sample_data['timestamp'] >= start_date) & (sample_data['timestamp'] <= end_date) + ] + pd.testing.assert_frame_equal(result, expected) + + result = load_normal('test.csv', timestamp_column='timestamp', start=start_date) + expected = sample_data[sample_data['timestamp'] >= start_date] + pd.testing.assert_frame_equal(result, expected) + + result = load_normal('test.csv', timestamp_column='timestamp', end=end_date) + expected = sample_data[sample_data['timestamp'] <= end_date] + pd.testing.assert_frame_equal(result, expected) From 49e67d8c62b586ad3070194214bb343d95d0d9db Mon Sep 17 00:00:00 2001 From: Sarah Alnegheimish Date: Thu, 17 Apr 2025 12:09:09 -0400 Subject: [PATCH 21/27] add unittests for parse_anomaly_response --- sigllm/primitives/transformation.py | 5 +- tests/primitives/test_transformation.py | 61 +++++++++++++++++++++++++ 2 files changed, 64 insertions(+), 2 deletions(-) diff --git a/sigllm/primitives/transformation.py b/sigllm/primitives/transformation.py index c70ad3b..113a19b 100644 --- a/sigllm/primitives/transformation.py +++ b/sigllm/primitives/transformation.py @@ -192,13 +192,14 @@ def parse_anomaly_response(X): Args: X (List[List[str]]): List of lists of response texts from the LLM in the format - "Answer: no anomalies" or "Answer: [val1, val2, ..., valN]" + "Answer: no anomalies" or "Answer: [val1, val2, ..., valN]." + values must be within brackets. Returns: List[List[str]]: List of lists of parsed responses where each element is either "val1,val2,...,valN" if anomalies are found, or empty string if - no anomalies are present + no anomalies are present. """ def _parse_single_response(text: str): diff --git a/tests/primitives/test_transformation.py b/tests/primitives/test_transformation.py index eb759d9..a620fb1 100644 --- a/tests/primitives/test_transformation.py +++ b/tests/primitives/test_transformation.py @@ -9,6 +9,7 @@ _from_string_to_integer, format_as_integer, format_as_string, + parse_anomaly_response, ) @@ -311,3 +312,63 @@ def test_float2scalar_scalar2float_integration(): output = scalar2float.transform(transformed, minimum, decimal) np.testing.assert_allclose(output, expected, rtol=1e-2) + + +class ParseAnomalyResponseTest(unittest.TestCase): + def test_no_anomalies(self): + data = [['Answer: no anomalies'], ['Answer: no anomaly'], ['no anomaly, with extra']] + expected = [[''], [''], ['']] + + output = parse_anomaly_response(data) + self.assertEqual(output, expected) + + def test_single_anomaly(self): + data = [['Answer: [123]'], ['Answer: [456]', 'answer: [789]']] + expected = [['123'], ['456', '789']] + + output = parse_anomaly_response(data) + self.assertEqual(output, expected) + + def test_multiple_anomalies(self): + data = [['Answer: [123, 456, 789]'], ['Answer: [111, 222, 333]']] + expected = [['123,456,789'], ['111,222,333']] + + output = parse_anomaly_response(data) + self.assertEqual(output, expected) + + def test_mixed_responses(self): + data = [ + ['Answer: no anomalies', 'Answer: [123, 456]'], + ['Answer: [789]', 'no anomaly'] + ] + expected = [['', '123,456'], ['789', '']] + + output = parse_anomaly_response(data) + self.assertEqual(output, expected) + + def test_different_formats(self): + data = [ + ['Answer: [123, 456]', 'Answer: [ 789 , 101 ]'], + ['Answer: [1,2,3]', 'Answer: [ 4 , 5 , 6 ]'] + ] + expected = [ + ['123,456', '789,101'], + ['1,2,3', '4,5,6'] + ] + + output = parse_anomaly_response(data) + self.assertEqual(output, expected) + + def test_empty_responses(self): + data = [[''], ['Answer: no anomalies'], ['answer'], ['no anomly']] + expected = [[''], [''], [''], ['']] + + output = parse_anomaly_response(data) + self.assertEqual(output, expected) + + def test_invalid_format(self): + data = [['Answer: invalid format'], ['Answer: [123, abc]']] + expected = [[''], ['']] + + output = parse_anomaly_response(data) + self.assertEqual(output, expected) From 11ff33ad40596776b72aad83a5dc13b502255b1e Mon Sep 17 00:00:00 2001 From: Sarah Alnegheimish Date: Thu, 17 Apr 2025 12:37:59 -0400 Subject: [PATCH 22/27] remove unused functions --- sigllm/primitives/transformation.py | 53 +++++++---------------------- 1 file changed, 13 insertions(+), 40 deletions(-) diff --git a/sigllm/primitives/transformation.py b/sigllm/primitives/transformation.py index 113a19b..d4b3e82 100644 --- a/sigllm/primitives/transformation.py +++ b/sigllm/primitives/transformation.py @@ -6,27 +6,27 @@ import numpy as np -def format_as_string(X, sep=',', space=False, normal=False): +def format_as_string(X, sep=',', space=False, single=False): """Format X to a list of string. - Transform an array of integers to string(s), separated by the indicated separator and space. - Handles two cases: - - If normal=True, treats X as a single time series (window_size, 1) - - If normal=False, treats X as multiple windows (num_windows, window_size, 1) + Transform an array of integers to string(s), separated by the + indicated separator and space. Handles two cases: + - If single=True, treats X as a single time series (window_size, 1) + - If single=False, treats X as multiple windows (num_windows, window_size, 1) Args: sep (str): String to separate each element in X. Default to `','`. space (bool): Whether to add space between each digit in the result. Default to `False`. - normal (bool): - Whether to treat X as a normal time series. If True, expects (window_size, 1) + single (bool): + Whether to treat X as a single time series. If True, expects (window_size, 1) and returns a single string. If False, expects (num_windows, window_size, 1) and returns a list of strings. Default to `False`. Returns: ndarray or str: - If normal=True, returns a single string representation. If normal=False, + If single=True, returns one string representation. If single=False, returns a list of string representations for each window. """ @@ -36,11 +36,11 @@ def _as_string(x): text = ' '.join(text) return text - if normal: - # Handle as single time series (window_size, 1) + if single: + # single time series (window_size, 1) return _as_string(X) else: - # Handle as multiple windows (num_windows, window_size, 1) + # multiple windows (num_windows, window_size, 1) results = list(map(_as_string, X)) return np.array(results) @@ -110,7 +110,7 @@ def format_as_integer(X, sep=',', trunc=None, errors='ignore'): raise ValueError('Input is not a list of lists.') for text in string_list: - if not text: # Handle empty string + if not text: # empty string sample.append(np.array([], dtype=float)) else: scalar = _from_string_to_integer(text, sep, trunc, errors) @@ -202,7 +202,7 @@ def parse_anomaly_response(X): no anomalies are present. """ - def _parse_single_response(text: str): + def _parse_single_response(text): text = text.strip().lower() if 'no anomalies' in text or 'no anomaly' in text: @@ -225,30 +225,3 @@ def _parse_single_response(text: str): result.append(parsed_list) return result - - -def format_as_single_string(X, sep=',', space=False): - """Format a single time series to a string. - - Transform a 1-D array of integers to a single string, - separated by the indicated separator and space. - - Args: - sep (str): - String to separate each element in X. Default to `','`. - space (bool): - Whether to add space between each digit in the result. Default to `False`. - - Returns: - str: - A string representation of the time series. - """ - if X.ndim > 1: - X = X.flatten() - - text = sep.join(list(map(str, X))) - - if space: - text = ' '.join(text) - - return text From a2e28f391afbc462b3f696a27217a9b8e9bcb3b1 Mon Sep 17 00:00:00 2001 From: Sarah Alnegheimish Date: Thu, 17 Apr 2025 12:38:17 -0400 Subject: [PATCH 23/27] add new functionality tests --- tests/primitives/test_transformation.py | 29 ++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/tests/primitives/test_transformation.py b/tests/primitives/test_transformation.py index a620fb1..2bebb68 100644 --- a/tests/primitives/test_transformation.py +++ b/tests/primitives/test_transformation.py @@ -46,6 +46,14 @@ def test_format_as_string_decimal(self): assert output == expected + def test_format_as_string_single(self): + data = np.array([1, 2, 3, 4, 5]) + expected = '1,2,3,4,5' + + output = format_as_string(data, single=True) + + np.testing.assert_array_equal(output, expected) + class FromStringToIntegerTest(unittest.TestCase): def test__from_string_to_integer_default(self): @@ -121,7 +129,6 @@ def test_format_as_integer_one(): with pytest.raises(ValueError): format_as_integer(data) - def test_format_as_integer_list(): data = [['1,2,3,4,5']] @@ -131,6 +138,14 @@ def test_format_as_integer_list(): np.testing.assert_equal(output, expected) +def test_format_as_integer_empty(): + data = [['']] + + expected = np.array([[np.array([], dtype=float)]]) + + output = format_as_integer(data) + + np.testing.assert_equal(output, expected) def test_format_as_integer_2d_shape_mismatch(): data = [['1,2,3,4,5'], ['1, 294., 3 , j34,5'], ['!232, 23,3,4,5']] @@ -147,6 +162,18 @@ def test_format_as_integer_2d_shape_mismatch(): np.testing.assert_equal(o, e) +def test_format_as_integer_mixed(): + data = [[''], ['1,2,3']] + + expected = np.array([[np.array([], dtype=float)], [np.array([1., 2., 3.])]], dtype=object) + + output = format_as_integer(data) + + for out, exp in list(zip(output, expected)): + for o, e in list(zip(out, exp)): + np.testing.assert_equal(o, e) + + def test_format_as_integer_2d_trunc(): data = [['1,2,3,4,5'], ['1,294.,3,j34,5'], ['!232, 23,3,4,5']] From f293d840c390ac05ba357ba47d1573d02c0428cc Mon Sep 17 00:00:00 2001 From: Sarah Alnegheimish Date: Thu, 17 Apr 2025 12:38:53 -0400 Subject: [PATCH 24/27] update ubuntu image --- .github/workflows/tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 6ee5c65..6f7940e 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -11,7 +11,7 @@ on: jobs: lint: - runs-on: ubuntu-20.04 + runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - name: Set up Python 3.9 From f3f7b4c179c94d150aad469bee9dd259322b065e Mon Sep 17 00:00:00 2001 From: Sarah Alnegheimish Date: Thu, 17 Apr 2025 12:40:09 -0400 Subject: [PATCH 25/27] change normal->single --- .../sigllm.primitives.transformation.format_as_string.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sigllm/primitives/jsons/sigllm.primitives.transformation.format_as_string.json b/sigllm/primitives/jsons/sigllm.primitives.transformation.format_as_string.json index 8f0b115..faa32fa 100644 --- a/sigllm/primitives/jsons/sigllm.primitives.transformation.format_as_string.json +++ b/sigllm/primitives/jsons/sigllm.primitives.transformation.format_as_string.json @@ -35,7 +35,7 @@ "type": "bool", "default": false }, - "normal": { + "single": { "type": "bool", "default": false } From 540ea9283eafd60b00fdd8c33246ebecb5ef229e Mon Sep 17 00:00:00 2001 From: Sarah Alnegheimish Date: Thu, 17 Apr 2025 14:02:25 -0400 Subject: [PATCH 26/27] fix lint --- sigllm/primitives/transformation.py | 2 +- tests/primitives/test_transformation.py | 27 +++++++++++-------------- 2 files changed, 13 insertions(+), 16 deletions(-) diff --git a/sigllm/primitives/transformation.py b/sigllm/primitives/transformation.py index d4b3e82..b8ee151 100644 --- a/sigllm/primitives/transformation.py +++ b/sigllm/primitives/transformation.py @@ -9,7 +9,7 @@ def format_as_string(X, sep=',', space=False, single=False): """Format X to a list of string. - Transform an array of integers to string(s), separated by the + Transform an array of integers to string(s), separated by the indicated separator and space. Handles two cases: - If single=True, treats X as a single time series (window_size, 1) - If single=False, treats X as multiple windows (num_windows, window_size, 1) diff --git a/tests/primitives/test_transformation.py b/tests/primitives/test_transformation.py index 2bebb68..538ceef 100644 --- a/tests/primitives/test_transformation.py +++ b/tests/primitives/test_transformation.py @@ -129,6 +129,7 @@ def test_format_as_integer_one(): with pytest.raises(ValueError): format_as_integer(data) + def test_format_as_integer_list(): data = [['1,2,3,4,5']] @@ -138,15 +139,17 @@ def test_format_as_integer_list(): np.testing.assert_equal(output, expected) + def test_format_as_integer_empty(): data = [['']] expected = np.array([[np.array([], dtype=float)]]) - + output = format_as_integer(data) - + np.testing.assert_equal(output, expected) + def test_format_as_integer_2d_shape_mismatch(): data = [['1,2,3,4,5'], ['1, 294., 3 , j34,5'], ['!232, 23,3,4,5']] @@ -165,15 +168,15 @@ def test_format_as_integer_2d_shape_mismatch(): def test_format_as_integer_mixed(): data = [[''], ['1,2,3']] - expected = np.array([[np.array([], dtype=float)], [np.array([1., 2., 3.])]], dtype=object) - + expected = np.array([[np.array([], dtype=float)], [np.array([1.0, 2.0, 3.0])]], dtype=object) + output = format_as_integer(data) - + for out, exp in list(zip(output, expected)): for o, e in list(zip(out, exp)): np.testing.assert_equal(o, e) - + def test_format_as_integer_2d_trunc(): data = [['1,2,3,4,5'], ['1,294.,3,j34,5'], ['!232, 23,3,4,5']] @@ -364,10 +367,7 @@ def test_multiple_anomalies(self): self.assertEqual(output, expected) def test_mixed_responses(self): - data = [ - ['Answer: no anomalies', 'Answer: [123, 456]'], - ['Answer: [789]', 'no anomaly'] - ] + data = [['Answer: no anomalies', 'Answer: [123, 456]'], ['Answer: [789]', 'no anomaly']] expected = [['', '123,456'], ['789', '']] output = parse_anomaly_response(data) @@ -376,12 +376,9 @@ def test_mixed_responses(self): def test_different_formats(self): data = [ ['Answer: [123, 456]', 'Answer: [ 789 , 101 ]'], - ['Answer: [1,2,3]', 'Answer: [ 4 , 5 , 6 ]'] - ] - expected = [ - ['123,456', '789,101'], - ['1,2,3', '4,5,6'] + ['Answer: [1,2,3]', 'Answer: [ 4 , 5 , 6 ]'], ] + expected = [['123,456', '789,101'], ['1,2,3', '4,5,6']] output = parse_anomaly_response(data) self.assertEqual(output, expected) From 5876feb37ff13fff341a6412d6d714d3f50950d2 Mon Sep 17 00:00:00 2001 From: Sarah Alnegheimish Date: Thu, 17 Apr 2025 14:05:18 -0400 Subject: [PATCH 27/27] swap normal -> single --- sigllm/pipelines/prompter/mistral_prompter_1shot.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sigllm/pipelines/prompter/mistral_prompter_1shot.json b/sigllm/pipelines/prompter/mistral_prompter_1shot.json index f2edcc4..62dc8ce 100644 --- a/sigllm/pipelines/prompter/mistral_prompter_1shot.json +++ b/sigllm/pipelines/prompter/mistral_prompter_1shot.json @@ -47,7 +47,7 @@ }, "sigllm.primitives.transformation.format_as_string#2": { "space": false, - "normal": true + "single": true }, "sigllm.primitives.prompting.huggingface.HF#1": { "name": "mistralai/Mistral-7B-Instruct-v0.2", @@ -171,4 +171,4 @@ "y": "y_merged" } } -} \ No newline at end of file +}