Merge pull request #35 from wwhenxuan/master

wwhenxuan · web-flow · commit 2c55e6e1beb8 · 2026-02-17T13:12:50.000+08:00
Fixed a bug in the ARIMA model caused by linear operations.
diff --git a/README.md b/README.md
@@ -18,17 +18,18 @@ This method allows for the unrestricted creation of high-quality time series dat
 
 ### 🔥 News
 
+**[Feb. 2026]** Since every stationary time series can be obtained by exciting a linear time-invariant system with white noise, we propose [a learnable series generation method](https://github.com/wwhenxuan/S2Generator/blob/main/s2generator/simulator/arima.py) based on the ARIMA model. This method ensures that the generated series closely matches the input series in autocorrelation and power spectral density.
+
 **[Sep. 2025]** Our paper "Synthetic Series-Symbol Data Generation for Time Series Foundation Models" has been accepted by **NeurIPS 2025**, where **[*SymTime*](https://arxiv.org/abs/2502.15466)** pre-trained on the $S^2$ synthetic dataset achieved SOTA results in fine-tuning of forecasting, classification, imputation and anomaly detection tasks.
 
 ## 🚀 Installation <a id="Installation"></a>
 
-We have highly encapsulated the algorithm and uploaded the code to PyPI. Users can download the code through `pip`.
-
+We have highly encapsulated the algorithm and uploaded the code to PyPI:
 ~~~
 pip install s2generator
 ~~~
 
-We only used [`NumPy`](https://numpy.org/), [`Scipy`](https://scipy.org/) and [`matplotlib`](https://matplotlib.org/) when developing the project.
+We used [`NumPy`](https://numpy.org/), [`Pandas`](https://pandas.pydata.org/), and [`SciPy`](https://scipy.org/) to build the data science environment, [`Matplotlib`](https://matplotlib.org/) for data visualization, and [`Statsmodels`](https://www.statsmodels.org/stable/index.html) for time series analysis and statistical processing.
 
 ## ✨ Usage
 
diff --git a/examples/19-arma_simulator.ipynb b/examples/19-arma_simulator.ipynb
diff --git a/s2generator/simulator/arima.py b/s2generator/simulator/arima.py
@@ -15,7 +15,8 @@
 from statsmodels.tsa.api import acf, pacf
 from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
 
-from s2generator.utils._tools import eacf_rlike, plot_shapiro_wilk
+from s2generator.utils._tools import eacf_rlike
+from s2generator.utils.visualization import plot_shapiro_wilk
 
 import warnings
 
@@ -32,6 +33,17 @@ class ARIMASimulator(object):
 
     Based on these two points, we can use the ARIMA model to generate non-stationary time series data.
     Compared to previous data generation methods, we can further fit the statistical characteristics of real time series data through the ARIMA model, thereby generating more realistic time series data.
+
+    Since this generation method involves fitting an ARIMA model, the underlying linear algebra operations may raise exceptions such as `LinAlgError`, causing generation to fail.
+    This issue is generally related to the input time series data and the order of the ARIMA model. The common causes we have identified are:
+
+    1. The data is completely constant (variance = 0);
+    2. The length of the input time series is too short;
+    3. There are obvious extreme values or outliers in the input sequence after standardization;
+    4. An excessively high order setting (p,q) leads to matrix dimension mismatch or singularity.
+
+    In addition, the `ARIMA` implementation in `statsmodels` has limited ability to handle certain ill-conditioned matrices (e.g., nearly singular matrices).
+    Even if the data appears normal, LU decomposition may still fail due to floating-point precision issues.
     """
 
     def __init__(
@@ -41,10 +53,17 @@ def __init__(
         max_q: int = 5,
         signif: float = 0.05,
         not_white_alarm: bool = True,
+        revin: bool = True,
         random_state: Optional[int] = 42,
     ) -> None:
         """
-        :param order: A tuple specifying the (p, d, q) order of the ARIMA model.
+        :param max_p: Maximum AR order (p) to consider when fitting the ARIMA model.
+        :param max_d: Maximum differencing order (d) to consider when fitting the ARIMA model.
+        :param max_q: Maximum MA order (q) to consider when fitting the ARIMA model.
+        :param signif: Significance level for the ADF test to determine stationarity.
+        :param not_white_alarm: Whether to raise a `ValueError` when the residuals of the fitted model are not white noise.
+        :param revin: Whether to apply reversible normalization: the input is standardized in `fit`, and generated series are mapped back to the original scale in `transform`.
+        :param random_state: Random state for reproducibility when generating new time series data.
         """
         self.max_p = max_p
         self.max_d = max_d
@@ -56,6 +75,12 @@ def __init__(
         # Whether to issue a warning when residuals are not white noise
         self.not_white_alarm = not_white_alarm
 
+        # Whether to apply reversible normalization to the time series data.
+        # If True, the input series is standardized to zero mean and unit variance before fitting,
+        # and its mean and standard deviation are recorded so generated series can be mapped back to the original scale.
+        self.revin = revin
+        self.mean, self.std = None, None
+
         # Record the parameters of the model fit
         self.d_order = None
         self.p_order, self.q_order = None, None
@@ -82,6 +107,13 @@ def fit(
         # Check the input time series data
         time_series = self.check_inputs(time_series=time_series)
 
+        # Optionally standardize the series (reversible normalization), recording the mean and std for the inverse transform
+        if self.revin:
+            self.mean, self.std = time_series.mean(), time_series.std()
+            time_series = (
+                time_series - self.mean
+            ) / self.std  # Normalize the time series data
+
         # First, difference the time series to make it stationary
         stationary_series, self.d_order = self.diff_stationary(time_series=time_series)
 
@@ -103,8 +135,9 @@ def fit(
 
         # Perform residual diagnosis
         mean_p_value, is_white = self.residual_diagnosis(signif=self.signif)
+
         if not is_white and self.not_white_alarm:
-            print(
+            raise ValueError(
                 f"Warning: Model residuals may not be white noise (mean p-value={mean_p_value:.4f} < significance level={self.signif}), please re-evaluate the model order or parameters."
             )
 
@@ -132,7 +165,35 @@ def transform(
             ),
         )
 
-        return generated_series.values.T
+        return (
+            generated_series.values.T * self.std + self.mean
+            if self.revin
+            else generated_series.values.T
+        )
+
+    @property
+    def param_names(self) -> List[str]:
+        """Return the names of the parameters in the fitted ARIMA model."""
+        if not hasattr(self, "model"):
+            raise ValueError("The model must be fitted before calling param_names.")
+
+        return self.model.param_names
+
+    @property
+    def params(self) -> Union[np.ndarray, pd.Series]:
+        """Return the parameter values of the fitted ARIMA model."""
+        if not hasattr(self, "model"):
+            raise ValueError("The model must be fitted before calling params.")
+
+        return self.model.params
+
+    @property
+    def param_items(self) -> List[Tuple[str, float]]:
+        """Return a list of (parameter name, parameter value) tuples for the fitted ARIMA model."""
+        if not hasattr(self, "model"):
+            raise ValueError("The model must be fitted before calling param_items.")
+
+        return list(zip(self.param_names, self.params))
 
     def check_inputs(self, time_series: Union[pd.Series, np.ndarray]) -> pd.Series:
         """
@@ -163,6 +224,19 @@ def check_inputs(self, time_series: Union[pd.Series, np.ndarray]) -> pd.Series:
         if len(time_series) < 10:
             raise ValueError("Input time series must have at least 10 data points.")
 
+        # Check if the time series contains NaN values
+        if pd.isnull(time_series).any():
+            raise ValueError("Input time series must not contain NaN values.")
+
+        # Reject (near-)constant series, which cannot be fitted by the ARIMA model
+        std = np.std(time_series)
+        if (
+            std < 1e-8
+        ):  # A very small threshold to check if the variance is effectively zero
+            raise ValueError(
+                "The time series variance is 0 (all values ​​are the same), making it impossible to fit the ARIMA model."
+            )
+
         return pd.Series(time_series)
 
     def select_arma_order(
@@ -192,7 +266,6 @@ def select_arma_order(
                     continue
                 try:
                     # Fit ARMA model
-                    # FIXME: Consider using the EACF method to select the optimal (p,q) combination?
                     model = ARIMA(stationary_series, order=(p, 0, q))
                     results = model.fit()
                     if results.aic < best_aic:
diff --git a/s2generator/utils/__init__.py b/s2generator/utils/__init__.py
@@ -21,7 +21,6 @@
     "generate_arma_samples",
     "generate_nonstationary_sine",
     "eacf_rlike",
-    "plot_shapiro_wilk",
     "fft",
     "fftshift",
     "ifft",
@@ -39,14 +38,12 @@
     "exponential_smoothing",
     "smooth_show_info",
     "MovingDecomp",
+    "plot_series",
+    "plot_symbol",
+    "plot_shapiro_wilk",
+    "plot_simulator_statistics",
 ]
 
-# # Visualization the time series data in S2
-# from .visualization import plot_series
-#
-# # Visualization the Symbol data in S2
-# from .visualization import plot_symbol
-
 # Transform the symbol from string to latex
 from .print_symbol import symbol_to_markdown
 
@@ -71,9 +68,6 @@
 # The EACF function to determine the order of ARMA model
 from ._tools import eacf_rlike
 
-# The Shapiro-Wilk test for normality of the residuals
-from ._tools import plot_shapiro_wilk
-
 # Print the Generation Status
 from ._print_status import PrintStatus
 
@@ -101,3 +95,11 @@
 
 # The Seasonal-Trend decomposition using LOESS (STL)
 from ._decomposition import STL, STLResult
+
+# Plotting utilities provided by the visualization module
+from .visualization import (
+    plot_series,
+    plot_symbol,
+    plot_shapiro_wilk,
+    plot_simulator_statistics,
+)
diff --git a/s2generator/utils/_tools.py b/s2generator/utils/_tools.py
@@ -30,7 +30,6 @@
     "generate_arma_samples",
     "generate_nonstationary_sine",
     "eacf_rlike",
-    "plot_shapiro_wilk",
 ]
 
 import os
@@ -41,7 +40,6 @@
 from numpy import fft as np_fft
 
 import pandas as pd
-from matplotlib import pyplot as plt
 
 from typing import Optional, Dict, Union, Tuple
 
@@ -529,99 +527,3 @@ def eacf_rlike(
     )
 
     return eacf_matrix, threshold, eacf_df
-
-
-def plot_shapiro_wilk(
-    residuals: np.ndarray,
-    bins: int = 13,
-    dpi: int = 500,
-    figsize: Tuple[int, int] = (12, 5),
-) -> Tuple[plt.Figure, float, float]:
-    """
-    Plot the Shapiro-Wilk test for normality of the residuals.
-    This method generates a Q-Q plot to visually assess whether the residuals
-    of the fitted ARIMA model follow a normal distribution.
-
-    :param residuals: Residuals from the fitted ARIMA model.
-    :param bins: Number of bins for the histogram of residuals.
-    :param dpi: Dots per inch (resolution) for the generated plot.
-    :param figsize: Figure size for the generated plot.
-    :return: A tuple containing the matplotlib Figure object, the Shapiro-Wilk statistic, and the p-value.
-    """
-    # Ensure the model has been fitted and the residuals have been calculated.
-    if residuals is None:
-        raise ValueError("Residuals must be provided before calling plot_shapiro_wilk.")
-
-    # Convert residuals to a numpy array for consistency
-    residuals = np.asarray(residuals)
-
-    # Import necessary libraries
-    from statsmodels.graphics.gofplots import qqplot
-    from scipy.stats import shapiro
-
-    # import seaborn as sns
-    # sns.set_theme(style="ticks")
-
-    # Perform Shapiro-Wilk normality test
-    stat, p_value = shapiro(residuals)
-
-    # Create visualization figure
-    fig, ax = plt.subplots(1, 2, figsize=figsize, dpi=dpi)
-    fig.subplots_adjust(wspace=0.16)
-
-    # Plot histogram of the fitted residuals
-    ax[0].hist(residuals, bins=bins, alpha=1, color="w", edgecolor="k", lw=1.2)
-
-    # Plot Q-Q plot for normality test
-    qqplot(
-        residuals,
-        line="s",
-        ax=ax[1],
-        markerfacecolor="white",
-        markeredgecolor="k",
-        markersize=7.5,
-    )
-    for line in ax[1].get_lines():
-        if line.get_linestyle() == "-":
-            line.set_color("#DC143C")
-            line.set_linewidth(2.1)
-
-    # Set titles and labels
-    ax[0].grid(which="major", color="gray", linestyle="--", lw=0.5, alpha=0.8)
-    ax[1].grid(which="major", color="gray", linestyle="--", lw=0.5, alpha=0.8)
-    ax[0].set_xlabel("Standard Residual", fontsize=12.5)
-    ax[0].set_ylabel("Frequency", fontsize=12.5)
-    ax[1].set_xlabel("Theoretical Quantiles", fontsize=12.5)
-    ax[1].set_ylabel("Sample Quantiles", fontsize=12.5)
-
-    # Annotate the plots with statistics
-    mean = np.round(np.mean(residuals), 4)
-    std = np.round(np.std(residuals), 4)
-    stat = np.round(stat, 4)
-    p_value = np.round(p_value, 4)
-
-    # Set the text annotations for the mean and std on the histogram
-    ax[0].text(
-        0.05,
-        0.95,
-        f"$\mu$ = {mean}\n$\sigma$ = {std}",
-        transform=ax[0].transAxes,
-        verticalalignment="top",
-        horizontalalignment="left",
-        fontsize=13.5,
-        color="k",
-    )
-
-    # Set the text annotations for the Shapiro-Wilk test on the Q-Q plot
-    ax[1].text(
-        0.05,
-        0.95,
-        f"$W$ = {stat}\n$p$ = {p_value}",
-        transform=ax[1].transAxes,
-        verticalalignment="top",
-        horizontalalignment="left",
-        fontsize=13.5,
-        color="k",
-    )
-
-    return fig, stat, p_value
diff --git a/s2generator/utils/visualization.py b/s2generator/utils/visualization.py