diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index cc0fb26..6803be6 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -29,7 +29,7 @@ jobs: runs-on: ubuntu-24.04 strategy: matrix: - python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] + python-version: ["3.10", "3.11", "3.12", "3.13"] steps: - uses: actions/checkout@v4 - uses: actions/setup-python@v5 @@ -46,7 +46,7 @@ jobs: runs-on: ubuntu-24.04 strategy: matrix: - python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] + python-version: ["3.10", "3.11", "3.12", "3.13"] steps: - uses: actions/checkout@v4 - uses: actions/setup-python@v5 diff --git a/.gitignore b/.gitignore index 0a19790..47f98d1 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +.vscode/ + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/README.md b/README.md index bded931..22e6746 100644 --- a/README.md +++ b/README.md @@ -1,20 +1,40 @@ -# Robust principle component analysis for Python +# Robust principal component analysis for Python -Work in progress. Coming soon here and on PyPI. +This package provides algorithms to solve the Robust Principal Component Analysis (RPCA) problem, as presented by Candès et al.[[1]](#candes2011). +Currently, a single algorithm is implemented: it solves the Principal Component Pursuit (PCP) convex relaxation of RPCA from the same paper, using the Inexact Augmented Lagrange Multiplier (IALM) method from Lin et al.[[2]](#lin2011)[[3]](#lin2013). + +## Example +```python +from pyrpca import rpca_pcp_ialm + +# given an m x n data matrix +data = ... + +# decide on sparsity factor. +# this parameter is also commonly known as `lambda`. +sparsity_factor = 1.0 / numpy.sqrt(max(data.shape)) + +# run the ialm algorithm. +low_rank, sparse = rpca_pcp_ialm(data, sparsity_factor) +``` ## Installing ```shell pip install pyrpca ``` -## Example -TODO +## Feature requests and contributing +Pull requests and feature requests are welcome. 
The current version is minimal and suits my personal needs, but feel free to make suggestions. Even if the repository looks inactive, I will still respond :) + +## References +1. [Emmanuel J. Candès, Xiaodong Li, Yi Ma, John Wright. Robust principal component analysis? Association for Computing Machinery 2011.](https://doi.org/10.1145/1970392.1970395) (preprint on [arXiv](https://doi.org/10.48550/arXiv.0912.3599)) +2. [Zhouchen Lin, Risheng Liu, Zhixun Su. Linearized Alternating Direction Method with Adaptive Penalty for Low-Rank Representation. arXiv 2011.](https://doi.org/10.48550/arXiv.1109.0367) +3. [Zhouchen Lin, Minming Chen, Yi Ma. The Augmented Lagrange Multiplier Method for Exact Recovery of Corrupted Low-Rank Matrices, V3. arXiv 2013.](https://doi.org/10.48550/arXiv.1009.5055) -## Acknowledgements -Appreciation is due to various other Python implementations of RPCA; below is a non-exhaustive list. -The code in this project has been loosely inspired by these works. +## Acknowledgements +Appreciation is due to various other Python implementations of RPCA that served as inspiration for this project. Below is a non-exhaustive list: -- https://github.com/2020leon/rpca -- https://github.com/dganguli/robust-pca -- https://github.com/weilinear/PyRPCA +- https://github.com/2020leon/rpca +- https://github.com/dganguli/robust-pca +- https://github.com/weilinear/PyRPCA - https://github.com/loiccoyle/RPCA diff --git a/pyproject.toml b/pyproject.toml index c9790b3..da1b54f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,8 +1,5 @@ [build-system] -requires = [ - "setuptools >= 78", - "setuptools-git-versioning >=2.1, <3", -] +requires = ["setuptools >= 78", "setuptools-git-versioning >=2.1, <3"] build-backend = "setuptools.build_meta" [tool.setuptools-git-versioning] @@ -10,41 +7,32 @@ enabled = true [project] name = "pyrpca" -description = "Robust principle component analysis for Python." +description = "Robust principal component analysis for Python." 
dynamic = ["version"] readme = "README.md" -requires-python = ">=3.9,<4" +requires-python = ">=3.10,<4" classifiers = [ "License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)", ] -dependencies = [ - "numpy >=2.0.2, <3", -] -keywords = [ - "rpca", - "robust pca", - "robust principal component analysis", -] -authors = [ - {name = "Aart Stuurman", email = "aart@astuurman.com"}, -] -maintainers = [ - {name = "Aart Stuurman", email = "aart@astuurman.com"}, -] -license = {file = "LICENSE"} +dependencies = ["numpy >= 2.0.2, < 3", "scipy >= 1.15.2"] +keywords = ["rpca", "robust pca", "robust principal component analysis"] +authors = [{ name = "Aart Stuurman", email = "aart@astuurman.com" }] +maintainers = [{ name = "Aart Stuurman", email = "aart@astuurman.com" }] +license = { file = "LICENSE" } [project.optional-dependencies] dev = [ "ruff == 0.11.2", "mypy == 1.15.0", "pytest == 8.3.5", + "scipy-stubs >= 1.15.2", ] [project.urls] homepage = "https://github.com/surgura/PyRPCA" [tool.setuptools] -package-dir = {"pyrpca" = "pyrpca"} +package-dir = { "pyrpca" = "pyrpca" } [tool.mypy] strict = true diff --git a/pyrpca/__init__.py b/pyrpca/__init__.py index e69de29..573ceb6 100644 --- a/pyrpca/__init__.py +++ b/pyrpca/__init__.py @@ -0,0 +1,3 @@ +from .pcp_ialm import rpca_pcp_ialm + +__all__ = ["rpca_pcp_ialm"] diff --git a/pyrpca/pcp_ialm.py b/pyrpca/pcp_ialm.py new file mode 100644 index 0000000..084d054 --- /dev/null +++ b/pyrpca/pcp_ialm.py @@ -0,0 +1,85 @@ +import numpy.typing as npt +from typing import Tuple +import numpy as np +from numpy.linalg import norm +from scipy.linalg import svd + + +def rpca_pcp_ialm( + observations: npt.ArrayLike, + sparsity_factor: float, + max_iter: int = 1000, + mu: float | None = None, + mu_upper_bound: float | None = None, + rho: float = 1.5, + tol: float = 1e-7, + verbose: bool = True, +) -> Tuple[npt.ArrayLike, npt.ArrayLike]: + """ + Solve the Principal Component Pursuit (PCP) convex relaxation of Robust PCA using the Inexact 
Augmented Lagrange Multiplier (IALM) method. + + See README for algorithmic details and references. + + Mu is updated every loop by multiplying it by `rho` until reaching `mu_upper_bound`. + + Parameters: + observations: The m x n input matrix to decompose ('D' in the IALM paper). + sparsity_factor: Weight on the sparse term in the objective ('lambda' in the IALM paper). + max_iter: Maximum number of iterations to perform. + mu: Initial value for the penalty parameter. If None, defaults to 1/spectral norm of observations. + mu_upper_bound: Maximum allowed value for `mu`. If None, defaults to `mu * 1e7`. + rho: Multiplicative factor to increase `mu` in each iteration. + tol: Tolerance for stopping criterion (relative Frobenius norm of the residual). + verbose: If True, print status and debug information during optimization. + + Returns: + low_rank_component: The recovered low-rank matrix ('A' in the IALM paper). + sparse_component: The recovered sparse matrix ('E' in the IALM paper). + """ + if mu is None: + mu = float(1.25 / norm(observations, ord=2)) + if mu_upper_bound is None: + mu_upper_bound = mu * 1e7 + + norm_fro_obs = norm(observations, ord="fro") + + dual = observations / np.maximum( + norm(observations, ord=2), norm(observations, ord=np.inf) / sparsity_factor + ) + sparse = np.zeros_like(observations) + + i = 0 + while True: + # compute next iteration of a + u, s, v = svd(observations - sparse + 1.0 / mu * dual, full_matrices=False) + s_thresholded = np.maximum(s - 1.0 / mu, 0) + low_rank = (u * s_thresholded) @ v + + # compute next iteration of e + residual_for_sparse = observations - low_rank + 1.0 / mu * dual + sparse = np.sign(residual_for_sparse) * np.maximum( + np.abs(residual_for_sparse) - sparsity_factor / mu, 0 + ) + + # calculate error + residual = observations - low_rank - sparse + err = norm(residual, ord="fro") / norm_fro_obs + + i += 1 + + if verbose: + print(f"iter {i:<4} | err {err:<25} | mu {mu:<25}") + + if err < tol: + if verbose: + 
print("Finished optimization. Error smaller than tolerance.") + break + if i == max_iter: + if verbose: + print("Finished optimization. Max iterations reached.") + break + + # update dual and mu + dual = dual + mu * (residual) + mu = min(mu * rho, mu_upper_bound) + return low_rank, sparse diff --git a/tests/test_dummy.py b/tests/test_dummy.py deleted file mode 100644 index 5514af1..0000000 --- a/tests/test_dummy.py +++ /dev/null @@ -1,12 +0,0 @@ -""" -Dummy test file to see the tests run properly. - -To be removed after an actual test is added. -""" - - -def test_dummpy() -> None: - """ - Dummy test. - """ - pass diff --git a/tests/test_pcp_ialm.py b/tests/test_pcp_ialm.py new file mode 100644 index 0000000..33fb10d --- /dev/null +++ b/tests/test_pcp_ialm.py @@ -0,0 +1,43 @@ +import numpy as np +from pyrpca import rpca_pcp_ialm +from numpy.linalg import norm +from scipy.sparse import random as sparse_random + + +def test_rpca_separates_low_rank_and_sparse(): + np.random.seed(0) + m, n, rank = 500, 400, 5 + + # create low-rank matrix A + u = np.random.randn(m, rank) + v = np.random.randn(rank, n) + low_rank = u @ v + + # create sparse matrix E + sparse = sparse_random( + m, n, density=0.1, format="csr", data_rvs=np.random.randn + ).toarray() + + # create observation matrix + observations = low_rank + sparse + + # Run RPCA + low_rank_recovered, sparse_recovered = rpca_pcp_ialm( + observations, + sparsity_factor=1.0 / np.sqrt(max(observations.shape)), + ) + + # check that the reconstruction is close + reconstruction_error = norm( + observations - (low_rank_recovered + sparse_recovered), ord="fro" + ) / norm(observations, ord="fro") + assert reconstruction_error < 1e-6, ( + f"Reconstruction error too high: {reconstruction_error}" + ) + + # check that recovered matrices are low rank and sparse + approx_rank = np.linalg.matrix_rank(low_rank_recovered, tol=1e-3) + sparsity = np.count_nonzero(sparse_recovered) / sparse_recovered.size + + assert approx_rank <= rank + 2, 
f"Recovered A not low rank: {approx_rank}" + assert sparsity < 0.2, f"Recovered E not sparse: sparsity={sparsity}"