diff --git a/.gitignore b/.gitignore index bb18211..0f50078 100644 --- a/.gitignore +++ b/.gitignore @@ -181,3 +181,4 @@ CLAUDE.local.md # package version *_version.py /docs/tutorials/SoundscapyResults.xlsx +CLAUDE.md diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index aba711b..74d98d3 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -17,10 +17,7 @@ repos: rev: v1.5.5 hooks: - id: forbid-tabs - - repo: https://github.com/pappasam/toml-sort - rev: v0.24.2 - hooks: - - id: toml-sort-fix + # TODO(MitchellAcoustics): Enable linting pre-commit hooks # https://github.com/MitchellAcoustics/Soundscapy/issues/114 # - repo: https://github.com/pre-commit/mirrors-mypy diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..1440a26 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,88 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Project Overview + +Soundscapy is a Python library for analysing and visualising soundscape assessments. It supports survey data processing, circumplex (ISO) visualisation, binaural audio processing, and predicted soundscape perception via R-backed models. 
+ +## Commands + +### Setup + +```bash +uv sync --all-extras # Install all dependencies (core + audio + r/satp/spi) +uv sync # Core dependencies only (uses default groups: dev, docs, test) +uv sync --extra audio # Core + audio extras +uv sync --extra r # Core + R integration (rpy2) +``` + +### Testing + +```bash +uv run pytest # Run all available tests (auto-skips missing dep modules) +uv run pytest test/test_surveys.py # Run a single test file +uv run pytest -k "test_name" # Run tests matching a pattern +uv run pytest --no-cov # Skip coverage measurement + +# Tox environments for full multi-dep testing +uv run tox -e py312-core # Core only +uv run tox -e py312-audio # Core + audio +uv run tox -e py312-r # Core + R (requires R installed) +uv run tox -e py312-all # All extras +``` + +### Linting & Formatting + +```bash +uv run ruff check . # Lint +uv run ruff check . --fix # Lint and auto-fix +uv run ruff format . # Format +``` + +### Docs + +```bash +uv run mkdocs build # Build documentation +uv run mkdocs serve # Serve docs locally +``` + +## Architecture + +### Source Layout (`src/soundscapy/`) + +The package uses an `src/` layout with `importlib` import mode. 
Modules are split into **core** (always available) and **optional** (gated by extras): + +- **Core**: `surveys/`, `plotting/`, `databases/` — always imported in `__init__.py` +- **Optional audio**: `audio/` — `try/except` import in `__init__.py`; requires `soundscapy[audio]` +- **Optional R-backed**: `spi/`, `satp/`, `r_wrapper/` — lazily loaded via module-level `__getattr__` (PEP 562) to avoid starting the R process on `import soundscapy` + +### Key Modules + +- **`surveys/`**: PAQ (Perceived Affective Quality) data processing, ISO coordinate calculations for the circumplex model (`ISOPleasant`, `ISOEventful` axes), data quality checks +- **`plotting/`**: `ISOPlot` — fluent API (method chaining) for circumplex plots using seaborn/matplotlib; also Likert scale plots +- **`databases/`**: Loaders for ISD (International Soundscape Database), SATP, and ARAUS datasets +- **`audio/`**: `Binaural` class (extends `acoustic_toolbox.Signal`) for psychoacoustic metrics from mosqito, maad, acoustic_toolbox +- **`satp/`**: `CircE` model — R-backed circumplex SEM analysis via `rpy2`; wraps the [CircE R package](https://github.com/MitchellAcoustics/CircE-R) +- **`spi/`**: Soundscape Perception Index using multi-skew normal distributions; also R-backed +- **`r_wrapper/`**: Low-level `rpy2` wrappers for CircE and SN (skew-normal) R packages + +### Dependency System + +Optional dependencies are managed at multiple levels: + +1. **Module-level**: Each optional module's `__init__.py` imports its deps; failure surfaces clearly +2. **Package `__init__.py`**: Audio uses `try/except`; R modules use lazy `__getattr__` +3. **pytest**: `conftest.py` auto-skips collection for optional module directories (`audio/`, `r_wrapper/`, `spi/`, `satp/`) when deps are absent; use `@pytest.mark.optional_deps('group')` for integration tests + +### Configuration + +All tool config lives in `pyproject.toml` (ruff, mypy, pytest, coverage, setuptools-scm). 
Version is managed by `setuptools-scm` and written to `src/soundscapy/_version.py` — do not edit this file manually. + +## Development Conventions + +- **Branching**: feature branches → `dev` → `main` (only stable releases on `main`) +- **Commits**: Angular format (`feat:`, `fix:`, `docs:`, `refactor:`, `test:`, `chore:`) +- **R package testing**: The `sn` and `CircE` R packages must be installed separately via `pak` before running R-dependent tests (see `tox.ini` `commands_pre`) +- **xdoctest**: Doctests in source files run automatically with pytest via `--xdoctest`; write doctests using `Example:` sections +- **Logging**: Uses `loguru`; disabled by default for library use (`logger.disable("soundscapy")`). Expose via `soundscapy.enable_debug()` / `setup_logging()` diff --git a/src/soundscapy/__init__.py b/src/soundscapy/__init__.py index 19d70b0..a13504b 100644 --- a/src/soundscapy/__init__.py +++ b/src/soundscapy/__init__.py @@ -29,7 +29,7 @@ get_logger, setup_logging, ) -from soundscapy.surveys import add_iso_coords, processing, rename_paqs +from soundscapy.surveys import add_iso_coords, ipsatize, processing, rename_paqs from soundscapy.surveys.survey_utils import PAQ_IDS, PAQ_LABELS __all__ = [ @@ -44,6 +44,7 @@ "disable_logging", "enable_debug", "get_logger", + "ipsatize", "isd", "iso_plot", "jointplot", @@ -107,7 +108,14 @@ } ) _SATP_ATTRS: frozenset[str] = frozenset( - {"satp", "fit_circe", "CircModelE", "normalize_polar_angles"} + { + "satp", + "CircE", + "CircEResults", + "CircModelE", + "fit_circe", + "normalize_polar_angles", + } ) diff --git a/src/soundscapy/satp/__init__.py b/src/soundscapy/satp/__init__.py index f000b09..353be03 100644 --- a/src/soundscapy/satp/__init__.py +++ b/src/soundscapy/satp/__init__.py @@ -21,6 +21,7 @@ from soundscapy.satp import circe from soundscapy.satp.circe import ( CircE, + CircEResults, CircModelE, fit_circe, normalize_polar_angles, @@ -29,6 +30,7 @@ __all__ = [ "CircE", + "CircEResults", "CircModelE", "circe", 
"fit_circe", diff --git a/src/soundscapy/satp/circe.py b/src/soundscapy/satp/circe.py index fcb4ea9..47d913e 100644 --- a/src/soundscapy/satp/circe.py +++ b/src/soundscapy/satp/circe.py @@ -31,18 +31,19 @@ import dataclasses import warnings from enum import Enum -from typing import Any +from typing import Any, Literal import numpy as np import pandas as pd import pandera.pandas as pa from pandera import Field +from pandera.errors import SchemaErrors from pandera.typing.pandas import DataFrame, Series - from rpy2.rinterface_lib.embedded import RRuntimeError import soundscapy.r_wrapper as sspyr from soundscapy import PAQ_IDS, PAQ_LABELS, get_logger +from soundscapy.surveys.processing import ipsatize logger = get_logger() @@ -113,7 +114,7 @@ class SATPSchema(pa.DataFrameModel): class Config: """Configuration for the schema validation behavior.""" - drop_invalid_rows = True + drop_invalid_rows = False strict = "filter" @pa.dataframe_parser @@ -430,39 +431,118 @@ def to_dict(self) -> dict[str, Any]: return base -def person_center(data: pd.DataFrame, by: str = "participant") -> pd.DataFrame: +@dataclasses.dataclass +class CircEResults: """ - Center PAQ ratings within each participant (column-wise within-person centering). + Collection of fitted CircE models returned by :func:`fit_circe`. + + Holds both successfully-fitted :class:`CircE` instances and any error rows + from models that failed to converge. Access the full tidy DataFrame via + :attr:`table`; access individual model results via :meth:`for_model`. + + Attributes + ---------- + models + Successfully-fitted :class:`CircE` results, in fitting order. + language + Language code passed to :func:`fit_circe`. + datasource + Dataset identifier passed to :func:`fit_circe`. + error_rows + Dicts for model runs that raised an exception during fitting. + Each dict contains ``language``, ``datasource``, ``model``, ``n``, + and an ``error`` key with the exception message. 
+ + """ + + models: list[CircE] + language: str + datasource: str + error_rows: list[dict] = dataclasses.field(default_factory=list) + + def __len__(self) -> int: + """Total number of model runs (successful + failed).""" + return len(self.models) + len(self.error_rows) + + @property + def table(self) -> pd.DataFrame: + """ + Full tidy DataFrame of all model fit statistics. + + One row per model (including error rows). Columns match those + described in :func:`fit_circe`. Integer columns (``n``, ``d``, ``m``) + use pandas nullable ``Int64`` dtype so that ``None`` in error rows does + not promote the whole column to ``float64``. + """ + _order = {m.value: i for i, m in enumerate(CircModelE)} + rows = [m.to_dict() for m in self.models] + self.error_rows + result = pd.DataFrame(rows) + if "model" in result.columns: + result = ( + result.assign(_ord=result["model"].map(_order)) + .sort_values("_ord") + .drop("_ord", axis=1) + .reset_index(drop=True) + ) + for _int_col in ("n", "d", "m"): + if _int_col in result.columns: + result[_int_col] = result[_int_col].astype(pd.Int64Dtype()) + return result + + def for_model(self, model: CircModelE) -> CircE: + """ + Return the fitted :class:`CircE` result for a specific model type. + + Parameters + ---------- + model + The :class:`CircModelE` variant to retrieve. + + Raises + ------ + KeyError + If no successful result exists for the requested model (e.g. it + failed to converge). 
+ + """ + for m in self.models: + if m.model is model: + return m + msg = f"No successful result for model {model.value!r}" + raise KeyError(msg) + + def _repr_html_(self) -> str: + return self.table._repr_html_() + + def __repr__(self) -> str: + return ( + f"CircEResults(language={self.language!r}, " + f"datasource={self.datasource!r}, " + f"{len(self.models)} fitted, {len(self.error_rows)} failed)" + ) - **Psychometric background** - In cross-cultural and multi-lingual soundscape studies, participants from - different language groups may use rating scales differently — some cultures - favour extreme responses; others cluster near the midpoint. These - *response-style biases* inflate between-person variance and can distort the - correlation structure that circumplex SEM models depend on. +def person_center(data: pd.DataFrame, by: str = "participant") -> pd.DataFrame: + """ + Center PAQ ratings within each participant (column-wise within-person centering). - Within-person centering addresses this by removing each participant's - *scale-specific* mean: for every PAQ column independently, the participant's - mean across all their soundscape observations is subtracted. The result is - that every participant has a zero mean on every PAQ scale, so the residual - variance reflects genuine perceptual variation across soundscapes rather - than individual scale-use tendencies. + .. deprecated:: + Use :func:`soundscapy.surveys.ipsatize` with ``method="column_wise"`` + instead. For the centering that matches the published SATP analysis, + use ``method="grand_mean"`` (the default of + :func:`~soundscapy.surveys.ipsatize`). - This is the form of centering recommended for the SATP circumplex analysis - (Aletta et al., 2024) and corresponds to the ``ipsatize`` preprocessing step - in the original R implementation. 
+ This function applies **column-wise** centering: for every PAQ column + independently, each participant's mean across their observations is + subtracted (8 centering scalars per participant). .. note:: - This is **column-wise** within-participant centering. It is distinct - from *row-wise* ipsatization (e.g. ``circumplex.ipsatize()``), which - subtracts the mean across all PAQ items within a single observation. - Row-wise centering removes the general impression of each soundscape; - column-wise centering removes the participant's personal use of each - scale. Use this function when participants have multiple observations - (one per soundscape) and the goal is to remove person-level scale-use - biases before computing a correlation matrix. + This is *not* the centering described in the original SATP R + implementation (Aletta et al., 2024), which applies grand-mean + centering (one scalar per participant across all PAQ columns and + observations). Use :func:`~soundscapy.surveys.ipsatize` with + ``method="grand_mean"`` to match the R reference implementation. Parameters ---------- @@ -475,13 +555,17 @@ def person_center(data: pd.DataFrame, by: str = "participant") -> pd.DataFrame: ------- pd.DataFrame DataFrame containing only the PAQ columns (not ``by``), with - participant-centred values. The ``by`` column is excluded from the - result because arithmetic centering is undefined for string identifiers. + column-wise participant-centred values. 
+ """ - paq_cols = [c for c in data.columns if c != by] - means = data.groupby(by)[paq_cols].transform("mean") - return data[paq_cols] - means + import warnings + + warnings.warn( + "person_center() is deprecated; use ipsatize(method='column_wise') instead.", + DeprecationWarning, + stacklevel=2, + ) + return ipsatize(data, method="column_wise", participant_col=by) def fit_circe( @@ -491,13 +575,15 @@ def fit_circe( *, models: list[CircModelE] | None = None, center_by_participant: bool = True, -) -> pd.DataFrame: + errors: Literal["raise", "warn"] = "raise", +) -> "CircEResults": """ - Fit circumplex SEM models to PAQ data and return a tidy DataFrame. + Fit circumplex SEM models to PAQ data and return a :class:`CircEResults`. - Validates input data, optionally applies within-person centering, computes a - complete-case correlation matrix, and fits the requested circumplex - model types using Browne's BFGS optimisation via the R ``CircE`` package. + Validates input data, optionally applies grand-mean within-person centering + (matching the published SATP analysis), computes a complete-case correlation + matrix, and fits the requested circumplex model types using Browne's BFGS + optimisation via the R ``CircE`` package. Parameters ---------- @@ -513,21 +599,38 @@ def fit_circe( Stored in the results; not used for computation. models List of model types to fit. Default: all four ``CircModelE`` variants. - Passing ``[]`` returns an empty DataFrame with no columns. + Passing ``[]`` returns an empty :class:`CircEResults` + (``len(result) == 0``). center_by_participant - Whether to apply within-person centering (via :func:`person_center`) - before fitting. Set to ``False`` if the data is already centered. + Whether to apply grand-mean within-person centering (via + :func:`~soundscapy.surveys.ipsatize` with ``method="grand_mean"``) + before fitting. Set to ``False`` if the data is already centered or + if no centering is desired. 
+ errors + How to handle rows that fail schema validation (PAQ values outside + ``[0, 100]``, missing required columns, etc.): + + ``"raise"`` *(default)* — raise a :class:`pandera.errors.SchemaErrors` + immediately, listing every failing row and constraint. + + ``"warn"`` — emit a :class:`UserWarning` reporting how many rows were + dropped and continue with the valid rows only. + + .. note:: + If you pass *already-centered* data, set + ``center_by_participant=False`` to skip the internal centering step; + otherwise pass raw ``[0, 100]``-range data and use the default + ``center_by_participant=True``. Passing pre-centered data without + disabling centering will cause schema validation to reject the + negative values. Returns ------- - pd.DataFrame - One row per fitted model. Columns: ``datasource``, ``language``, - ``model``, ``n``, ``m``, ``chisq``, ``d``, ``p``, ``cfi``, ``gfi``, - ``agfi``, ``srmr``, ``mcsc``, ``rmsea``, ``rmsea_l``, ``rmsea_u``, - ``gdiff``, ``PAQ1``-``PAQ8``. - ``PAQ1``-``PAQ8`` contain fitted polar angle estimates for free-angle - models (UNCONSTRAINED, EQUAL_COM); ``None`` for constrained models. - Rows for models that fail to converge contain an ``error`` column. + CircEResults + Collection of fitted models. Access the tidy DataFrame via + ``.table``; access individual model results via ``.for_model()``. + Failed models are stored in ``.error_rows`` and included in + ``.table``. Examples -------- @@ -535,8 +638,8 @@ def fit_circe( >>> from soundscapy.satp import fit_circe >>> data = sspy.isd.load() >>> data = data.rename(columns={'SessionID': 'participant'}) - >>> results = fit_circe(data, language='eng', datasource='ISD') # center_by_participant=True by default - >>> results.shape[0] + >>> results = fit_circe(data, language='eng', datasource='ISD', errors='warn') + >>> len(results) 4 """ @@ -551,14 +654,40 @@ def fit_circe( "Check that data contains valid rows with PAQ1-PAQ8 columns." 
) raise ValueError(msg) - validated = SATPSchema.validate(data, lazy=True) + + try: + validated = SATPSchema.validate(data, lazy=True) + except SchemaErrors as exc: + if errors == "raise": + raise + bad_idx = exc.failure_cases["index"].dropna().unique() + clean = data.loc[~data.index.isin(bad_idx)] + warnings.warn( + f"Dropping {len(data) - len(clean)} rows that failed schema validation " + f"({len(clean)} rows remain). " + "Pass errors='raise' to raise an error instead.", + UserWarning, + stacklevel=2, + ) + try: + validated = SATPSchema.validate(clean, lazy=True) + except SchemaErrors as exc2: + raise SchemaErrors( + schema_errors=exc2.schema_errors, + data=exc2.data, + ) from exc2 + if center_by_participant and "participant" not in validated.columns: msg = ( "center_by_participant=True requires a 'participant' column. " "Pass center_by_participant=False if your data is already centered." ) raise ValueError(msg) - processed = person_center(validated) if center_by_participant else validated + processed = ( + ipsatize(validated, method="grand_mean", participant_col="participant") + if center_by_participant + else validated + ) # Use listwise deletion (complete cases only) — consistent with R's na.omit(). complete = processed[PAQ_IDS].dropna() @@ -572,18 +701,19 @@ def fit_circe( corr = complete.corr() circ_models = models if models is not None else list(CircModelE) - rows: list[dict] = [] + fitted: list[CircE] = [] + error_rows: list[dict] = [] fit_exceptions = (ValueError, np.linalg.LinAlgError, RuntimeError, RRuntimeError) for model in circ_models: try: circe = CircE.compute_bfgs_fit(corr, n, datasource, language, model) - rows.append(circe.to_dict()) + fitted.append(circe) except fit_exceptions as e: # noqa: PERF203 warnings.warn(f"{model.value} raised {e}", stacklevel=2) # Populate all expected columns with None so that pandas does not # promote numeric columns (e.g. n, d) to float64 across all rows # when mixing sparse error dicts with full success dicts. 
- rows.append( + error_rows.append( { "language": language, "datasource": datasource, @@ -607,10 +737,9 @@ def fit_circe( } ) - result = pd.DataFrame(rows) - # Use pandas nullable integer dtype for degree-of-freedom columns so that - # None in error rows does not promote the whole column to float64. - for _int_col in ("n", "d", "m"): - if _int_col in result.columns: - result[_int_col] = result[_int_col].astype(pd.Int64Dtype()) - return result + return CircEResults( + models=fitted, + language=language, + datasource=datasource, + error_rows=error_rows, + ) diff --git a/src/soundscapy/surveys/__init__.py b/src/soundscapy/surveys/__init__.py index 5be2b61..415bf78 100644 --- a/src/soundscapy/surveys/__init__.py +++ b/src/soundscapy/surveys/__init__.py @@ -9,6 +9,7 @@ from soundscapy.surveys.processing import ( add_iso_coords, calculate_iso_coords, + ipsatize, return_paqs, simulation, ) @@ -25,6 +26,7 @@ "PAQ_LABELS", "add_iso_coords", "calculate_iso_coords", + "ipsatize", "processing", "rename_paqs", "return_paqs", diff --git a/src/soundscapy/surveys/processing.py b/src/soundscapy/surveys/processing.py index d25a690..c016f35 100644 --- a/src/soundscapy/surveys/processing.py +++ b/src/soundscapy/surveys/processing.py @@ -23,7 +23,7 @@ import warnings from dataclasses import dataclass -from typing import TypedDict +from typing import Literal, TypedDict try: from typing import Unpack @@ -618,6 +618,103 @@ def _r2_score(y_true: np.ndarray, y_pred: np.ndarray) -> float: return float(1 - (ss_residual / ss_total)) +def ipsatize( + data: pd.DataFrame, + method: Literal["grand_mean", "column_wise", "row_wise"] = "grand_mean", + participant_col: str = "participant", + scales: list[str] | None = None, +) -> pd.DataFrame: + """ + Participant-level ipsatization for circumplex analysis. + + Removes systematic response biases before computing a correlation matrix. + The choice of method depends on the study design and the type of bias + being corrected. 
+ + Parameters + ---------- + data + DataFrame containing PAQ scale columns and (for participant-level + methods) a grouping column. + method + Centering strategy: + + ``"grand_mean"`` *(default)* — one scalar per participant: the mean + across *all* PAQ values and *all* observations for that participant. + Removes overall response-level differences between participants. + **Matches the published SATP analysis (Aletta et al., 2024) and the + original R implementation.** + + ``"column_wise"`` — eight scalars per participant: the per-scale mean + across that participant's observations. Removes scale-specific + response biases. This is the behaviour of the legacy + :func:`person_center` function. + + ``"row_wise"`` — one scalar per observation: the mean across all PAQ + scales within that observation. Removes the general impression of + each individual soundscape stimulus. Equivalent to + ``circumplex.ipsatize()``. + participant_col + Column used to group observations by participant. Required for + ``"grand_mean"`` and ``"column_wise"``; ignored for ``"row_wise"``. + scales + PAQ column names to centre. Defaults to :data:`PAQ_IDS` when + ``None``. + + Returns + ------- + pd.DataFrame + DataFrame containing only the scale columns with centred values. + The ``participant_col`` grouping column is excluded from the result. + + Raises + ------ + KeyError + If ``participant_col`` is not present in ``data`` when + ``method`` is ``"grand_mean"`` or ``"column_wise"``. + + Examples + -------- + >>> import pandas as pd + >>> data = pd.DataFrame({ + ... 'PAQ1': [50., 60., 40., 30.], 'PAQ2': [50., 60., 40., 30.], + ... 'PAQ3': [50., 60., 40., 30.], 'PAQ4': [50., 60., 40., 30.], + ... 'PAQ5': [50., 60., 40., 30.], 'PAQ6': [50., 60., 40., 30.], + ... 'PAQ7': [50., 60., 40., 30.], 'PAQ8': [50., 60., 40., 30.], + ... 'participant': ['A', 'A', 'B', 'B'], + ... 
}) + >>> result = ipsatize(data, method="grand_mean") + >>> result['PAQ1'].tolist() + [-5.0, 5.0, 5.0, -5.0] + + """ + _scales = scales if scales is not None else PAQ_IDS + + if method == "column_wise": + means = data.groupby(participant_col)[_scales].transform("mean") + return data[_scales] - means + + if method == "grand_mean": + # Compute a single scalar per participant: mean across all PAQ values + # and all observations for that participant. Use nanmean so that + # participants with partial NaN data still get a valid grand mean + # computed from their non-NaN values; NaN rows are then removed by + # downstream listwise deletion rather than silently expanding data loss + # to the whole participant. + grand_means = data.groupby(participant_col)[_scales].apply( + lambda df: float(np.nanmean(df.values)) + ) + grand_mean_per_row = data[participant_col].map(grand_means) + return data[_scales].subtract(grand_mean_per_row, axis=0) + + if method == "row_wise": + row_means = data[_scales].mean(axis=1) + return data[_scales].sub(row_means, axis=0) + + msg = f"method must be 'grand_mean', 'column_wise', or 'row_wise'; got {method!r}" + raise ValueError(msg) + + if __name__ == "__main__": import doctest diff --git a/test/satp/test_circe.py b/test/satp/test_circe.py index fa48467..01bceb1 100644 --- a/test/satp/test_circe.py +++ b/test/satp/test_circe.py @@ -522,7 +522,8 @@ def test_person_center_per_participant_mean_zero(self, isd_with_participant): from soundscapy.satp.circe import person_center from soundscapy.surveys.survey_utils import PAQ_IDS - centered = person_center(isd_with_participant, by="participant") + with pytest.warns(DeprecationWarning, match="person_center"): + centered = person_center(isd_with_participant, by="participant") # person_center returns only PAQ columns; re-attach participant for groupby. 
check = centered[PAQ_IDS].assign( participant=isd_with_participant["participant"] @@ -531,11 +532,11 @@ def test_person_center_per_participant_mean_zero(self, isd_with_participant): np.testing.assert_allclose(group_means.to_numpy(), 0.0, atol=1e-10) def test_fit_circe_returns_dataframe(self, isd_with_participant): - """fit_circe() must return a pd.DataFrame.""" - from soundscapy.satp.circe import fit_circe + """fit_circe() must return a CircEResults.""" + from soundscapy.satp.circe import CircEResults, fit_circe result = fit_circe(isd_with_participant, language="EN", datasource="ISD") - assert isinstance(result, pd.DataFrame) + assert isinstance(result, CircEResults) def test_fit_circe_returns_four_rows(self, isd_with_participant): """fit_circe() with default models must return 4 rows (one per model).""" @@ -550,7 +551,7 @@ def test_fit_circe_model_column_contains_all_variants(self, isd_with_participant result = fit_circe(isd_with_participant, language="EN", datasource="ISD") expected = {m.value for m in CircModelE} - assert set(result["model"]) == expected + assert set(result.table["model"]) == expected def test_fit_circe_numeric_fit_indices(self, isd_with_participant): """chisq, cfi, rmsea must be numeric floats (not None or NaN) in all rows.""" @@ -558,8 +559,8 @@ def test_fit_circe_numeric_fit_indices(self, isd_with_participant): result = fit_circe(isd_with_participant, language="EN", datasource="ISD") for col in ("chisq", "cfi", "rmsea", "d"): - assert result[col].notna().all(), f"Column '{col}' has NaN values" - assert pd.api.types.is_numeric_dtype(result[col]), ( + assert result.table[col].notna().all(), f"Column '{col}' has NaN values" + assert pd.api.types.is_numeric_dtype(result.table[col]), ( f"Column '{col}' is not numeric" ) @@ -568,17 +569,18 @@ def test_fit_circe_p_value_formula(self, isd_with_participant): from soundscapy.satp.circe import fit_circe result = fit_circe(isd_with_participant, language="EN", datasource="ISD") - for _, row in 
result.iterrows(): + for _, row in result.table.iterrows(): expected_p = scipy_chi2.sf(row["chisq"], row["d"]) assert pytest.approx(row["p"], rel=1e-6) == expected_p def test_fit_circe_n_uses_listwise_deletion(self, isd_with_participant): """ - N in results must equal len(data[PAQ_IDS].dropna()) after person-centering. + N in results must equal len(data[PAQ_IDS].dropna()) after grand-mean centering. Introducing NaN rows verifies listwise deletion is applied. """ - from soundscapy.satp.circe import fit_circe, person_center + from soundscapy.satp.circe import fit_circe + from soundscapy.surveys import ipsatize from soundscapy.surveys.survey_utils import PAQ_IDS # Introduce NaN in one PAQ column for a single participant's rows @@ -587,15 +589,23 @@ def test_fit_circe_n_uses_listwise_deletion(self, isd_with_participant): mask = data_with_nan["participant"] == first_participant data_with_nan.loc[mask, "PAQ1"] = np.nan - result = fit_circe(data_with_nan, language="EN", datasource="ISD") + # errors="warn" so NaN rows are dropped rather than raising SchemaErrors; + # this test is specifically about listwise deletion, not schema validation. + result = fit_circe( + data_with_nan, language="EN", datasource="ISD", errors="warn" + ) - # Manually compute expected n - centered = person_center(data_with_nan, by="participant") + # Manually compute expected n — mirror the production path: schema + # validation drops NaN rows first, then ipsatize is called on the + # valid subset. Using the unfiltered data would give a different + # grand-mean scalar and diverge from what fit_circe actually computes. 
+ valid = data_with_nan.dropna(subset=PAQ_IDS) + centered = ipsatize(valid, method="grand_mean", participant_col="participant") expected_n = len(centered[PAQ_IDS].dropna()) # All rows should report the same n - assert (result["n"] == expected_n).all(), ( - f"n={result['n'].unique()} but expected {expected_n}" + assert (result.table["n"] == expected_n).all(), ( + f"n={result.table['n'].unique()} but expected {expected_n}" ) def test_fit_circe_subset_of_models(self, isd_with_participant): @@ -609,7 +619,7 @@ def test_fit_circe_subset_of_models(self, isd_with_participant): models=[CircModelE.UNCONSTRAINED, CircModelE.CIRCUMPLEX], ) assert len(result) == 2 - assert set(result["model"]) == { + assert set(result.table["model"]) == { CircModelE.UNCONSTRAINED.value, CircModelE.CIRCUMPLEX.value, } @@ -619,7 +629,7 @@ def test_fit_circe_rmsea_bounds_ordering(self, isd_with_participant): from soundscapy.satp.circe import fit_circe result = fit_circe(isd_with_participant, language="EN", datasource="ISD") - for _, row in result.iterrows(): + for _, row in result.table.iterrows(): assert row["rmsea_l"] <= row["rmsea"], ( f"{row['model']}: rmsea_l ({row['rmsea_l']}) > rmsea ({row['rmsea']})" ) @@ -628,8 +638,8 @@ def test_fit_circe_rmsea_bounds_ordering(self, isd_with_participant): ) def test_fit_circe_no_person_centering(self, isd_with_participant): - """center_by_participant=False must run without error and return a 4-row DataFrame.""" - from soundscapy.satp.circe import fit_circe + """center_by_participant=False must run without error and return a CircEResults.""" + from soundscapy.satp.circe import CircEResults, fit_circe result = fit_circe( isd_with_participant, @@ -637,17 +647,17 @@ def test_fit_circe_no_person_centering(self, isd_with_participant): datasource="ISD", center_by_participant=False, ) - assert isinstance(result, pd.DataFrame) + assert isinstance(result, CircEResults) assert len(result) == 4 def test_fit_circe_empty_models_returns_empty_df(self, 
isd_with_participant): - """fit_circe() with models=[] must return an empty DataFrame.""" - from soundscapy.satp.circe import fit_circe + """fit_circe() with models=[] must return an empty CircEResults.""" + from soundscapy.satp.circe import CircEResults, fit_circe result = fit_circe( isd_with_participant, language="EN", datasource="ISD", models=[] ) - assert isinstance(result, pd.DataFrame) + assert isinstance(result, CircEResults) assert len(result) == 0 def test_fit_circe_error_row_structure(self, isd_with_participant): @@ -668,7 +678,9 @@ def failing_fit(data_cor, n, datasource, language, circ_model) -> CircE: with patch.object(CircE, "compute_bfgs_fit", staticmethod(failing_fit)): result = fit_circe(isd_with_participant, language="EN", datasource="ISD") - error_rows = result[result["model"] == CircModelE.UNCONSTRAINED.value] + error_rows = result.table[ + result.table["model"] == CircModelE.UNCONSTRAINED.value + ] assert len(error_rows) == 1 row = error_rows.iloc[0] assert "error" in row.index @@ -733,7 +745,7 @@ def test_fit_circe_no_person_centering_no_participant(self): import warnings import soundscapy as sspy - from soundscapy.satp.circe import fit_circe + from soundscapy.satp.circe import CircEResults, fit_circe from soundscapy.surveys.survey_utils import PAQ_IDS data = sspy.isd.load()[PAQ_IDS].dropna() # no participant column @@ -742,7 +754,7 @@ def test_fit_circe_no_person_centering_no_participant(self): result = fit_circe( data, language="EN", datasource="ISD", center_by_participant=False ) - assert isinstance(result, pd.DataFrame) + assert isinstance(result, CircEResults) assert len(result) == 4 def test_fit_circe_person_centering_no_participant_raises(self): @@ -781,13 +793,81 @@ def failing_fit(data_cor, n, datasource, language, circ_model) -> CircE: # n, d, m must remain integer in success rows even when one row is an # error row (which pads those columns with None). pandas must not # promote the whole column to float64. 
- success_rows = result[result["model"] != CircModelE.UNCONSTRAINED.value] + success_rows = result.table[ + result.table["model"] != CircModelE.UNCONSTRAINED.value + ] for _, row in success_rows.iterrows(): for col in ("n", "d", "m"): assert isinstance(row[col], int | np.integer), ( f"{col} should be int in success row, got {type(row[col])}" ) + def test_fit_circe_returns_circe_results(self, isd_with_participant): + """fit_circe() must return a CircEResults instance.""" + from soundscapy.satp.circe import CircEResults, fit_circe + + result = fit_circe(isd_with_participant, language="EN", datasource="ISD") + assert isinstance(result, CircEResults) + + def test_fit_circe_table_is_dataframe(self, isd_with_participant): + """CircEResults.table must be a pd.DataFrame with 4 rows.""" + from soundscapy.satp.circe import fit_circe + + result = fit_circe(isd_with_participant, language="EN", datasource="ISD") + assert isinstance(result.table, pd.DataFrame) + assert len(result.table) == 4 + + def test_fit_circe_for_model(self, isd_with_participant): + """CircEResults.for_model() must return the correct CircE instance.""" + from soundscapy.satp.circe import CircE, CircModelE, fit_circe + + result = fit_circe(isd_with_participant, language="EN", datasource="ISD") + circe = result.for_model(CircModelE.UNCONSTRAINED) + assert isinstance(circe, CircE) + assert circe.model is CircModelE.UNCONSTRAINED + + def test_fit_circe_errors_warn_drops_invalid(self, isd_with_participant): + """errors='warn' must drop out-of-range rows and emit a warning.""" + import warnings as _warnings + + from soundscapy.satp.circe import fit_circe + + # Inject rows with out-of-range PAQ values (negative = post-centered) + data = isd_with_participant.copy() + # Add a few rows with PAQ1 > 100 (invalid for raw data) + bad_rows = data.iloc[:3].copy() + bad_rows["PAQ1"] = 150.0 + data_with_bad = pd.concat([data, bad_rows], ignore_index=True) + + with _warnings.catch_warnings(record=True) as w: + 
_warnings.simplefilter("always") + result = fit_circe( + data_with_bad, language="EN", datasource="ISD", errors="warn" + ) + # At least one UserWarning about dropped rows + user_warnings = [x for x in w if issubclass(x.category, UserWarning)] + assert any("rows" in str(x.message).lower() for x in user_warnings) + assert len(result) == 4 + + def test_fit_circe_grand_mean_centering(self, isd_with_participant): + """fit_circe uses grand-mean centering: one scalar per participant.""" + from soundscapy.surveys import ipsatize + from soundscapy.surveys.survey_utils import PAQ_IDS + + # Grand-mean centering: for each participant, all PAQ values sum to zero + # (the grand mean across all scales × all rows is zero). + centered = ipsatize( + isd_with_participant, method="grand_mean", participant_col="participant" + ) + # The mean of ALL values per participant should be zero + check = centered[PAQ_IDS].assign( + participant=isd_with_participant["participant"].values + ) + flat_means = check.groupby("participant")[PAQ_IDS].apply( + lambda df: float(df.values.mean()) + ) + np.testing.assert_allclose(flat_means.to_numpy(), 0.0, atol=1e-10) + # --------------------------------------------------------------------------- # Tests for normalize_polar_angles diff --git a/test/surveys/test_survey_processing.py b/test/surveys/test_survey_processing.py index fb0c822..522314d 100644 --- a/test/surveys/test_survey_processing.py +++ b/test/surveys/test_survey_processing.py @@ -264,5 +264,92 @@ def test_end_to_end_workflow(self): assert not ssm_result.isna().any().any() +class TestIpsatize: + """Tests for soundscapy.surveys.ipsatize().""" + + @pytest.fixture + def sample_data(self): + """Small DataFrame with known values for centering checks.""" + return pd.DataFrame( + { + "PAQ1": [50.0, 60.0, 40.0, 30.0], + "PAQ2": [50.0, 60.0, 40.0, 30.0], + "PAQ3": [50.0, 60.0, 40.0, 30.0], + "PAQ4": [50.0, 60.0, 40.0, 30.0], + "PAQ5": [50.0, 60.0, 40.0, 30.0], + "PAQ6": [50.0, 60.0, 40.0, 30.0], + 
"PAQ7": [50.0, 60.0, 40.0, 30.0], + "PAQ8": [50.0, 60.0, 40.0, 30.0], + "participant": ["A", "A", "B", "B"], + } + ) + + def test_grand_mean_one_scalar_per_participant(self, sample_data): + """Grand-mean centering must produce zero grand mean per participant.""" + from soundscapy.surveys import ipsatize + from soundscapy.surveys.survey_utils import PAQ_IDS + + result = ipsatize( + sample_data, method="grand_mean", participant_col="participant" + ) + check = result[PAQ_IDS].assign(participant=sample_data["participant"].values) + flat_means = check.groupby("participant")[PAQ_IDS].apply( + lambda df: float(df.values.mean()) + ) + np.testing.assert_allclose(flat_means.to_numpy(), 0.0, atol=1e-10) + + def test_column_wise_zero_per_scale_per_participant(self, sample_data): + """Column-wise centering must produce zero mean per scale per participant.""" + from soundscapy.surveys import ipsatize + from soundscapy.surveys.survey_utils import PAQ_IDS + + result = ipsatize( + sample_data, method="column_wise", participant_col="participant" + ) + check = result[PAQ_IDS].assign(participant=sample_data["participant"].values) + group_means = check.groupby("participant")[PAQ_IDS].mean() + np.testing.assert_allclose(group_means.to_numpy(), 0.0, atol=1e-10) + + def test_row_wise_zero_per_observation(self, sample_data): + """Row-wise centering must produce zero mean across scales per row.""" + from soundscapy.surveys import ipsatize + from soundscapy.surveys.survey_utils import PAQ_IDS + + result = ipsatize(sample_data, method="row_wise") + row_means = result[PAQ_IDS].mean(axis=1) + np.testing.assert_allclose(row_means.to_numpy(), 0.0, atol=1e-10) + + def test_returns_only_scale_columns(self, sample_data): + """ipsatize must return only the PAQ scale columns, not participant.""" + from soundscapy.surveys import ipsatize + from soundscapy.surveys.survey_utils import PAQ_IDS + + result = ipsatize( + sample_data, method="grand_mean", participant_col="participant" + ) + assert 
set(result.columns) == set(PAQ_IDS) + assert "participant" not in result.columns + + def test_invalid_method_raises(self, sample_data): + """ipsatize must raise ValueError for an unknown method string.""" + from soundscapy.surveys import ipsatize + + with pytest.raises(ValueError, match="method"): + ipsatize(sample_data, method="bad_method") + + def test_grand_mean_differs_from_column_wise(self, sample_data): + """Grand-mean and column-wise results must differ for asymmetric data.""" + from soundscapy.surveys import ipsatize + + # Make data asymmetric: A has different values across scales + asym = sample_data.copy() + asym.loc[asym["participant"] == "A", "PAQ1"] = 80.0 + + gm = ipsatize(asym, method="grand_mean", participant_col="participant") + cw = ipsatize(asym, method="column_wise", participant_col="participant") + # Results should differ + assert not gm.equals(cw) + + if __name__ == "__main__": pytest.main()