From 8f7b5757b9d0705e2345619a6c6d5cdfb1bf5478 Mon Sep 17 00:00:00 2001 From: sofiane87 Date: Thu, 25 Dec 2025 17:26:51 +0000 Subject: [PATCH 1/2] Fix domain privacy leak Signed-off-by: sofiane87 Add Sphinx documentation and Read the Docs configuration Ensure domain is always required Add comment in README --- .github/workflows/pytest.yml | 2 - README.md | 6 ++- pyproject.toml | 4 +- src/dpmm/models/aim.py | 16 ++++---- src/dpmm/models/base/base.py | 7 ++-- src/dpmm/models/base/mbi/inference.py | 4 +- src/dpmm/models/base/mechanisms/mechanism.py | 17 ++------- src/dpmm/models/mst.py | 8 ++-- src/dpmm/models/priv_bayes.py | 17 +++++---- src/dpmm/pipelines/base.py | 3 ++ tests/test_models.py | 39 ++++++++++++++++---- tests/test_pipeline.py | 2 +- 12 files changed, 78 insertions(+), 47 deletions(-) diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index 6a55ddd..03455a5 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -11,8 +11,6 @@ on: - 'pyproject.toml' - 'poetry.lock' - '.github/workflows/pytest.yml' - push: - branches: [ "main" ] concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} diff --git a/README.md b/README.md index 2180c07..fa60405 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,8 @@ ![Tests](https://github.com/sassoftware/dpmm/workflows/Test%20Suite/badge.svg) ![Coverage](https://raw.githubusercontent.com/sassoftware/dpmm/main/.github/badges/coverage.svg) +[![arXiv](https://img.shields.io/badge/arXiv-2506.00322-b31b1b.svg)](https://arxiv.org/abs/2506.00322) + ## Overview @@ -20,7 +22,7 @@ Summary of main features: __NB: Intended Use -- _dpmm_ is designed for research and exploratory use in privacy-preserving synthetic data generation (particularly in simple scenarios such as preserving high-quality 1/2-way marginals in datasets with up to 32 features[paper1](https://arxiv.org/abs/2112.09238),[paper2](https://arxiv.org/abs/2305.10994)) and is not intended for 
production use in complex, real-world applications.__ - + ## Installation @@ -62,6 +64,8 @@ We provide numerous examples demonstrating the features of __dpmm__ across data The examples are available across all models and model settings, and are accessible from the repository (if installed locally). +__NB: the general intent of this package is to be used through the pipeline layer to guarantee that no privacy leakage is occurring. It is possible to use the models directly, but in that instance providing a domain is a requirement to ensure privacy guarantees.__ + ### Preprocessing The provided generative pipelines combine automatic DP descritization preprocessing with a generative model and allows for the following features: diff --git a/pyproject.toml b/pyproject.toml index 59e597b..5ef702e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [tool.poetry] name = "dpmm" -version = "0.1.9" +version = "0.2.0" description = "dpmm: a library for synthetic tabular data generation with rich functionality and end-to-end Differential Privacy guarantees" license = "Apache-2.0" @@ -13,7 +13,7 @@ keywords = ["machine-learning", "tabular-data", "differential-privacy", "synthet [project] name = "dpmm" -version = "0.1.9" +version = "0.2.0" description = "dpmm: a library for synthetic tabular data generation with rich functionality and end-to-end Differential Privacy guarantees" authors = [ {name = "Sofiane Mahiou, Georgi Ganev", email = "sofiane.mahiou@sas.com"} diff --git a/src/dpmm/models/aim.py b/src/dpmm/models/aim.py index de4b98a..d2bccbf 100644 --- a/src/dpmm/models/aim.py +++ b/src/dpmm/models/aim.py @@ -311,6 +311,7 @@ class AIM(Mechanism): def __init__( self, + domain: Domain, epsilon=1, delta=1e-5, prng: RandomState = None, @@ -322,7 +323,6 @@ def __init__( max_cells=10000, structural_zeros={}, compress=False, - domain=None, n_jobs=-1, ): super().__init__( @@ -465,7 +465,9 @@ def _fit(self, data: Dataset, public=False, workload=None): # noqa: C901 if 
self.num_marginals is not None: workload = [ workload[i] - for i in self.prng.choice(len(workload), self.num_marginals, replace=False) + for i in self.prng.choice( + len(workload), self.num_marginals, replace=False + ) ] rounds = self.rounds or 16 * len(data.domain) @@ -669,17 +671,17 @@ class AIMGM(GraphicalGenerativeModel): def __init__( self, - epsilon=1, - delta=1e-5, - rounds=None, - compress=True, + domain: Domain, + epsilon: float = 1, + delta: float = 1e-5, + rounds: int = None, + compress: bool = True, n_iters: int = 1000, max_model_size=80, degree=2, num_marginals=None, max_cells=10000, structural_zeros={}, - domain=None, random_state: RandomState = None, n_jobs: int = -1, ): diff --git a/src/dpmm/models/base/base.py b/src/dpmm/models/base/base.py index f174476..055efb7 100644 --- a/src/dpmm/models/base/base.py +++ b/src/dpmm/models/base/base.py @@ -1,7 +1,8 @@ import json from numpy.random import RandomState from pathlib import Path -from typing import Self, Dict +from typing import Self +from dpmm.models.base.mbi import Domain import pandas as pd @@ -10,11 +11,11 @@ class GenerativeModel: name: str = None - def __init__(self, domain: Dict, random_state: RandomState = None): + def __init__(self, domain: Domain, random_state: RandomState = None): self.domain = domain self.random_state = random_state - def set_domain(self, domain: Dict): + def set_domain(self, domain: Domain): self.domain = domain def set_random_state(self, random_state: RandomState): diff --git a/src/dpmm/models/base/mbi/inference.py b/src/dpmm/models/base/mbi/inference.py index c6010ce..0b1bb5c 100644 --- a/src/dpmm/models/base/mbi/inference.py +++ b/src/dpmm/models/base/mbi/inference.py @@ -386,7 +386,9 @@ def _setup(self, measurements, total): # noqa: C901 device = self.Factor.device y = torch.tensor(y, dtype=torch.float32, device=device) if isinstance(q_matrix, np.ndarray): - q_matrix = torch.tensor(q_matrix, dtype=torch.float32, device=device) + q_matrix = torch.tensor( + q_matrix, 
dtype=torch.float32, device=device + ) elif sparse.issparse(q_matrix): q_matrix = q_matrix.tocoo() idx = torch.LongTensor([q_matrix.row, q_matrix.col]) diff --git a/src/dpmm/models/base/mechanisms/mechanism.py b/src/dpmm/models/base/mechanisms/mechanism.py index 48eb27b..cee4a40 100644 --- a/src/dpmm/models/base/mechanisms/mechanism.py +++ b/src/dpmm/models/base/mechanisms/mechanism.py @@ -20,10 +20,10 @@ def __init__( self, epsilon, delta, + domain, prng: RandomState = None, max_model_size: int = None, compress=False, - domain=None, structural_zeros: Dict = None, n_jobs: int = -1, ): @@ -59,7 +59,6 @@ def __init__( self.compressor = None self.set_structural_zeros(structural_zeros) - self._domain = domain self.max_model_size = max_model_size self.model_size = None @@ -108,19 +107,11 @@ def set_domain(self, domain: Dict): self._domain = domain def fit(self, df, public=False, marginals_only=False, *args, **kwargs): - # prepare data - if self._domain is None: - _domain = (df.astype(int).max(axis=0) + 1).to_dict() - if not public: - # TODO: Add warning - pass - else: - self._domain = _domain - else: - _domain = self._domain + + assert self._domain is not None, "Domain must be provided" domain = Domain( - list(df.columns), np.array([_domain[col] for col in df.columns]) + list(df.columns), np.array([self._domain[col] for col in df.columns]) ) data = Dataset(df, domain) diff --git a/src/dpmm/models/mst.py b/src/dpmm/models/mst.py index 434f70a..fefddb2 100644 --- a/src/dpmm/models/mst.py +++ b/src/dpmm/models/mst.py @@ -92,11 +92,11 @@ class MST(Mechanism): def __init__( self, + domain: Domain, epsilon: Optional[float] = None, delta: Optional[float] = None, n_iters: int = 10000, compress: bool = True, - domain: Optional[Domain] = None, prng: Optional[RandomState] = None, max_model_size: Optional[int] = None, structural_zeros: Optional[dict] = None, @@ -333,12 +333,12 @@ class MSTGM(GraphicalGenerativeModel): def __init__( self, + domain: Domain, epsilon: float = 1, delta: 
float = 1e-5, n_iters: int = 5000, compress: bool = True, max_model_size: Optional[int] = None, - domain: Optional[Domain] = None, random_state: Optional[RandomState] = None, n_jobs: int = -1, ): @@ -401,7 +401,9 @@ def load(cls, path: Path) -> Self: :rtype: MSTGM """ generator = MST.load(path) - obj = cls(epsilon=generator.epsilon, delta=generator.delta) + obj = cls( + epsilon=generator.epsilon, delta=generator.delta, domain=generator._domain + ) del obj.generator obj.generator = generator diff --git a/src/dpmm/models/priv_bayes.py b/src/dpmm/models/priv_bayes.py index 8699e3f..2ab7d96 100644 --- a/src/dpmm/models/priv_bayes.py +++ b/src/dpmm/models/priv_bayes.py @@ -18,6 +18,7 @@ from dpmm.models.base.graphical import GraphicalGenerativeModel from dpmm.models.base.mbi import Dataset, GraphicalModel +from dpmm.models.base.mbi.domain import Domain from dpmm.models.base.mechanisms import cdp_rho from dpmm.models.base.memory import clique_size from dpmm.utils import to_path @@ -195,6 +196,7 @@ class PrivBayes(Mechanism): def __init__( self, + domain: Domain, epsilon=1, delta=None, degree=2, @@ -204,7 +206,6 @@ def __init__( prng: RandomState = None, max_model_size: int = None, compress=False, - domain=None, structural_zeros: Dict = None, ): super().__init__( @@ -428,13 +429,13 @@ class PrivBayesGM(GraphicalGenerativeModel): def __init__( self, - epsilon=1, - delta=1e-5, - degree=2, + domain: Domain, + epsilon: float = 1, + delta: float = 1e-5, + degree: int = 2, n_iters: int = 5000, - compress=True, + compress: bool = True, max_model_size: int = None, - domain=None, random_state: RandomState = None, n_jobs: int = -1, ): @@ -498,7 +499,9 @@ def load(cls, path: Path) -> Self: :rtype: PrivBayesGM """ generator = PrivBayes.load(path) - obj = cls(epsilon=generator.epsilon, delta=generator.delta) + obj = cls( + epsilon=generator.epsilon, delta=generator.delta, domain=generator._domain + ) del obj.generator obj.generator = generator diff --git a/src/dpmm/pipelines/base.py 
b/src/dpmm/pipelines/base.py index ef77576..b2413cf 100644 --- a/src/dpmm/pipelines/base.py +++ b/src/dpmm/pipelines/base.py @@ -86,6 +86,9 @@ def fit( zeros = self.proc.zeros t_domain = self.proc.bin_domain else: + assert ( + domain is not None + ), "Domain must be provided if no processing is used." t_domain = domain t_df = df zeros = structural_zeros diff --git a/tests/test_models.py b/tests/test_models.py index 78a6a37..ba5103d 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -26,14 +26,14 @@ def sample_dataframe(): @pytest.mark.parametrize("fit_mode", ["pretrain_only", "pretrain_and_fit", "fit_only"]) def test_models(model_class, use_domain, compress, max_model_size, epsilon, condition, serialise, sample_dataframe, with_zeros, fit_mode): if use_domain: - domain = { + domain = { col: sample_dataframe[col].max() + 1 for col in sample_dataframe.columns } else: domain = None - + # When domain is None, expect an AssertionError structural_zeros = None if with_zeros: zero_col = np.random.choice(sample_dataframe.columns, size=1)[0] @@ -50,12 +50,41 @@ def test_models(model_class, use_domain, compress, max_model_size, epsilon, cond n_iters=10, ) + if model_class.name == "aim": + model_args["rounds"] = 5 + elif model_class.name == "priv-bayes": + model_args["degree"] = 1 + + model = model_class(**model_args) + + if with_zeros: + zero_col = np.random.choice(sample_dataframe.columns, size=1)[0] + structural_zeros = { + zero_col: np.random.choice(sample_dataframe[zero_col].max() + 1, replace=False, size=3) + } + + random_state = np.random.RandomState(42) + model_args = dict( + epsilon=epsilon, + domain=domain, + compress=compress, + max_model_size=max_model_size, + n_iters=10, + ) + if model_class.name == "aim": model_args["rounds"] = 5 elif model_class.name == "priv-bayes": model_args["degree"] = 1 model = model_class(**model_args) + + if not use_domain: + with pytest.raises(AssertionError, match="Domain must be provided") as e: + 
model.fit(sample_dataframe) + + return + if with_zeros: model.set_structural_zeros(structural_zeros) @@ -69,17 +98,13 @@ def test_models(model_class, use_domain, compress, max_model_size, epsilon, cond assert model.generator.cliques is not None assert model.generator.fit_state == "pretrained" - - if fit_mode in ["fit_only", "pretrain_and_fit"]: + if fit_mode in ["fit_only", "pretrain_and_fit"]: model.fit(sample_dataframe) assert model.generator.fit_state == "trained" - if max_model_size is not None: assert model.generator.model_size <= max_model_size - - if serialise: with TemporaryDirectory() as tmp_dir: tmp_path = Path(tmp_dir) diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index 45e3a59..11fb418 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -80,7 +80,7 @@ def test_pipeline(model_class, use_domain, compress, with_zeros, condition, max_ pipeline.fit(sample_dataframe, domain=domain, public=True, structural_zeros=structural_zeros) assert pipeline.gen.generator.cliques is not None - if fit_mode in ["fit_only", "pretrain_and_fit"]: + if fit_mode in ["fit_only", "pretrain_and_fit"]: pipeline.fit(sample_dataframe, domain=domain, structural_zeros=structural_zeros) if serialise: From 68e7b03f810113d77e7ac6e2c96149673a787669 Mon Sep 17 00:00:00 2001 From: sofiane87 Date: Mon, 12 Jan 2026 11:16:45 +0000 Subject: [PATCH 2/2] Add missing parameter Signed-off-by: sofiane87 --- src/dpmm/pipelines/base.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/dpmm/pipelines/base.py b/src/dpmm/pipelines/base.py index b2413cf..b64fcae 100644 --- a/src/dpmm/pipelines/base.py +++ b/src/dpmm/pipelines/base.py @@ -241,6 +241,7 @@ def __init__( compress=compress, max_model_size=max_model_size, n_jobs=n_jobs, + domain=None, **gen_kwargs )