Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions .github/workflows/pytest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,6 @@ on:
- 'pyproject.toml'
- 'poetry.lock'
- '.github/workflows/pytest.yml'
push:
branches: [ "main" ]

concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
Expand Down
6 changes: 5 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

![Tests](https://github.com/sassoftware/dpmm/workflows/Test%20Suite/badge.svg)
![Coverage](https://raw.githubusercontent.com/sassoftware/dpmm/main/.github/badges/coverage.svg)
[![arXiv](https://img.shields.io/badge/arXiv-2506.00322-b31b1b.svg)](https://arxiv.org/abs/2506.00322)



## Overview
Expand All @@ -20,7 +22,7 @@ Summary of main features:

__NB: Intended Use -- _dpmm_ is designed for research and exploratory use in privacy-preserving synthetic data generation (particularly in simple scenarios such as preserving high-quality 1/2-way marginals in datasets with up to 32 features<sup>[paper<sub>1</sub>](https://arxiv.org/abs/2112.09238),[paper<sub>2</sub>](https://arxiv.org/abs/2305.10994)</sup>) and is not intended for production use in complex, real-world applications.__



## Installation

Expand Down Expand Up @@ -62,6 +64,8 @@ We provide numerous examples demonstrating the features of __dpmm__ across data
The examples are available across all models and model settings, and are accessible from the repository (if installed locally).


__NB: This package is intended to be used through the pipeline layer, which guarantees that no privacy leakage occurs. It is possible to use the models directly, but in that case providing a domain is required to ensure the privacy guarantees hold.__

### Preprocessing
The provided generative pipelines combine automatic DP discretization preprocessing with a generative model and allow for the following features:

Expand Down
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@

[tool.poetry]
name = "dpmm"
version = "0.1.9"
version = "0.2.0"
description = "dpmm: a library for synthetic tabular data generation with rich functionality and end-to-end Differential Privacy guarantees"
license = "Apache-2.0"

Expand All @@ -13,7 +13,7 @@ keywords = ["machine-learning", "tabular-data", "differential-privacy", "synthet

[project]
name = "dpmm"
version = "0.1.9"
version = "0.2.0"
description = "dpmm: a library for synthetic tabular data generation with rich functionality and end-to-end Differential Privacy guarantees"
authors = [
{name = "Sofiane Mahiou, Georgi Ganev", email = "sofiane.mahiou@sas.com"}
Expand Down
16 changes: 9 additions & 7 deletions src/dpmm/models/aim.py
Original file line number Diff line number Diff line change
Expand Up @@ -311,6 +311,7 @@ class AIM(Mechanism):

def __init__(
self,
domain: Domain,
epsilon=1,
delta=1e-5,
prng: RandomState = None,
Expand All @@ -322,7 +323,6 @@ def __init__(
max_cells=10000,
structural_zeros={},
compress=False,
domain=None,
n_jobs=-1,
):
super().__init__(
Expand Down Expand Up @@ -465,7 +465,9 @@ def _fit(self, data: Dataset, public=False, workload=None): # noqa: C901
if self.num_marginals is not None:
workload = [
workload[i]
for i in self.prng.choice(len(workload), self.num_marginals, replace=False)
for i in self.prng.choice(
len(workload), self.num_marginals, replace=False
)
]

rounds = self.rounds or 16 * len(data.domain)
Expand Down Expand Up @@ -669,17 +671,17 @@ class AIMGM(GraphicalGenerativeModel):

def __init__(
self,
epsilon=1,
delta=1e-5,
rounds=None,
compress=True,
domain: Domain,
epsilon: float = 1,
delta: float = 1e-5,
rounds: int = None,
compress: bool = True,
n_iters: int = 1000,
max_model_size=80,
degree=2,
num_marginals=None,
max_cells=10000,
structural_zeros={},
domain=None,
random_state: RandomState = None,
n_jobs: int = -1,
):
Expand Down
7 changes: 4 additions & 3 deletions src/dpmm/models/base/base.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import json
from numpy.random import RandomState
from pathlib import Path
from typing import Self, Dict
from typing import Self
from dpmm.models.base.mbi import Domain
import pandas as pd


Expand All @@ -10,11 +11,11 @@ class GenerativeModel:

name: str = None

def __init__(self, domain: Dict, random_state: RandomState = None):
def __init__(self, domain: Domain, random_state: RandomState = None):
self.domain = domain
self.random_state = random_state

def set_domain(self, domain: Dict):
def set_domain(self, domain: Domain):
self.domain = domain

def set_random_state(self, random_state: RandomState):
Expand Down
4 changes: 3 additions & 1 deletion src/dpmm/models/base/mbi/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -386,7 +386,9 @@ def _setup(self, measurements, total): # noqa: C901
device = self.Factor.device
y = torch.tensor(y, dtype=torch.float32, device=device)
if isinstance(q_matrix, np.ndarray):
q_matrix = torch.tensor(q_matrix, dtype=torch.float32, device=device)
q_matrix = torch.tensor(
q_matrix, dtype=torch.float32, device=device
)
elif sparse.issparse(q_matrix):
q_matrix = q_matrix.tocoo()
idx = torch.LongTensor([q_matrix.row, q_matrix.col])
Expand Down
17 changes: 4 additions & 13 deletions src/dpmm/models/base/mechanisms/mechanism.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,10 @@ def __init__(
self,
epsilon,
delta,
domain,
prng: RandomState = None,
max_model_size: int = None,
compress=False,
domain=None,
structural_zeros: Dict = None,
n_jobs: int = -1,
):
Expand Down Expand Up @@ -59,7 +59,6 @@ def __init__(
self.compressor = None

self.set_structural_zeros(structural_zeros)

self._domain = domain
self.max_model_size = max_model_size
self.model_size = None
Expand Down Expand Up @@ -108,19 +107,11 @@ def set_domain(self, domain: Dict):
self._domain = domain

def fit(self, df, public=False, marginals_only=False, *args, **kwargs):
# prepare data
if self._domain is None:
_domain = (df.astype(int).max(axis=0) + 1).to_dict()
if not public:
# TODO: Add warning
pass
else:
self._domain = _domain
else:
_domain = self._domain

assert self._domain is not None, "Domain must be provided"

domain = Domain(
list(df.columns), np.array([_domain[col] for col in df.columns])
list(df.columns), np.array([self._domain[col] for col in df.columns])
)

data = Dataset(df, domain)
Expand Down
8 changes: 5 additions & 3 deletions src/dpmm/models/mst.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,11 +92,11 @@ class MST(Mechanism):

def __init__(
self,
domain: Domain,
epsilon: Optional[float] = None,
delta: Optional[float] = None,
n_iters: int = 10000,
compress: bool = True,
domain: Optional[Domain] = None,
prng: Optional[RandomState] = None,
max_model_size: Optional[int] = None,
structural_zeros: Optional[dict] = None,
Expand Down Expand Up @@ -333,12 +333,12 @@ class MSTGM(GraphicalGenerativeModel):

def __init__(
self,
domain: Domain,
epsilon: float = 1,
delta: float = 1e-5,
n_iters: int = 5000,
compress: bool = True,
max_model_size: Optional[int] = None,
domain: Optional[Domain] = None,
random_state: Optional[RandomState] = None,
n_jobs: int = -1,
):
Expand Down Expand Up @@ -401,7 +401,9 @@ def load(cls, path: Path) -> Self:
:rtype: MSTGM
"""
generator = MST.load(path)
obj = cls(epsilon=generator.epsilon, delta=generator.delta)
obj = cls(
epsilon=generator.epsilon, delta=generator.delta, domain=generator._domain
)
del obj.generator
obj.generator = generator

Expand Down
17 changes: 10 additions & 7 deletions src/dpmm/models/priv_bayes.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

from dpmm.models.base.graphical import GraphicalGenerativeModel
from dpmm.models.base.mbi import Dataset, GraphicalModel
from dpmm.models.base.mbi.domain import Domain
from dpmm.models.base.mechanisms import cdp_rho
from dpmm.models.base.memory import clique_size
from dpmm.utils import to_path
Expand Down Expand Up @@ -195,6 +196,7 @@ class PrivBayes(Mechanism):

def __init__(
self,
domain: Domain,
epsilon=1,
delta=None,
degree=2,
Expand All @@ -204,7 +206,6 @@ def __init__(
prng: RandomState = None,
max_model_size: int = None,
compress=False,
domain=None,
structural_zeros: Dict = None,
):
super().__init__(
Expand Down Expand Up @@ -428,13 +429,13 @@ class PrivBayesGM(GraphicalGenerativeModel):

def __init__(
self,
epsilon=1,
delta=1e-5,
degree=2,
domain: Domain,
epsilon: float = 1,
delta: float = 1e-5,
degree: int = 2,
n_iters: int = 5000,
compress=True,
compress: bool = True,
max_model_size: int = None,
domain=None,
random_state: RandomState = None,
n_jobs: int = -1,
):
Expand Down Expand Up @@ -498,7 +499,9 @@ def load(cls, path: Path) -> Self:
:rtype: PrivBayesGM
"""
generator = PrivBayes.load(path)
obj = cls(epsilon=generator.epsilon, delta=generator.delta)
obj = cls(
epsilon=generator.epsilon, delta=generator.delta, domain=generator._domain
)
del obj.generator
obj.generator = generator

Expand Down
4 changes: 4 additions & 0 deletions src/dpmm/pipelines/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,9 @@ def fit(
zeros = self.proc.zeros
t_domain = self.proc.bin_domain
else:
assert (
domain is not None
), "Domain must be provided if no processing is used."
t_domain = domain
t_df = df
zeros = structural_zeros
Expand Down Expand Up @@ -238,6 +241,7 @@ def __init__(
compress=compress,
max_model_size=max_model_size,
n_jobs=n_jobs,
domain=None,
**gen_kwargs
)

Expand Down
39 changes: 32 additions & 7 deletions tests/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,14 +26,14 @@ def sample_dataframe():
@pytest.mark.parametrize("fit_mode", ["pretrain_only", "pretrain_and_fit", "fit_only"])
def test_models(model_class, use_domain, compress, max_model_size, epsilon, condition, serialise, sample_dataframe, with_zeros, fit_mode):
if use_domain:
domain = {
domain = {
col: sample_dataframe[col].max() + 1
for col in sample_dataframe.columns
}
else:
domain = None


# When domain is None, expect an AssertionError
structural_zeros = None
if with_zeros:
zero_col = np.random.choice(sample_dataframe.columns, size=1)[0]
Expand All @@ -50,12 +50,41 @@ def test_models(model_class, use_domain, compress, max_model_size, epsilon, cond
n_iters=10,
)

if model_class.name == "aim":
model_args["rounds"] = 5
elif model_class.name == "priv-bayes":
model_args["degree"] = 1

model = model_class(**model_args)

if with_zeros:
zero_col = np.random.choice(sample_dataframe.columns, size=1)[0]
structural_zeros = {
zero_col: np.random.choice(sample_dataframe[zero_col].max() + 1, replace=False, size=3)
}

random_state = np.random.RandomState(42)
model_args = dict(
epsilon=epsilon,
domain=domain,
compress=compress,
max_model_size=max_model_size,
n_iters=10,
)

if model_class.name == "aim":
model_args["rounds"] = 5
elif model_class.name == "priv-bayes":
model_args["degree"] = 1

model = model_class(**model_args)

if not use_domain:
with pytest.raises(AssertionError, match="Domain must be provided") as e:
model.fit(sample_dataframe)

return

if with_zeros:
model.set_structural_zeros(structural_zeros)

Expand All @@ -69,17 +98,13 @@ def test_models(model_class, use_domain, compress, max_model_size, epsilon, cond
assert model.generator.cliques is not None
assert model.generator.fit_state == "pretrained"


if fit_mode in ["fit_only", "pretrain_and_fit"]:
if fit_mode in ["fit_only", "pretrain_and_fit"]:
model.fit(sample_dataframe)
assert model.generator.fit_state == "trained"


if max_model_size is not None:
assert model.generator.model_size <= max_model_size



if serialise:
with TemporaryDirectory() as tmp_dir:
tmp_path = Path(tmp_dir)
Expand Down
2 changes: 1 addition & 1 deletion tests/test_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ def test_pipeline(model_class, use_domain, compress, with_zeros, condition, max_
pipeline.fit(sample_dataframe, domain=domain, public=True, structural_zeros=structural_zeros)
assert pipeline.gen.generator.cliques is not None

if fit_mode in ["fit_only", "pretrain_and_fit"]:
if fit_mode in ["fit_only", "pretrain_and_fit"]:
pipeline.fit(sample_dataframe, domain=domain, structural_zeros=structural_zeros)

if serialise:
Expand Down