from typing import Callable

import numpy as np
import torch
from abcpy.continuousmodels import Continuous, InputConnector
from abcpy.distances import Euclidean, LogReg, PenLogReg
from abcpy.output import Journal
from abcpy.probabilisticmodels import ProbabilisticModel
from abcpy.statistics import Statistics


class ABCpyPrior(ProbabilisticModel, Continuous):
    """Expose an sbibm task prior as an ABCpy root probabilistic model.

    ABCpy inference schemes draw parameters from this model; it simply
    delegates to the task's own prior sampler.
    """

    def __init__(self, task, name="ABCpy_prior"):
        self.prior_forward = task.get_prior()
        self.dim_parameters = task.dim_parameters
        self.name = task.name if task.name is not None else name

        # A prior has no parent models in the ABCpy dependency graph.
        input_parameters = InputConnector.from_list([])
        super(ABCpyPrior, self).__init__(input_parameters, self.name + "_prior")

    def forward_simulate(
        self, abcpy_input_values, num_forward_simulations, rng=np.random.RandomState()
    ):
        # ABCpy expects a list with one flat (1-D) numpy array per draw.
        result = np.array(self.prior_forward(num_forward_simulations))
        return [np.array([x]).reshape(-1) for x in result]

    def get_output_dimension(self):
        return self.dim_parameters

    def _check_input(self, input_values):
        # Validation is delegated to the task; accept everything here.
        return True

    def _check_output(self, values):
        return True


class ABCpySimulator(ProbabilisticModel, Continuous):
    """Expose an sbibm task simulator as an ABCpy probabilistic model.

    The simulator counts calls internally and raises once `max_calls`
    simulations have been consumed, enforcing the simulation budget.
    """

    def __init__(self, parameters, task, max_calls, name="ABCpy_simulator"):
        self.simulator = task.get_simulator(max_calls=max_calls)
        self.output_dim = task.dim_data
        self.name = task.name if task.name is not None else name

        input_parameters = InputConnector.from_list(parameters)
        super(ABCpySimulator, self).__init__(input_parameters, self.name)

    def forward_simulate(
        self, abcpy_input_values, num_forward_simulations, rng=np.random.RandomState()
    ):
        # The sbibm simulator takes a torch tensor of parameters and returns
        # one simulation; repeat it to get the requested number of draws.
        tensor_param = torch.tensor(abcpy_input_values)
        tensor_res = [
            self.simulator(tensor_param) for _ in range(num_forward_simulations)
        ]
        # ABCpy expects a list with one flat (1-D) numpy array per simulation.
        return [np.array(x).reshape(-1) for x in tensor_res]

    def get_output_dimension(self):
        return self.output_dim

    def _check_input(self, input_values):
        # Validation is delegated to the task; accept everything here.
        return True

    def _check_output(self, values):
        return True


def get_distance(distance: str, statistics: Statistics) -> Callable:
    """Return the ABCpy distance object for the given distance name.

    Args:
        distance: One of "l2", "log_reg", "pen_log_reg".
        statistics: ABCpy statistics object applied before the distance.

    Raises:
        NotImplementedError: For "Wasserstein" or any unknown name.
    """
    if distance == "l2":
        distance_calc = Euclidean(statistics)

    elif distance == "log_reg":
        distance_calc = LogReg(statistics)

    elif distance == "pen_log_reg":
        distance_calc = PenLogReg(statistics)

    elif distance == "Wasserstein":
        raise NotImplementedError(
            "Wasserstein distance not yet implemented as we are considering only one single "
            "simulation for parameter value"
        )

    else:
        raise NotImplementedError(f"Distance '{distance}' not implemented.")

    return distance_calc


def journal_cleanup_rejABC(journal, percentile=None, threshold=None):
    """Filter a Rejection ABC journal, keeping only the best samples.

    Takes a Journal (typically produced by a Rejection ABC run with a very
    large epsilon) and keeps only the samples whose achieved distance is below
    either the given percentile of all distances or an explicit threshold.
    This is a simple way to obtain a percentile-based Rejection ABC.

    Args:
        journal: ABCpy Journal from a Rejection ABC run.
        percentile: Keep samples below this percentile of the distances.
        threshold: Keep samples with distance strictly below this value.
            Exactly one of `percentile` / `threshold` must be given.

    Returns:
        A new Journal containing only the selected samples (uniform weights).

    Raises:
        RuntimeError: If both/neither of `percentile` and `threshold` are
            given, or if the cutoff selects no simulations at all.
    """
    if (threshold is None) == (percentile is None):
        # Fixed message: the second option is `threshold`, not `epsilon`.
        raise RuntimeError(
            "Exactly one of percentile or threshold needs to be specified."
        )

    if percentile is not None:
        distance_cutoff = np.percentile(journal.distances[-1], percentile)
    else:
        distance_cutoff = threshold
    picked_simulations = journal.distances[-1] < distance_cutoff
    new_distances = journal.distances[-1][picked_simulations]
    # Bug fix: the original tested `len(picked_simulations) == 0`, but that is
    # the total number of candidates (never 0). We must check whether *any*
    # simulation was accepted under the cutoff.
    if not np.any(picked_simulations):
        raise RuntimeError(
            "The specified value of threshold is too low, no simulations are selected."
        )
    n_reduced_samples = np.sum(picked_simulations)
    new_journal = Journal(journal._type)
    new_journal.configuration["n_samples"] = n_reduced_samples
    new_journal.configuration["n_samples_per_param"] = journal.configuration[
        "n_samples_per_param"
    ]
    new_journal.configuration["epsilon"] = distance_cutoff

    new_accepted_parameters = []
    param_names = journal.get_parameters().keys()
    new_names_and_parameters = {name: [] for name in param_names}
    # `np.where` already restricts to accepted indices, so the original inner
    # `if picked_simulations[i]` check was always true and has been dropped.
    for i in np.where(picked_simulations)[0]:
        new_accepted_parameters.append(journal.get_accepted_parameters()[i])
        for name in param_names:
            new_names_and_parameters[name].append(journal.get_parameters()[name][i])

    new_journal.add_accepted_parameters(new_accepted_parameters)
    new_journal.add_weights(np.ones((n_reduced_samples, 1)))
    new_journal.add_ESS_estimate(np.ones((n_reduced_samples, 1)))
    new_journal.add_distances(new_distances)
    new_journal.add_user_parameters(new_names_and_parameters)
    new_journal.number_of_simulations.append(journal.number_of_simulations[-1])

    return new_journal
def run(
    task: Task,
    num_samples: int,
    num_simulations: int,
    num_simulations_per_param: Optional[int] = 1,
    num_observation: Optional[int] = None,
    observation: Optional[torch.Tensor] = None,
    num_top_samples: Optional[int] = 100,
    quantile: Optional[float] = None,
    eps: Optional[float] = None,
    distance: str = "l2",
    save_distances: bool = False,
    sass: bool = False,
    sass_fraction: float = 0.5,
    sass_feature_expansion_degree: int = 3,
    kde_bandwidth: Optional[str] = None,
) -> Tuple[torch.Tensor, int, Optional[torch.Tensor]]:
    """Runs REJ-ABC from `ABCpy`

    ABCpy does not implement LRA post processing. SASS is supported in ABCpy
    but not yet implemented here.

    Choose one of `num_top_samples`, `quantile`, `eps`.

    Args:
        task: Task instance
        num_samples: Number of samples to generate from posterior
        num_simulations: Simulation budget
        num_simulations_per_param: The number of the total simulations budget to
            produce for each parameter value; used to better estimate the ABC
            distance from the observation for each parameter value. Useful to
            investigate whether splitting the budget to do multi-simulations per
            parameter is helpful or not.
        num_observation: Observation number to load, alternative to `observation`
        observation: Observation, alternative to `num_observation`
        num_top_samples: If given, will use `top=True` with num_top_samples
        quantile: Quantile to use
        eps: Epsilon threshold to use
        distance: Distance to use; can be "l2" (Euclidean Distance),
            "log_reg" (LogReg) or "pen_log_reg" (PenLogReg)
        save_distances: If True, stores distances of samples to disk
        sass: If True, summary statistics are learned as in
            Fearnhead & Prangle 2012. Not yet implemented, left for
            compatibility with other algorithms.
        sass_fraction: Fraction of simulation budget to use for sass. Unused for
            now, left for compatibility with other algorithms.
        sass_feature_expansion_degree: Degree of polynomial expansion of the
            summary statistics. Unused for now, only left for compatibility with
            other algorithms.
        kde_bandwidth: If not None, will resample using KDE when necessary, set
            e.g. to "cv" for cross-validated bandwidth selection

    Returns:
        Samples from posterior, number of simulator calls, log probability of
        true params if computable
    """
    if sass:
        raise NotImplementedError("SASS not yet implemented")

    assert not (num_observation is None and observation is None)
    assert not (num_observation is not None and observation is not None)

    if (eps is not None) + (num_top_samples is not None) + (quantile is not None) != 1:
        raise RuntimeError(
            "Exactly one of `num_top_samples`, `quantile`, `eps` needs to be specified."
        )

    log = sbibm.get_logger(__name__)
    log.info("Running REJ-ABC")  # plain string: no placeholders needed

    if observation is None:
        observation = task.get_observation(num_observation)

    # Translate `num_top_samples` into a quantile of the simulation budget.
    # NOTE: the original also had a `sass` branch here, but `sass=True` raises
    # NotImplementedError above, so that branch was unreachable and is removed.
    if num_top_samples is not None:
        quantile = num_top_samples / num_simulations

    # use the dummy (sequential) backend only (for now)
    backend = BackendDummy()

    # Rejection ABC works by directly drawing until the distance falls below
    # some threshold -> here we set the threshold to a very large value and
    # select the best samples afterwards via `journal_cleanup_rejABC`.
    statistics = Identity(
        degree=1, cross=True
    )  # we assume statistics are already computed inside the task
    distance_calc = get_distance(distance, statistics)

    # define prior and model from the task simulator:
    parameters = ABCpyPrior(task)
    model = ABCpySimulator([parameters], task, max_calls=num_simulations)

    # inference
    sampler = RejectionABC(
        root_models=[model], distances=[distance_calc], backend=backend
    )
    journal_standard_ABC = sampler.sample(
        [[np.array(observation)]],
        n_samples=num_simulations // num_simulations_per_param,
        n_samples_per_param=num_simulations_per_param,
        epsilon=10 ** 50,
    )  # set epsilon to very large number so every draw is accepted
    if eps is None:
        # use the quantile to select posterior samples from the generated ones
        journal_standard_ABC_reduced = journal_cleanup_rejABC(
            journal_standard_ABC, percentile=quantile * 100
        )
    else:
        journal_standard_ABC_reduced = journal_cleanup_rejABC(
            journal_standard_ABC, threshold=eps
        )

    actual_n_simulations = (
        journal_standard_ABC.number_of_simulations[-1]
        * journal_standard_ABC.configuration["n_samples_per_param"]
    )
    # this takes into account the fact that num_simulations can be not
    # divisible by num_simulations_per_param
    expected_n_simulations = (
        num_simulations // num_simulations_per_param
    ) * num_simulations_per_param
    assert actual_n_simulations == expected_n_simulations

    if save_distances:
        distances = torch.tensor(journal_standard_ABC_reduced.distances[-1])
        save_tensor_to_csv("distances.csv", distances)

    samples = torch.tensor(
        journal_standard_ABC_reduced.get_accepted_parameters()
    ).squeeze()

    if kde_bandwidth is not None:
        # fit KDE on the accepted samples to obtain `num_samples` draws
        log.info(
            f"KDE on {samples.shape[0]} samples with bandwidth option {kde_bandwidth}"
        )
        kde = get_kde(
            samples,
            bandwidth=kde_bandwidth,
        )
        samples = kde.sample(num_samples)
    else:
        # otherwise sample with replacement until you get the num_samples you want
        log.info(f"Sampling {num_samples} samples from trace")
        samples = sample(samples, num_samples=num_samples, replace=True)

    # The latter is None as we cannot compute posterior probabilities with
    # ABCpy (it would be the posterior probability of the true parameter in
    # case that was available).
    return samples, actual_n_simulations, None


# --- tests/algorithms/test_abcpy.py -----------------------------------------


@pytest.mark.parametrize(
    "task_name",
    [(task_name) for task_name in ["gaussian_linear_uniform"]],
)
def test_abcpy_rejection(
    task_name,
    num_observation=1,
    num_samples=10_000,
    num_simulations=1_000,
):
    """Smoke-test REJ-ABC with quantile, num_top_samples and eps selection."""
    # set random seeds for reproducibility:
    torch.manual_seed(1)
    np.random.seed(1)

    task = sbibm.get_task(task_name)

    # quantile-based selection
    posterior_samples, actual_num_simulations, _ = rej_abc(
        task=task,
        num_samples=num_samples,
        num_observation=num_observation,
        num_simulations=num_simulations,
        quantile=0.1,
        num_top_samples=None,
        kde_bandwidth="cv",
    )
    assert actual_num_simulations == num_simulations
    assert posterior_samples.shape[0] == num_samples

    # top-samples-based selection
    posterior_samples, actual_num_simulations, _ = rej_abc(
        task=task,
        num_samples=num_samples,
        num_observation=num_observation,
        num_simulations=num_simulations,
        num_top_samples=500,
        kde_bandwidth="cv",
    )
    assert actual_num_simulations == num_simulations
    assert posterior_samples.shape[0] == num_samples

    # explicit epsilon threshold + multiple simulations per parameter
    posterior_samples, actual_num_simulations, _ = rej_abc(
        task=task,
        num_samples=num_samples,
        num_observation=num_observation,
        num_simulations_per_param=2,
        num_simulations=num_simulations,
        eps=6,
        num_top_samples=None,
        kde_bandwidth="cv",
    )
    assert actual_num_simulations == num_simulations
    assert posterior_samples.shape[0] == num_samples