diff --git a/rcc_dp/README.md b/rcc_dp/README.md index 864c6836..9d5ffc3b 100644 --- a/rcc_dp/README.md +++ b/rcc_dp/README.md @@ -1,3 +1,154 @@ -# RCC/DP project +# Source code for "Optimal Compression of Locally Differentially Private Mechanisms" -This is work in progress. Links to relevant publications will be added later. +Reference: Abhin Shah, Wei-Ning Chen, Johannes Balle, Peter Kairouz, Lucas Theis, +"Optimal Compression of Locally Differentially Private Mechanisms," +The 25th International Conference on Artificial Intelligence and Statistics (AISTATS), 2022 + +Contact: abhin@mit.edu, jballe@google.com + +Arxiv: [https://arxiv.org/pdf/2111.00092.pdf](https://arxiv.org/pdf/2111.00092.pdf) + +### Dependencies: + +In order to successfully execute the code, the following libraries must be installed: + +Python --- json, math, time, matplotlib, numpy, scipy, [absl](https://github.com/abseil/abseil-py), [ml_collections](https://github.com/google/ml_collections) + +### Wrapper functions: + +This repository contains the code for (a) mean estimation and (b) frequency estimation. To run the code, a wrapper function needs to be written. For example, to run the mean estimation code, the following could be used: + +``` +from mean_estimation import config as defaults +from mean_estimation import experiment +from mean_estimation import experiment_coding_cost + +def main(): + config = defaults.get_config() + experiment.evaluate('path-to-the-mean-estimation-code', config) + +if __name__ == "__main__": + main() +``` + +Similarly, to run the frequency estimation code, the following could be used: + +``` +from frequency_estimation import config as defaults +from frequency_estimation import experiment +from frequency_estimation import experiment_coding_cost + +def main(): + config = defaults.get_config() + experiment.evaluate('path-to-the-frequency-estimation-code', config) + +if __name__ == "__main__": + main() +``` + +### Reproducing the figures + +1. 
To reproduce Figure 1(Top), make the following changes in mean_estimation/config.py: +``` +num_itr = 10 +vary = "cc" +``` +and add the following commands in the main function of the wrapper: +``` +config = defaults.get_config() +experiment_coding_cost.evaluate('path-to-the-mean-estimation-code', config) +``` +2. To reproduce Figure 1(Bottom), make the following changes in mean_estimation/config.py: +``` +num_itr = 10 +vary = "eps" +``` +and add the following commands in the main function of the wrapper: +``` +config = defaults.get_config() +experiment.evaluate('path-to-the-mean-estimation-code', config) +``` +3. To reproduce Figure 2(Top), make the following changes in frequency_estimation/config.py: +``` +num_itr = 10 +vary = "cc" +``` +and add the following commands in the main function of the wrapper: +``` +config = defaults.get_config() +experiment_coding_cost.evaluate('path-to-the-frequency-estimation-code', config) +``` +4. To reproduce Figure 2(Bottom), make the following changes in frequency_estimation/config.py: +``` +num_itr = 10 +vary = "eps" +``` +and add the following commands in the main function of the wrapper: +``` +config = defaults.get_config() +experiment.evaluate('path-to-the-frequency-estimation-code', config) +``` +5. To reproduce Figure 3, make the following changes in mean_estimation/config.py: +``` +run_approx_miracle=True, +run_modified_miracle=False, +num_itr = 10 +vary = "eps" +``` +and add the following commands in the main function of the wrapper: +``` +config = defaults.get_config() +experiment.evaluate('path-to-the-mean-estimation-code', config) +``` +6. To reproduce Figure 4(Left), make the following changes in mean_estimation/config.py: +``` +num_itr = 10 +vary = "d" +``` +and add the following commands in the main function of the wrapper: +``` +config = defaults.get_config() +experiment.evaluate('path-to-the-mean-estimation-code', config) +``` +7. 
To reproduce Figure 4(Right), make the following changes in mean_estimation/config.py: +``` +num_itr = 10 +vary = "n" +``` +and add the following commands in the main function of the wrapper: +``` +config = defaults.get_config() +experiment.evaluate('path-to-the-mean-estimation-code', config) +``` +8. To reproduce Figure 5, make the following changes in frequency_estimation/config.py: +``` +run_approx_miracle=True, +run_modified_miracle=False, +num_itr = 10 +vary = "eps" +``` +and add the following commands in the main function of the wrapper: +``` +config = defaults.get_config() +experiment.evaluate('path-to-the-frequency-estimation-code', config) +``` +9. To reproduce Figure 6(Left), make the following changes in frequency_estimation/config.py: +``` +num_itr = 10 +vary = "d" +``` +and add the following commands in the main function of the wrapper: +``` +config = defaults.get_config() +experiment.evaluate('path-to-the-frequency-estimation-code', config) +``` +10. To reproduce Figure 6(Right), make the following changes in frequency_estimation/config.py: +``` +num_itr = 10 +vary = "n" +``` +and add the following commands in the main function of the wrapper: +``` +config = defaults.get_config() +experiment.evaluate('path-to-the-frequency-estimation-code', config) +``` \ No newline at end of file diff --git a/rcc_dp/frequency_estimation/config.py b/rcc_dp/frequency_estimation/config.py new file mode 100644 index 00000000..47e8fef4 --- /dev/null +++ b/rcc_dp/frequency_estimation/config.py @@ -0,0 +1,58 @@ +# Copyright 2021, Google LLC. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Base configuration.""" + +from ml_collections.config_dict import config_dict + + +def get_config(): + """Returns config dictionary for model.""" + config = dict( + name="defaults", + # Either use geometric, zipf, or uniform i.e., data variable + # can take one of "geometric", "zipf", "uniform". + distribution="zipf", + lbd_geometric=0.8, + degree_zipf=1.0, + # Flags to indicate which methods to compare. + run_approx_miracle=False, + run_miracle=False, + run_modified_miracle=True, + run_ss=True, + run_rhr=True, + encoding_type="fast", # Can take either fast or normal + # Common parameters. + num_itr=1, + coding_cost=14, + coding_cost_multiplier=1, + approx_coding_cost_multiplier=3, + approx_t=6, + # Specific parameters (leave them as they are for now). + delta=10**(-6), + alpha=1.0, + # Variation. + vary="eps", # Can take one of "cc", "k", "n", "eps". + cc_space=[6, 8, 10, 12, 14], + k_space=[200, 400, 600, 800, 1000], + n_space=[2000, 4000, 6000, 8000, 10000], + eps_space=list(range(1, 9)), + # Defaults. + n=5000, + k=500, + t=3, + epsilon_target=6, + ) + config = config_dict.ConfigDict(config) + config.lock() # Prevent addition of new fields. + return config diff --git a/rcc_dp/frequency_estimation/experiment.py b/rcc_dp/frequency_estimation/experiment.py new file mode 100644 index 00000000..c1e3ac60 --- /dev/null +++ b/rcc_dp/frequency_estimation/experiment.py @@ -0,0 +1,305 @@ +# Copyright 2021, Google LLC. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Experiment definitions (i.e., evaluation of miracle, rhr, subset selection +methods when either the data dimension k, the number of users n, or the +privacy parameter epsilon is varied).""" + +import json +import math +import time +import matplotlib +import matplotlib.pyplot as plt +import matplotlib.lines as mlines +import numpy as np +from rcc_dp.frequency_estimation import miracle +from rcc_dp.frequency_estimation import modify_pi +from rcc_dp.frequency_estimation import rhr +from rcc_dp.frequency_estimation import ss +from rcc_dp.frequency_estimation import unbias +matplotlib.rcParams['ps.useafm'] = True +matplotlib.rcParams['pdf.use14corefonts'] = True +matplotlib.rcParams['text.usetex'] = True + + +def generate_geometric_distribution(k,lbd): + """Generate the discrete geometric distribution.""" + elements = range(0,k) + prob = [(1-lbd)*math.pow(lbd,x)/(1-math.pow(lbd,k)) for x in elements] + return prob + + +def generate_uniform_distribution(k): + """Generate the discrete uniform distribution.""" + raw_distribution = [1] * k + sum_raw = sum(raw_distribution) + prob = [float(y)/float(sum_raw) for y in raw_distribution] + return prob + + +def generate_zipf_distribution(k,degree): + """Generate the discrete zipf distribution.""" + raw_distribution = [1/(float(i)**(degree)) for i in range(1,k+1)] + sum_raw = sum(raw_distribution) + prob = [float(y)/float(sum_raw) for y in raw_distribution] + return prob + + +def evaluate(work_path, config, file_open=open): + """Evaluates miracle, rhr, ss methods.""" + with file_open(work_path + "/config.json", "w") as 
f: + json.dump(config.to_dict(), f) + + start_time = time.time() + + delta = config.delta + alpha = config.alpha + # Get default values. + k = config.k + n = config.n + epsilon_target = config.epsilon_target + + if config.vary == "k": + vary_space = config.k_space + print("k space = " + str(vary_space)) + elif config.vary == "n": + vary_space = config.n_space + print("n space = " + str(vary_space)) + elif config.vary == "eps": + vary_space = config.eps_space + print("eps space = " + str(vary_space)) + else: + raise ValueError("vary should be either be k, n or eps.") + + approx_miracle_error = np.zeros((config.num_itr, len(vary_space))) + miracle_error = np.zeros((config.num_itr, len(vary_space))) + modified_miracle_error = np.zeros((config.num_itr, len(vary_space))) + rhr_error = np.zeros((config.num_itr, len(vary_space))) + ss_error = np.zeros((config.num_itr, len(vary_space))) + + for step, vary_parameter in enumerate(vary_space): + if config.vary == "k": + k = vary_parameter + coding_cost = config.coding_cost + elif config.vary == "n": + n = vary_parameter + coding_cost = config.coding_cost + elif config.vary == "eps": + epsilon_target = vary_parameter + coding_cost = config.coding_cost_multiplier * epsilon_target / np.log(2) + coding_cost += config.t + coding_cost = max(int(np.ceil(coding_cost)), 8) + print("epsilon target = " + str(epsilon_target)) + print("n = " + str(n)) + print("k = %d" % k) + print("coding cost = %d" % coding_cost) + + if config.run_modified_miracle: + eta = epsilon_target / 2.0 + print("eta = " + str(eta)) + print("alpha = " + str(alpha)) + + for itr in range(config.num_itr): + print("itr = %d" % itr) + if config.distribution == "geometric": + lbd = config.lbd_geometric + prob = generate_geometric_distribution(k,lbd) + elif config.distribution == "zipf": + degree = config.degree_zipf + prob = generate_zipf_distribution(k,degree) + elif config.distribution == "uniform": + prob = generate_uniform_distribution(k) + else: + raise ValueError( 
+ "distribution should be either be geometric, zipf, uniform.") + x = np.random.choice(k, n, p=prob) + + if config.run_miracle: + x_miracle = np.zeros((k,n)) + for i in range(n): + if config.encoding_type == "fast": + x_miracle[:,i] = miracle.encode_decode_miracle_fast(i+itr*n, x[i], + k, epsilon_target/2, 2**coding_cost) + else: + _, _, index = miracle.encoder(i+itr*n, x[i], k, epsilon_target/2, + 2**coding_cost) + x_miracle[:,i] = miracle.decoder(i+itr*n, index, k, epsilon_target/2, + 2**coding_cost) + prob_miracle = unbias.unbias_miracle(k, epsilon_target/2, 2**coding_cost, + x_miracle.T, n, normalization = 1) + miracle_error[itr, step] = np.linalg.norm([p_i - phat_i for p_i, phat_i + in zip(prob, prob_miracle)], ord=1) + + if config.run_modified_miracle: + x_modified_miracle = np.zeros((k, n)) + for i in range(n): + if config.encoding_type == "fast": + x_modified_miracle[:,i] = miracle.encode_decode_modified_miracle_fast( + i+itr*n, x[i], k, alpha*epsilon_target, 2**coding_cost) + else: + _, pi, _ = miracle.encoder(i+itr*n, x[i], k, alpha*epsilon_target, + 2**coding_cost) + expected_beta = np.ceil(k/(np.exp(epsilon_target)+1))/k + pi_all = modify_pi.modify_pi(pi, eta, epsilon_target, + (np.exp(epsilon_target/2))/(1+expected_beta*(np.exp(epsilon_target)-1))) + index = np.random.choice(2**coding_cost, 1, p=pi_all[-1])[0] + x_modified_miracle[:,i] = miracle.decoder(i+itr*n, index, k, + alpha*epsilon_target, 2**coding_cost) + prob_modified_miracle = unbias.unbias_modified_miracle(k, alpha*epsilon_target, + 2**coding_cost, x_modified_miracle.T, n, normalization = 1) + modified_miracle_error[itr, step] = np.linalg.norm([p_i - phat_i for p_i, phat_i + in zip(prob, prob_modified_miracle)], ord=1) + + if config.run_approx_miracle: + x_approx_miracle = np.zeros((k,n)) + approx_coding_cost = int(np.ceil( + config.approx_coding_cost_multiplier*epsilon_target/np.log(2) + config.approx_t)) + epsilon_approx = miracle.get_approx_epsilon(epsilon_target, k, + 
2**approx_coding_cost, delta) + for i in range(n): + if config.encoding_type == "fast": + x_approx_miracle[:,i] = miracle.encode_decode_miracle_fast(i+itr*n, x[i], + k, epsilon_approx, 2**coding_cost) + else: + _, _, index = miracle.encoder(i+itr*n, x[i], k, epsilon_approx, 2**coding_cost) + x_approx_miracle[:,i] = miracle.decoder(i+itr*n, index, k, epsilon_approx, + 2**coding_cost) + prob_approx_miracle = unbias.unbias_miracle(k, epsilon_approx, 2**coding_cost, + x_approx_miracle.T, n, normalization = 1) + approx_miracle_error[itr, step] = np.linalg.norm([p_i - phat_i for p_i, phat_i + in zip(prob, prob_approx_miracle)], ord=1) + + if config.run_ss: + x_ss = ss.encode_string_fast(k, epsilon_target, x) + prob_ss = ss.decode_string(k, epsilon_target, x_ss, n, normalization = 1) + ss_error[itr, step] = np.linalg.norm([p_i - phat_i for p_i, phat_i + in zip(prob, prob_ss)], ord=1) + + if config.run_rhr: + x_rhr = rhr.encode_string(k, epsilon_target, coding_cost, x) + prob_rhr = rhr.decode_string_fast(k, epsilon_target, coding_cost, x_rhr, + normalization = 1) # estimate the original underlying distribution + rhr_error[itr, step] = np.linalg.norm([p_i - phat_i for p_i, phat_i + in zip(prob, prob_rhr)], ord=1) + + if config.run_approx_miracle: + print("approx miracle error: " + + str(approx_miracle_error[:, step])) + if config.run_miracle: + print("miracle error: " + str(miracle_error[:, step])) + if config.run_modified_miracle: + print("modified miracle error: " + + str(modified_miracle_error[:, step])) + if config.run_ss: + print("ss error: " + str(ss_error[:, step])) + if config.run_rhr: + print("rhr error: " + str(rhr_error[:, step])) + print(time.time() - start_time) + + print("--------------") + if config.run_approx_miracle: + print("approx miracle error:") + print(np.mean(approx_miracle_error, axis=0)) + if config.run_miracle: + print("miracle error:") + print(np.mean(miracle_error, axis=0)) + if config.run_modified_miracle: + print("modified miracle error:") + 
print(np.mean(modified_miracle_error, axis=0)) + if config.run_ss: + print("ss error:") + print(np.mean(ss_error, axis=0)) + if config.run_rhr: + print("rhr error:") + print(np.mean(rhr_error, axis=0)) + + plt.figure(figsize=((8, 5)), dpi=80) + plt.axes((.15, .2, .83, .75)) + if config.run_approx_miracle: + plt.errorbar( + vary_space, + np.mean(approx_miracle_error, axis=0), + yerr=np.std(approx_miracle_error, axis=0)/np.sqrt(config.num_itr), + linewidth = 3.0, + label="MRC") + if config.run_miracle: + plt.errorbar( + vary_space, + np.mean(miracle_error, axis=0), + yerr=np.std(miracle_error, axis=0)/np.sqrt(config.num_itr), + linewidth = 3.0, + label="MRC") + if config.run_modified_miracle: + plt.errorbar( + vary_space, + np.mean(modified_miracle_error, axis=0), + yerr=np.std(modified_miracle_error, axis=0)/np.sqrt(config.num_itr), + linewidth = 3.0, + label="MMRC") + if config.run_ss: + plt.errorbar( + vary_space, + np.mean(ss_error, axis=0), + yerr=np.std(ss_error, axis=0)/np.sqrt(config.num_itr), + ls='--', + linewidth = 3.0, + label="Subset Selection") + if config.run_rhr: + plt.errorbar( + vary_space, + np.mean(rhr_error, axis=0), + yerr=np.std(rhr_error, axis=0)/np.sqrt(config.num_itr), + ls='--', + linewidth = 3.0, + label="RHR") + plt.xticks(fontsize=28) + plt.yticks(fontsize=28) + plt.ylabel("$\ell_{2}$ error", fontsize=28) + if config.vary == "k": + plt.xlabel(r"$d$", fontsize=28) + plt.xticks([200,400,600,800,1000]) + plt.yticks([0.2,0.4,0.6,0.8]) + plt.legend(fontsize=24, loc='upper left') + elif config.vary == "n": + plt.xlabel(r"$n$", fontsize=28) + plt.ticklabel_format(style='sci', axis='x', scilimits=(3,3)) + plt.gca().xaxis.get_offset_text().set_fontsize(16) + plt.yticks([0.3,0.4,0.5,0.6,0.7]) + plt.legend(fontsize=24, loc='upper right') + elif config.vary == "eps": + plt.xlabel(r"$\varepsilon$", fontsize=28) + plt.yticks([0.3,0.6,0.9,1.2,1.5]) + plt.legend(fontsize=24, loc='upper right') + + with file_open(work_path + "/rcc_dp_ss_comparison.png", 
"wb") as f: + plt.savefig(f, format="png") + + if config.run_approx_miracle: + with file_open(work_path + "/approx_miracle_error.csv", "w") as f: + np.savetxt(f, approx_miracle_error, delimiter=",") + + if config.run_miracle: + with file_open(work_path + "/miracle_error.csv", "w") as f: + np.savetxt(f, miracle_error, delimiter=",") + + if config.run_modified_miracle: + with file_open(work_path + "/modified_miracle_error.csv", "w") as f: + np.savetxt(f, modified_miracle_error, delimiter=",") + + if config.run_ss: + with file_open(work_path + "/ss_error.csv", "w") as f: + np.savetxt(f, ss_error, delimiter=",") + + if config.run_rhr: + with file_open(work_path + "/rhr_error.csv", "w") as f: + np.savetxt(f, rhr_error, delimiter=",") diff --git a/rcc_dp/frequency_estimation/experiment_coding_cost.py b/rcc_dp/frequency_estimation/experiment_coding_cost.py new file mode 100644 index 00000000..5f4dfbfa --- /dev/null +++ b/rcc_dp/frequency_estimation/experiment_coding_cost.py @@ -0,0 +1,199 @@ +# Copyright 2021, Google LLC. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Evaluation of miracle, rhr, and subset selection when coding cost is varied.""" + +import json +import math +import time +import matplotlib +import matplotlib.pyplot as plt +import matplotlib.lines as mlines +import numpy as np +from rcc_dp.frequency_estimation import miracle +from rcc_dp.frequency_estimation import modify_pi +from rcc_dp.frequency_estimation import rhr +from rcc_dp.frequency_estimation import ss +from rcc_dp.frequency_estimation import unbias +matplotlib.rcParams['ps.useafm'] = True +matplotlib.rcParams['pdf.use14corefonts'] = True +matplotlib.rcParams['text.usetex'] = True + + +def generate_geometric_distribution(k,lbd): + """Generate the discrete geometric distribution.""" + elements = range(0,k) + prob = [(1-lbd)*math.pow(lbd,x)/(1-math.pow(lbd,k)) for x in elements] + return prob + + +def generate_uniform_distribution(k): + """Generate the discrete uniform distribution.""" + raw_distribution = [1] * k + sum_raw = sum(raw_distribution) + prob = [float(y)/float(sum_raw) for y in raw_distribution] + return prob + + +def generate_zipf_distribution(k,degree): + """Generate the discrete zipf distribution.""" + raw_distribution = [1/(float(i)**(degree)) for i in range(1,k+1)] + sum_raw = sum(raw_distribution) + prob = [float(y)/float(sum_raw) for y in raw_distribution] + return prob + + +def evaluate(work_path, config, file_open=open): + """Evaluates miracle, rhr, ss methods.""" + with file_open(work_path + "/config.json", "w") as f: + json.dump(config.to_dict(), f) + + start_time = time.time() + + alpha = config.alpha + # Get default values. 
+ k = config.k + n = config.n + epsilon_target = config.epsilon_target + + vary_space = config.cc_space + + print("coding space = " + str(vary_space)) + + modified_miracle_error = np.zeros((config.num_itr, len(vary_space))) + rhr_error = np.zeros((config.num_itr, 1)) + ss_error = np.zeros((config.num_itr, 1)) + + rhr_coding_cost = epsilon_target + + for itr in range(config.num_itr): + print("itr = %d" % itr) + print("epsilon target = " + str(epsilon_target)) + print("n = " + str(n)) + print("k = %d" % k) + + if config.run_modified_miracle: + eta = epsilon_target / 2.0 + print("eta = " + str(eta)) + print("alpha = " + str(alpha)) + + if config.distribution == "geometric": + lbd = config.lbd_geometric + prob = generate_geometric_distribution(k,lbd) + elif config.distribution == "zipf": + degree = config.degree_zipf + prob = generate_zipf_distribution(k,degree) + elif config.distribution == "uniform": + prob = generate_uniform_distribution(k) + else: + raise ValueError( + "distribution should be either be geometric, zipf, uniform.") + x = np.random.choice(k, n, p=prob) + + + if config.run_ss: + x_ss = ss.encode_string_fast(k, epsilon_target, x) + prob_ss = ss.decode_string(k, epsilon_target, x_ss, n, normalization = 1) + ss_error[itr, 0] = np.linalg.norm([p_i - phat_i for p_i, phat_i + in zip(prob, prob_ss)], ord=1) + + if config.run_rhr: + x_rhr = rhr.encode_string(k, epsilon_target, rhr_coding_cost, x) + prob_rhr = rhr.decode_string_fast(k, epsilon_target, rhr_coding_cost, x_rhr, + normalization = 1) # estimate the original underlying distribution + rhr_error[itr, 0] = np.linalg.norm([p_i - phat_i for p_i, phat_i + in zip(prob, prob_rhr)], ord=1) + + for step, vary_parameter in enumerate(vary_space): + coding_cost = vary_parameter + print("coding cost = %d" % coding_cost) + + if config.run_modified_miracle: + x_modified_miracle = np.zeros((k, n)) + for i in range(n): + if config.encoding_type == "fast": + x_modified_miracle[:,i] = 
miracle.encode_decode_modified_miracle_fast( + i+itr*n, x[i], k, alpha*epsilon_target, 2**coding_cost) + else: + _, pi, _ = miracle.encoder(i+itr*n, x[i], k, alpha*epsilon_target, + 2**coding_cost) + expected_beta = np.ceil(k/(np.exp(epsilon_target)+1))/k + pi_all = modify_pi.modify_pi(pi, eta, epsilon_target, + (np.exp(epsilon_target/2))/(1+expected_beta*(np.exp(epsilon_target)-1))) + index = np.random.choice(2**coding_cost, 1, p=pi_all[-1])[0] + x_modified_miracle[:,i] = miracle.decoder(i+itr*n, index, k, + alpha*epsilon_target, 2**coding_cost) + prob_modified_miracle = unbias.unbias_modified_miracle(k, alpha*epsilon_target, + 2**coding_cost, x_modified_miracle.T, n, normalization = 1) + modified_miracle_error[itr, step] = np.linalg.norm([p_i - phat_i for p_i, phat_i + in zip(prob, prob_modified_miracle)], ord=1) + + print(time.time() - start_time) + + print("--------------") + if config.run_modified_miracle: + print("modified miracle error:") + print(np.mean(modified_miracle_error, axis=0)) + if config.run_ss: + print("ss error:") + print(np.mean(ss_error, axis=0)) + if config.run_rhr: + print("rhr error:") + print(np.mean(rhr_error, axis=0)) + + plt.figure(figsize=(10, 8), dpi=80) + if config.run_modified_miracle: + plt.errorbar( + vary_space, + np.mean(modified_miracle_error, axis=0), + yerr=np.std(modified_miracle_error, axis=0), + linewidth = 3.0, + label="MMRC") + if config.run_ss: + plt.errorbar(vary_space, + [np.mean(ss_error, axis=0)[0]]*len(vary_space), + yerr = [np.std(ss_error, axis=0)[0]/np.sqrt(config.num_itr)]*len(vary_space), + ls='--', + linewidth = 3.0, + label="Subset Selection") + if config.run_rhr: + plt.errorbar(vary_space, + [np.mean(rhr_error, axis=0)[0]]*len(vary_space), + yerr = [np.std(rhr_error, axis=0)[0]/np.sqrt(config.num_itr)]*len(vary_space), + ls='--', + linewidth = 3.0, + label="RHR") + plt.xticks(fontsize=28) + plt.yticks(fontsize=28) + plt.ylabel("$\ell_{2}$ error", fontsize=28) + plt.xlabel(r"$\#$ bits", fontsize=28) + 
plt.yticks([0.4,0.5,0.6,0.7,0.8,0.9]) + plt.legend(fontsize=24, loc='upper right') + + with file_open(work_path + "/rcc_dp_ss_mse_vs_coding_cost.png", "wb") as f: + plt.savefig(f, format="png") + + with file_open(work_path + "/time.txt", "w") as f: + np.savetxt(f, np.array(time.time() - start_time).reshape(-1, 1)) + + if config.run_modified_miracle: + with file_open(work_path + "/modified_miracle_error.csv", "w") as f: + np.savetxt(f, modified_miracle_error, delimiter=",") + + if config.run_ss: + with file_open(work_path + "/ss_error.csv", "w") as f: + np.savetxt(f, ss_error, delimiter=",") + + if config.run_rhr: + with file_open(work_path + "/rhr_error.csv", "w") as f: + np.savetxt(f, rhr_error, delimiter=",") diff --git a/rcc_dp/frequency_estimation/experiment_test.py b/rcc_dp/frequency_estimation/experiment_test.py new file mode 100644 index 00000000..32abd110 --- /dev/null +++ b/rcc_dp/frequency_estimation/experiment_test.py @@ -0,0 +1,41 @@ +# Copyright 2021, Google LLC. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Tests for experiment definitions.""" + +from absl.testing import absltest +from rcc_dp.frequency_estimation import config as defaults +from rcc_dp.frequency_estimation import experiment +from rcc_dp.frequency_estimation import experiment_coding_cost + + +class ExperimentTest(absltest.TestCase): + + def test_evaluate_does_not_fail(self): + work_path = self.create_tempdir().full_path + config = defaults.get_config() + config.num_itr = 1 + config.k = 100 + config.n = 200 + if config.vary == "cc": + config.epsilon_target = 1 + config.cc_space = [4] + experiment_coding_cost.evaluate(work_path, config) + else: + config.eps_space = [1] + config.t = 2 + experiment.evaluate(work_path, config) + + +if __name__ == "__main__": + absltest.main() diff --git a/rcc_dp/frequency_estimation/miracle.py b/rcc_dp/frequency_estimation/miracle.py new file mode 100644 index 00000000..89c8a648 --- /dev/null +++ b/rcc_dp/frequency_estimation/miracle.py @@ -0,0 +1,199 @@ +# Copyright 2021, Google LLC. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Minimal random coding (likelihood decoder) definitions. + +This is the code to be used to simulate the minimum random coding algorithm +(tailored to subset selection). The algorithm was introduced by Havasi et al. +in "Minimal Random Code Learning: Getting Bits Back from Compressed Model +Parameters" - https://arxiv.org/pdf/1810.00440.pdf. 
+ +For brevity, we may refer to it as 'MIRACLE', although technically this refers +to the complete model compression pipeline of Havasi et al. +""" + +import numpy as np +import scipy.special as sc + +def encoder(seed, x, k, epsilon, number_candidates): + """This is the encoder used by the miracle algorithm. + + Args: + seed: The random seed to be used by the encoder. + x: The user input data. + k: The dimension of the one-hot encoded x. + epsilon: The privacy parameter epsilon. + number_candidates: The number of candidates to be sampled. + + Returns: + z: The set of candidates sampled at the encoder. + index: The index sampled by the encoder. + pi: The distribution over the candidates for the given input data x. + """ + d = int(np.ceil(k/(np.exp(epsilon)+1))) + + c1 = np.exp(epsilon)/(sc.comb(k-1, d-1)* np.exp(epsilon) + sc.comb(k-1, d)) + c2 = 1/(sc.comb(k-1, d-1)* np.exp(epsilon) + sc.comb(k-1, d)) + + # Proposal distribution is chosen to be uniform on all d-hot encoded vectors. + z = np.zeros((k, number_candidates)) + z[:d, :] = 1 + rs = np.random.RandomState(seed) + for i in range(number_candidates): + rs.shuffle(z[:, i]) + + pi = np.where(z[x, :] == 1, c1, c2) + pi /= np.sum(pi) + index = np.random.choice(number_candidates, 1, p=pi)[0] + return z, pi, index + + +def decoder(seed, index, k, epsilon, number_candidates): + """This is the decoder used by the miracle algorithm. + + Args: + seed: The random seed to be used by the decoder (this seed should be the + same as the one used by the encoder). + index: The index transmitted by the encoder. + k: The dimension of the data. + epsilon: The privacy parameter epsilon. + number_candidates: The number of candidates to be sampled. + + Returns: + z_k: The candidate corresponding to the index (This is the candidate that + is distributed according to the conditional distribution of privunit). + """ + d = int(np.ceil(k/(np.exp(epsilon)+1))) + + # Proposal distribution should be the same as the one used by the encoder. 
+ z = np.zeros((k, number_candidates)) + z[:d, :] = 1 + rs = np.random.RandomState(seed) + for i in range(number_candidates): + rs.shuffle(z[:, i]) + z_k = z[:, index] + return z_k + + +def get_approx_epsilon(epsilon_target, k, number_candidates, delta): + """Get the effective epsilon for miracle with approxmiate-DP guarantees. + + Args: + epsilon_target: The privacy guarantee we desire. + k: The number of dimensions. + number_candidates: The number of candidates. + delta: The delta in the differential privacy guarantee. + + Returns: + epsilon_approx: The resulting epsilon that this version of miracle ends + up using. + """ + epsilon_search_space = np.linspace(0,epsilon_target,100) + epsilon_search_space = epsilon_search_space[:-1] + epsilon_approx = 0 + # Find the largest epsilon for SS so that MIRACLE meets epsilon_target + for _, epsilon in enumerate(epsilon_search_space): + d = int(np.ceil(k/(np.exp(epsilon)+1))) + cupper = (k*np.exp(epsilon))/(k-d+d*np.exp(epsilon)) + clower = k/(k-d+d*np.exp(epsilon)) + t = np.abs(cupper-clower)*np.sqrt((np.log(2/delta))/(2*number_candidates)) + if -1 < t < 1: + if epsilon + np.log(1+t) - np.log(1-t) <= epsilon_target: + epsilon_approx = epsilon + return epsilon_approx + + +def encode_decode_miracle_fast(seed, x, k, epsilon, number_candidates): + """A fast implementation of the miracle protocol -- instead of + generating number_candidates samples, generate one sample with + true expectation. + + Args: + seed: The random seed to be used by the encoder. + x: The user input data. + k: The dimension of the one-hot encoded x. + epsilon: The privacy parameter epsilon. + number_candidates: The number of candidates to be sampled. + + Returns: + z: The candidate sampled at the decoder. 
+ """ + d = int(np.ceil(k/(np.exp(epsilon)+1))) + num_cand_in_cap = np.random.binomial(number_candidates, d/k, size=None) + pi_in = np.exp(epsilon)/(num_cand_in_cap*np.exp(epsilon) + + (number_candidates-num_cand_in_cap)) + prob_sample_from_cap = num_cand_in_cap*pi_in + + if np.random.uniform(0,1) <= prob_sample_from_cap: + # Generate a sample uniformly from the cap + z = [1]*(d-1)+[0]*(k-d) + rs = np.random.RandomState(seed) + rs.shuffle(z) + z = z[:x] + [1] + z[x:] + else: + # Generate a sample uniformly from outside the cap + z = [1]*(d)+[0]*(k-d-1) + rs = np.random.RandomState(seed) + rs.shuffle(z) + z = z[:x] + [0] + z[x:] + z = np.array(z) + return z + + +def encode_decode_modified_miracle_fast(seed, x, k, epsilon, number_candidates): + """A fast implementation of the modified miracle protocol -- instead of + generating number_candidates samples, generate one sample with + true expectation. + + Args: + seed: The random seed to be used by the encoder. + x: The user input data. + k: The dimension of the one-hot encoded x. + epsilon: The privacy parameter epsilon. + number_candidates: The number of candidates to be sampled. + + Returns: + z: The candidate sampled at the decoder. 
+ """ + d = int(np.ceil(k/(np.exp(epsilon)+1))) + c1 = np.exp(epsilon)/(sc.comb(k-1,d-1)* np.exp(epsilon) + sc.comb(k-1,d)) + c2 = 1/(sc.comb(k-1,d-1)* np.exp(epsilon) + sc.comb(k-1,d)) + + num_cand_in_cap = np.random.binomial(number_candidates, d/k, size=None) + + beta = num_cand_in_cap/number_candidates + expected_beta = np.ceil(k/(np.exp(epsilon)+1))/k + + pi_in = c1/(num_cand_in_cap*c1+(number_candidates-num_cand_in_cap)*c2) + tilde_pi_in = pi_in*(1+beta*(np.exp(epsilon)-1))/((1 + + expected_beta*(np.exp(epsilon)-1))) + if beta > expected_beta: + tilde_pi_in = tilde_pi_in*(beta+expected_beta*(np.exp(epsilon) + - 1))/(beta*np.exp(epsilon)) + + prob_sample_from_cap = num_cand_in_cap*tilde_pi_in + + if np.random.uniform(0,1) <= prob_sample_from_cap: + # Generate a sample uniformly from the cap + z = [1]*(d-1)+[0]*(k-d) + rs = np.random.RandomState(seed) + rs.shuffle(z) + z = z[:x] + [1] + z[x:] + else: + # Generate a sample uniformly from outside the cap + z = [1]*(d)+[0]*(k-d-1) + rs = np.random.RandomState(seed) + rs.shuffle(z) + z = z[:x] + [0] + z[x:] + z = np.array(z) + return z diff --git a/rcc_dp/frequency_estimation/modify_pi.py b/rcc_dp/frequency_estimation/modify_pi.py new file mode 100644 index 00000000..6e55865f --- /dev/null +++ b/rcc_dp/frequency_estimation/modify_pi.py @@ -0,0 +1,151 @@ +# Copyright 2021, Google LLC. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Code to modify the distribution pi to tilde_pi. 
+ +The distribution tilde_pi (which is equal to pi_all[-1]) is 2 * eta-DP. +""" + +import numpy as np + + +def modify_pi(pi, eta, epsilon, multiplicative_factor): + """This function modifies the distribution pi to make it 2eta-LDP. + + The function essentially ensures that the new distribution lies between the + upper threshold `exp(eta) * multiplicative_factor * p` and the lower threshold + `exp(-eta) * multiplicative_factor * p`. It first checks if the distribution + already lies inside the thresholds. If not, it trades the mass beyond the upper + threshold with mass beyond the lower threshold. In other words, it ensures + that at least one of the constraints is satisfied (the one which is violated + less severely). This is done with the help of the helper function + `trade_mass`. Next, it iteratively enforces the constraint that is still + violated and renormalizes the distribution. This is done with the help of the + helper function `normalize`. + + Args: + pi: The input distribution to be modified + eta: The privacy parameter that is half times the desired privacy guarantee. + epsilon: The privacy parameter epsilon. + multiplicative_factor: The factor by which the uniform distribution over the + candidates is scaled with. + + Returns: + pi_all: The container containing how the distribution pi evolved from pi to + tilde_pi (which is equal to pi_all[-1]). Further, tilde_pi is 2eta-LDP. + """ + if eta < epsilon / 2: + raise ValueError('eta should be larger than epsilon/2.') + + number_candidates = len(pi) + p = np.zeros(number_candidates) + 1.0 / number_candidates + p = p * multiplicative_factor + # The container used to track changes in the distribution. + pi_all = [pi] + + # Calculate how much mass lies above the upper threshold. + mass_above = np.sum(np.maximum(pi - np.exp(eta) * p, 0)) + # Calculate how much mass lies below the lower threshold. 
+ mass_below = np.sum(np.maximum(np.exp(-eta) * p - pi, 0)) + + if mass_above == 0 and mass_below == 0: + return pi_all + elif mass_above > mass_below: + # Since the lower threshold is violated less severely, correct all + # probabilities which are too low. + below = pi < np.exp(-eta) * p + pi_new = pi.copy() + pi_new[below] = np.exp(-eta) * p[below] + + # Sort by distance from upper threshold (biggest offenders first). The + # indices above the upper threshold are first in order. + indices = np.argsort(np.exp(eta) * p - pi_all[-1]) + + # Trade mass above the upper threshold against mass below the lower + # threshold as much as possible. + budget = mass_below + for i in indices: + # Correct probability above threshold as much as possible. + diff = pi_new[i] - np.exp(eta) * p[i] + if diff > budget: + pi_new[i] -= budget + break + elif diff > 0: + pi_new[i] -= diff + budget -= diff + pi_all.append(pi_new.copy()) + + # Now, pi_new disobeys at-most one constraint i.e., some mass is either + # above the upper threshold or all the mass is between the thresholds. + + # Calculate the remaining mass above the upper threshold. + mass_above = np.sum(np.maximum(pi_new - np.exp(eta) * p, 0)) + + while mass_above > 0: + # Correct probabilities above the upper threshold. + above = pi_new >= np.exp(eta) * p + pi_new[above] = np.exp(eta) * p[above] + + # Renormalize distribution. + not_above = ~above + pi_new[not_above] += mass_above * pi[not_above] / np.sum(pi[not_above]) + pi_all.append(pi_new.copy()) + + # Calculate the remaining mass above the upper threshold. + mass_above = np.sum(np.maximum(pi_new - np.exp(eta) * p, 0)) + + else: + # Since the upper threshold is violated less severely, correct all + # probabilities which are too high. + above = pi > np.exp(eta) * p + pi_new = pi.copy() + pi_new[above] = np.exp(eta) * p[above] + + # Sort by distance from lower threshold (biggest offenders first). The + # indices below the lower threshold are first in order. 
+ indices = np.argsort(pi_all[-1] - np.exp(-eta) * p) + + # Trade mass below the lower threshold against mass above the upper + # threshold as much as possible. + budget = mass_above + for i in indices: + # Correct probability below threshold as much as possible. + diff = np.exp(-eta) * p[i] - pi_new[i] + if diff > budget: + pi_new[i] += budget + break + elif diff > 0: + pi_new[i] += diff + budget -= diff + pi_all.append(pi_new.copy()) + + # Now, pi_new disobeys at-most one constraint i.e., some mass is either + # below the lower threshold or all the mass is between the thresholds. + + # Calculate the remaining mass below the lower threshold. + mass_below = np.sum(np.maximum(np.exp(-eta) * p - pi_new, 0)) + + while mass_below > 0: + # Correct probabilities below the lower threshold. + below = pi_new <= np.exp(-eta) * p + pi_new[below] = np.exp(-eta) * p[below] + + # Renormalize distribution. + not_below = ~below + pi_new[not_below] -= mass_below * pi[not_below] / np.sum(pi[not_below]) + pi_all.append(pi_new.copy()) + + # Calculate the remaining mass below the lower threshold. + mass_below = np.sum(np.maximum(np.exp(-eta) * p - pi_new, 0)) + + return pi_all \ No newline at end of file diff --git a/rcc_dp/frequency_estimation/modify_pi_test.py b/rcc_dp/frequency_estimation/modify_pi_test.py new file mode 100644 index 00000000..bc8d7491 --- /dev/null +++ b/rcc_dp/frequency_estimation/modify_pi_test.py @@ -0,0 +1,82 @@ +# Copyright 2021, Google LLC. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +"""Tests for modify pi.""" + +from absl.testing import absltest +import numpy as np +from rcc_dp.frequency_estimation import get_parameters +from rcc_dp.frequency_estimation import miracle +from rcc_dp.frequency_estimation import modify_pi + + +class ModifyPiTest(absltest.TestCase): + + def test_tilde_pi_is_a_distribution(self): + """Test whether every distribution generated by modify_all sums to 1.""" + budget = 0.5 + for d in [100, 200]: + for number_candidates in [2**4, 2**5]: + for epsilon in [2, 3]: + x = np.random.normal(0, 1, (d, 1)) + x = np.divide(x, np.linalg.norm(x, axis=0).reshape(1, -1)) + c1, c2, _, gamma = get_parameters.get_parameters_unbiased_miracle( + epsilon, d, number_candidates, budget) + _, _, pi = miracle.encoder(0, x[:, 0], number_candidates, c1, c2, + gamma) + eta = epsilon / 2 + pi_all = modify_pi.modify_pi(pi, eta, epsilon, + c1 / (np.exp(epsilon / 2))) + self.assertLessEqual(len(pi_all), 3) + for distribution in pi_all: + self.assertLessEqual(np.abs(np.sum(distribution)-1), 0.0001) + + def test_tilde_pi_is_private(self): + """Test whether tilde pi satisfies the DP constraint.""" + budget = 0.5 + for d in [100, 200]: + for number_candidates in [2**4, 2**5]: + for epsilon in [2, 3]: + x = np.random.normal(0, 1, (d, 1)) + x = np.divide(x, np.linalg.norm(x, axis=0).reshape(1, -1)) + c1, c2, _, gamma = get_parameters.get_parameters_unbiased_miracle( + epsilon, d, number_candidates, budget) + _, _, pi = miracle.encoder(0, x[:, 0], number_candidates, c1, c2, + gamma) + eta = epsilon / 2 + pi_all = modify_pi.modify_pi(pi, eta, epsilon, + c1 / (np.exp(epsilon / 2))) + self.assertLessEqual(np.max(pi_all[-1]), c1 / number_candidates) + self.assertLessEqual(c1 * np.exp(-epsilon) / number_candidates, + np.min(pi_all[-1]) + 1e-6) + + def test_convergence(self): + """Test whether modify_pi converges in at-most 3 iterations as expected.""" + budget 
= 0.5 + for d in [100, 200]: + for number_candidates in [2**4, 2**5]: + for epsilon in [2, 3]: + x = np.random.normal(0, 1, (d, 1)) + x = np.divide(x, np.linalg.norm(x, axis=0).reshape(1, -1)) + c1, c2, _, gamma = get_parameters.get_parameters_unbiased_miracle( + epsilon, d, number_candidates, budget) + _, _, pi = miracle.encoder(0, x[:, 0], number_candidates, c1, c2, + gamma) + eta = epsilon / 2 + pi_all = modify_pi.modify_pi(pi, eta, epsilon, + c1 / (np.exp(epsilon / 2))) + self.assertLessEqual(len(pi_all), 3) + + +if __name__ == "__main__": + absltest.main() \ No newline at end of file diff --git a/rcc_dp/frequency_estimation/rhr.py b/rcc_dp/frequency_estimation/rhr.py new file mode 100644 index 00000000..f783f705 --- /dev/null +++ b/rcc_dp/frequency_estimation/rhr.py @@ -0,0 +1,140 @@ +# Copyright 2021, Google LLC. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""RHR definitions. + +This is the code to be used to simulate RHR. + +RHR was introduced by Chen et al in "Breaking the Communication-Privacy- +Accuracy Trilemma" - https://arxiv.org/pdf/2007.11707.pdf +""" + +import math +import numpy as np +from rcc_dp.frequency_estimation import ss + + +def encode_string(dim, epsilon, comm, x): + """An implementation of the RHR protocol to privatize x.""" + n = len(x) + + # Pad dim to the power of 2. + padded_d = int(math.pow(2, math.ceil(math.log(dim, 2)))) + # Calculate the effective communication cost + # i.e., min(comm, log(e^\epsilon)). 
+ eff_comm = min(comm, math.ceil(epsilon*math.log(math.e, 2)), + math.ceil(math.log(dim, 2))+1) + # Set the block size. + block_size = int(padded_d/math.pow(2, eff_comm-1)) + + loc_sign = np.zeros(n) + for idx in range(n): + j = int(idx % block_size) + x_individual = int(x[idx]) + loc_sign[idx] = 2*int(x_individual/block_size) + if get_hadamard_entry(padded_d, j, x_individual) == -1: + loc_sign[idx] = loc_sign[idx] + 1 + + z = rr_encode_string(int(math.pow(2, eff_comm)), epsilon, + np.array(loc_sign)) + return z + + +def rr_encode_string(alphabet_size, epsilon, samples): + """An implementation of the RR protocol to privatize x.""" + n = len(samples) + # Start by setting private_samples = samples. + private_samples_rr = np.copy(samples) + + # Determine which samples need to be noised (i.e., flipped). + flip = np.random.random_sample(n) < (alphabet_size + - 1)/(math.exp(epsilon) + alphabet_size - 1) + flip_samples = samples[flip] + + # Select new samples uniformly at random to replace the original ones. + rand_samples = np.random.randint(0, alphabet_size - 1, len(flip_samples)) + + # Shift the samples if needed to avoid sampling the original samples. + rand_samples[rand_samples >= flip_samples] += 1 + + # Replace the original samples by the randomly selected ones. + private_samples_rr[flip] = rand_samples + return private_samples_rr + + +def decode_string_fast(dim, epsilon, comm, z, normalization = 1): + """Learn the original distribution from the privatized strings faster.""" + l = len(z) + # Pad dim to the power of 2. + padded_d = int(math.pow(2, math.ceil(math.log(dim,2)))) + # Calculate the effective communication cost + # i.e., min(comm, log(e^\epsilon)). + eff_comm = min(comm, math.ceil(epsilon*math.log(math.e, 2)), + math.ceil(math.log(dim, 2))+1) + # Set the block size. + block_size = int(padded_d/math.pow(2, eff_comm-1)) + n = int(l/block_size)*block_size + + # The input z is a matrix consisting of all the bit vectors. 
+ group_list = np.array(z[:n]).reshape(int(n/block_size), block_size).T + + # Create histograms to specify the empirical distributions of each group. + histograms = np.zeros((block_size, int(padded_d/block_size))) + for g_idx in range(block_size): + g_count, _ = np.histogram(group_list[g_idx], + range(int(math.pow(2, eff_comm))+1)) + histograms[g_idx] = g_count[::2] - g_count[1::2] + histograms[g_idx] = histograms[g_idx] * (math.exp(epsilon) + + math.pow(2, eff_comm)-1)/(math.exp(epsilon)-1)*(block_size/n) + + # Obtain estimator of q. + q = np.zeros((block_size, int(padded_d/block_size))) + for j in range(block_size): + q[j, :] = fast_inverse_hadamard_transform(int(padded_d/block_size), + histograms[j, :]) + + q = q.reshape((padded_d, ), order = 'F') + + # Perform inverse Hadamard transform to get p + p_estimate = fast_inverse_hadamard_transform(padded_d, q)/padded_d + p_estimate = p_estimate[:dim] + + if normalization == 0: + # Clip and normalize. + p_estimate = ss.probability_normalize(p_estimate) + if normalization == 1: + # Project on the simplex. + p_estimate = ss.probability_project_simplex(p_estimate) + return p_estimate + + +def get_hadamard_entry(d, x, y): + """Get (H_d)_{x,y}.""" + z = x & y + z_bit = bin(z)[2:].zfill(int(math.log(d, 2))) + check = 0 + for i in range(0, int(math.log(d, 2))): + check = check^int(z_bit[i]) + return (-1)**check + + +def fast_inverse_hadamard_transform(k, dist): + """Performs inverse Hadamard transform.""" + if k == 1: + return dist + dist1 = dist[0 : k//2] + dist2 = dist[k//2 : k] + trans1 = fast_inverse_hadamard_transform(k//2, dist1) + trans2 = fast_inverse_hadamard_transform(k//2, dist2) + trans = np.concatenate((trans1+ trans2, trans1 - trans2)) + return trans diff --git a/rcc_dp/frequency_estimation/ss.py b/rcc_dp/frequency_estimation/ss.py new file mode 100644 index 00000000..572f42ad --- /dev/null +++ b/rcc_dp/frequency_estimation/ss.py @@ -0,0 +1,82 @@ +# Copyright 2021, Google LLC. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Subset selection definitions. + +This is the code to be used to simulate the subset selection algorithm. + +The subset selection algorithm was introduced by Ye et al in +"Optimal Schemes for Discrete Distribution Estimation under +Locally Differential Privacy" - https://arxiv.org/pdf/1702.00610.pdf. +""" + +import numpy as np + + +def probability_project_simplex(p): + """Project the probabilities on the simplex.""" + k = len(p)  # Infer the size of the alphabet. + p_sorted = np.sort(p) + p_sorted[:] = p_sorted[::-1] + p_sorted_cumsum = np.cumsum(p_sorted) + i = 1 + while i < k: + if p_sorted[i] + (1.0 / (i + 1)) * (1 - p_sorted_cumsum[i]) < 0: + break + i += 1 + lmd = (1.0 / i) * (1 - p_sorted_cumsum[i - 1]) + return np.maximum(p + lmd, 0) + + +def probability_normalize(p): + """Normalize the probabilities so that they sum to 1.""" + p = np.maximum(p,0)  # Map it to be positive. + norm = np.sum(p) + p = np.true_divide(p,norm) + return p + + +def encode_string_fast(k, epsilon, x): + """A fast implementation of the subset selection protocol.""" + d = int(np.ceil(k/(np.exp(epsilon)+1))) + p = d*np.exp(epsilon)/(d*np.exp(epsilon)+k-d) + q = (d-p)/(k-1) + + n = len(x) + z = np.zeros((n, k)) + flip = np.random.random_sample((n, k)) + + # Instead of selecting exactly d ones, set each bit to be one + # independently with true expectation. 
+ for i in range(n): + z[i,x[i]] = np.logical_or(0,flip[i,x[i]] < p) + return np.logical_or(z, flip < q) + +def decode_string(k, epsilon, z, length, normalization = 1): + """Learn the original distribution from the privatized strings faster.""" + d = int(np.ceil(k/(np.exp(epsilon)+1))) + + temp1 = ((k-1)*np.exp(epsilon) + +1.0*(k-1)*(k-d)/d)/((k-d)*(np.exp(epsilon)-1)) + temp2 = ((d-1)*np.exp(epsilon)+k-d) / (1.0*(k-d)*(np.exp(epsilon)-1)) + + # The input z is a matrix consisting of all the bit vectors. + p_estimate = (1.0*np.sum(z, axis=0)*temp1/length)-temp2 + + if normalization == 0: + # Clip and normalize. + p_estimate = probability_normalize(p_estimate) + if normalization == 1: + # Project on the simplex. + p_estimate = probability_project_simplex(p_estimate) + return p_estimate diff --git a/rcc_dp/frequency_estimation/unbias.py b/rcc_dp/frequency_estimation/unbias.py new file mode 100644 index 00000000..f45b2b3a --- /dev/null +++ b/rcc_dp/frequency_estimation/unbias.py @@ -0,0 +1,112 @@ +# Copyright 2021, Google LLC. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Code to un-bias miracle & modified miracle for ss. + +Please note that these unbiasing techniques are specific to ss + miracle +and are not general strategies for miracle. The un-biasing happens by +modifying the corresponding m and b parameters. 
+""" + +import numpy as np +from scipy import stats +import scipy.special as sc +from rcc_dp.frequency_estimation import ss + + +def unbias_miracle(k, epsilon, number_candidates, z, n, normalization = 1): + """Get the unbiased estimate for miracle. + + Args: + k: The dimension of the one-hot encoded x. + epsilon: The privacy parameter epsilon. + number_candidates: The number of candidates to be sampled. + z: The privatized data. + n: The number of users. + normalization: Indicator whether to clip and normalize or + project on the simplex + + Returns: + p_estimate: The unbiased estimate for miracle + """ + d = int(np.ceil(k/(np.exp(epsilon)+1))) + c1 = np.exp(epsilon)/(sc.comb(k-1,d-1)* np.exp(epsilon) + sc.comb(k-1,d)) + c2 = 1/(sc.comb(k-1,d-1)* np.exp(epsilon) + sc.comb(k-1,d)) + beta = np.array(range(number_candidates+1))/number_candidates + # The probability with which you choose a candidate inside the cap. + pi_in = 1/number_candidates*(c1/(beta*c1+(1-beta)*c2)) + expectation = np.sum(stats.binom.pmf(range(number_candidates+1), + number_candidates, d/k)*range(number_candidates+1)*pi_in) + b_hat = (d - expectation)/(k-1) + m_hat = k*expectation/(k-1) - d/(k-1) + + p_estimate = (1.0*np.sum(z, axis=0)/(n*m_hat))-b_hat/m_hat + + if normalization == 0: + # Clip and normalize. + p_estimate = ss.probability_normalize(p_estimate) + elif normalization == 1: + # Project on the simplex. + p_estimate = ss.probability_project_simplex(p_estimate) + return p_estimate + + +def unbias_modified_miracle(k, epsilon, number_candidates, z, n, + normalization = 1): + """Get the unbiased estimate for modified miracle. + + Args: + k: The dimension of the one-hot encoded x. + epsilon: The privacy parameter epsilon. + number_candidates: The number of candidates to be sampled. + z: The privatized data. + n: The number of users. 
+ normalization: Indicator whether to clip and normalize or + project on the simplex + + Returns: + p_estimate: The unbiased estimate for modified miracle + """ + expected_beta = np.ceil(k/(np.exp(epsilon)+1))/k + d = int(np.ceil(k/(np.exp(epsilon)+1))) + c1 = np.exp(epsilon)/(sc.comb(k-1,d-1)* np.exp(epsilon) + sc.comb(k-1,d)) + c2 = 1/(sc.comb(k-1,d-1)* np.exp(epsilon) + sc.comb(k-1,d)) + # The fraction of candidates that lie inside the cap. + beta = np.array(range(number_candidates + 1)) / number_candidates + + pi_in = 1 / number_candidates * (c1 / (beta * c1 + (1 - beta) * c2)) + tilde_pi_in_1 = ( + pi_in * (1 + beta * (np.exp(epsilon) - 1)) / (1 + expected_beta * + (np.exp(epsilon) - 1))) + tilde_pi_in_2 = ( + tilde_pi_in_1 * (beta + expected_beta * (np.exp(epsilon) - 1)) / + (beta * np.exp(epsilon))) + tilde_pi_in_2[0,] = 0 + indicator = beta <= expected_beta + # The probability with which you choose a candidate inside the cap. + tilde_pi_in = indicator * tilde_pi_in_1 + (1 - indicator) * tilde_pi_in_2 + expectation = np.sum( + stats.binom.pmf(range(number_candidates + 1), number_candidates, + d / k) * range(number_candidates + 1) * tilde_pi_in) + b_tilde = (d - expectation)/(k-1) + m_tilde = k*expectation/(k-1) - d/(k-1) + + p_estimate = (1.0*np.sum(z, axis=0)/(n*m_tilde))-b_tilde/m_tilde + + if normalization == 0: + # Clip and normalize. + p_estimate = ss.probability_normalize(p_estimate) + elif normalization == 1: + # Project on the simplex. 
+ p_estimate = ss.probability_project_simplex(p_estimate) + return p_estimate diff --git a/rcc_dp/BUILD b/rcc_dp/mean_estimation/BUILD similarity index 100% rename from rcc_dp/BUILD rename to rcc_dp/mean_estimation/BUILD diff --git a/rcc_dp/config.py b/rcc_dp/mean_estimation/config.py similarity index 88% rename from rcc_dp/config.py rename to rcc_dp/mean_estimation/config.py index 772f4ae7..a40d033e 100644 --- a/rcc_dp/config.py +++ b/rcc_dp/mean_estimation/config.py @@ -24,14 +24,14 @@ def get_config(): # can take one of "biased_data", "unbiased_data", "same_data". data="biased_data", # Flags to indicate which methods to compare. - run_approx_miracle=True, + run_approx_miracle=False, run_miracle=False, - run_modified_miracle=False, + run_modified_miracle=True, run_privunit=True, run_sqkr=True, # Common parameters. num_itr=1, - coding_cost=8, + coding_cost=11, coding_cost_multiplier=1, approx_coding_cost_multiplier=3, approx_t=6, @@ -40,7 +40,8 @@ def get_config(): budget=0.5, alpha=1.0, # Variation. - vary="eps", # Can take one of "d", "n", "eps". + vary="eps", # Can take one of "cc", "d", "n", "eps". + cc_space=[6, 8, 10, 12], d_space=[200, 400, 600, 800, 1000], n_space=[2000, 4000, 6000, 8000, 10000], eps_space=list(range(1, 9)), @@ -48,7 +49,7 @@ def get_config(): n=5000, d=500, t=2, - epsilon_target=4, + epsilon_target=6, ) config = config_dict.ConfigDict(config) config.lock() # Prevent addition of new fields. diff --git a/rcc_dp/experiment.py b/rcc_dp/mean_estimation/experiment.py similarity index 86% rename from rcc_dp/experiment.py rename to rcc_dp/mean_estimation/experiment.py index a8866ded..5d0c01e6 100644 --- a/rcc_dp/experiment.py +++ b/rcc_dp/mean_estimation/experiment.py @@ -11,19 +11,26 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-"""Experiment definitions (i.e., evaluation of miracle, sqkr, privunit methods).""" +"""Experiment definitions (i.e., evaluation of miracle, sqkr, privunit methods +when either the data dimension d, the number of users n, or the privacy +parameter epsilon is varied).""" import json import math import time +import matplotlib import matplotlib.pyplot as plt +import matplotlib.lines as mlines import numpy as np from scipy import stats -from rcc_dp import get_parameters -from rcc_dp import miracle -from rcc_dp import modify_pi -from rcc_dp import privunit -from rcc_dp import sqkr +from rcc_dp.mean_estimation import get_parameters +from rcc_dp.mean_estimation import miracle +from rcc_dp.mean_estimation import modify_pi +from rcc_dp.mean_estimation import privunit +from rcc_dp.mean_estimation import sqkr +matplotlib.rcParams['ps.useafm'] = True +matplotlib.rcParams['pdf.use14corefonts'] = True +matplotlib.rcParams['text.usetex'] = True def evaluate(work_path, config, file_open=open): @@ -196,54 +203,64 @@ def evaluate(work_path, config, file_open=open): print("sqkr mse:") print(np.mean(sqkr_mse, axis=0)) - plt.figure(figsize=(10, 8), dpi=80) + plt.figure(figsize=((8, 5)), dpi=80) + plt.axes((.15, .2, .83, .75)) if config.run_approx_miracle: plt.errorbar( vary_space, np.mean(approx_miracle_mse, axis=0), yerr=np.std(approx_miracle_mse, axis=0) / np.sqrt(config.num_itr), - label="Approx-DP Miracle") + linewidth = 3.0, + label="MRC") if config.run_miracle: plt.errorbar( vary_space, np.mean(miracle_mse, axis=0), yerr=np.std(miracle_mse, axis=0) / np.sqrt(config.num_itr), - label="Miracle") + linewidth = 3.0, + label="MRC") if config.run_modified_miracle: plt.errorbar( vary_space, np.mean(modified_miracle_mse, axis=0), yerr=np.std(modified_miracle_mse, axis=0) / np.sqrt(config.num_itr), - label="Modified Miracle") + linewidth = 3.0, + label="MMRC") if config.run_privunit: plt.errorbar( vary_space, np.mean(privunit_mse, axis=0), yerr=np.std(privunit_mse, axis=0) / 
np.sqrt(config.num_itr), - label="PrivUnit") + ls='--', + linewidth = 3.0, + label="PrivUnit$_{2}$") if config.run_sqkr: plt.errorbar( vary_space, np.mean(sqkr_mse, axis=0), yerr=np.std(sqkr_mse, axis=0) / np.sqrt(config.num_itr), + ls='--', + linewidth = 3.0, label="SQKR") - plt.legend(fontsize=18) - plt.xticks(fontsize=18) - plt.yticks(fontsize=18) - plt.ylabel("mse", fontsize=18) + plt.legend(fontsize=24) + plt.xticks(fontsize=28) + plt.yticks(fontsize=28) + plt.ylabel("$\ell_{2}$ error", fontsize=28) if config.vary == "d": - plt.xlabel(r"$d$", fontsize=18) - plt.title( - "n = " + str(n) + ", " + r"$\epsilon = $" + str(epsilon_target), - fontsize=18) + plt.xlabel(r"$d$", fontsize=28) + plt.xticks([200,400,600,800,1000]) + plt.yticks([0.00,0.04,0.08,0.12,0.16]) + plt.legend(fontsize=24, loc='upper left') elif config.vary == "n": - plt.xlabel(r"$n$", fontsize=18) - plt.title( - "d = " + str(d) + ", " + r"$\epsilon = $" + str(epsilon_target), - fontsize=18) + plt.xlabel(r"$n$", fontsize=28) + plt.ticklabel_format(style='sci', axis='x', scilimits=(3,3)) + plt.gca().xaxis.get_offset_text().set_fontsize(16) + plt.yticks([0.00,0.05,0.10,0.15,0.20]) + plt.legend(fontsize=24, loc='upper right') elif config.vary == "eps": - plt.xlabel(r"$\epsilon$", fontsize=18) - plt.title("d = " + str(d) + ", $n = $" + str(n), fontsize=18) + plt.xlabel(r"$\varepsilon$", fontsize=28) + plt.yticks([0.3,0.6,0.9,1.2,1.5]) + plt.legend(fontsize=24, loc='upper right') with file_open(work_path + "/rcc_dp_comparison.png", "wb") as f: plt.savefig(f, format="png") diff --git a/rcc_dp/mean_estimation/experiment_coding_cost.py b/rcc_dp/mean_estimation/experiment_coding_cost.py new file mode 100644 index 00000000..c8bf74e2 --- /dev/null +++ b/rcc_dp/mean_estimation/experiment_coding_cost.py @@ -0,0 +1,181 @@ +# Copyright 2021, Google LLC. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Evaluation of miracle, sqkr, and privunit when coding cost is varied.""" + +import json +import math +import time +import matplotlib +import matplotlib.pyplot as plt +import matplotlib.lines as mlines +import numpy as np +from scipy import stats +from rcc_dp.mean_estimation import get_parameters +from rcc_dp.mean_estimation import miracle +from rcc_dp.mean_estimation import modify_pi +from rcc_dp.mean_estimation import privunit +from rcc_dp.mean_estimation import sqkr +matplotlib.rcParams['ps.useafm'] = True +matplotlib.rcParams['pdf.use14corefonts'] = True +matplotlib.rcParams['text.usetex'] = True + + +def evaluate(work_path, config, file_open=open): + """Evaluates miracle, sqkr, privunit methods.""" + with file_open(work_path + "/config.json", "w") as f: + json.dump(config.to_dict(), f) + + start_time = time.time() + + budget = config.budget + alpha = config.alpha + # Get default values. 
+ d = config.d + n = config.n + epsilon_target = config.epsilon_target + + vary_space = config.cc + + print("coding space = " + str(vary_space)) + + modified_miracle_mse = np.zeros((config.num_itr, len(vary_space))) + sqkr_mse = np.zeros((config.num_itr, 1)) + privunit_mse = np.zeros((config.num_itr, 1)) + + sqkr_coding_cost = epsilon_target + + for itr in range(config.num_itr): + print("itr = %d" % itr) + print("epsilon target = " + str(epsilon_target)) + print("n = " + str(n)) + print("d = %d" % d) + if config.run_modified_miracle: + eta = epsilon_target / 2.0 + print("eta = " + str(eta)) + print("alpha = " + str(alpha)) + + if config.data == "unbiased_data": + x = np.random.normal(0, 1, (d, n)) + x /= np.linalg.norm(x, axis=0) + elif config.data == "biased_data": + x = np.zeros((d, n)) + x[:, 0::2] = np.random.normal(10, 1, (d, (n + 1) // 2)) + x[:, 1::2] = np.random.normal(1, 1, (d, n // 2)) + x /= np.linalg.norm(x, axis=0) + elif config.data == "same_data": + x = np.random.normal(0, 1, (d, 1)) + x /= np.linalg.norm(x, axis=0) + x = np.repeat(x, n, axis=1) + else: + raise ValueError( + "data should be either be biased_data, unbiased_data, same_data.") + + if config.run_privunit: + x_privunit, _ = privunit.apply_privunit(x, epsilon_target, budget) + x_privunit = np.mean(np.array(x_privunit), axis=1, keepdims=True) + privunit_mse[itr, 0] = np.linalg.norm( + np.mean(x, axis=1, keepdims=True) - x_privunit)**2 + + if config.run_sqkr: + # Generate a random tight frame satisfying UP -- for sqkr. 
+ frame = 2**int(math.ceil(math.log(d, 2)) + 1) + u = stats.ortho_group.rvs(dim=frame).T[:, 0:d] + + k_equiv = min(epsilon_target, sqkr_coding_cost) + [_, _, q_perturb] = sqkr.kashin_encode(u, x, k_equiv, epsilon_target) + x_kashin = sqkr.kashin_decode(u, k_equiv, epsilon_target, q_perturb) + sqkr_mse[itr, 0] = np.linalg.norm( + np.mean(x, axis=1, keepdims=True) - x_kashin)**2 + + for step, vary_parameter in enumerate(vary_space): + coding_cost = vary_parameter + print("coding cost = %d" % coding_cost) + + if config.run_modified_miracle: + x_modified_miracle = np.zeros((d, n)) + c1, c2, m, gamma = ( + get_parameters.get_parameters_unbiased_modified_miracle( + alpha * epsilon_target, d, 2**coding_cost, budget)) + for i in range(n): + _, _, pi = miracle.encoder(i + itr * n, x[:, i], 2**coding_cost, c1, + c2, gamma) + pi_all = modify_pi.modify_pi(pi, eta, epsilon_target, + c1 / (np.exp(epsilon_target / 2))) + k = np.random.choice(2**coding_cost, 1, p=pi_all[-1])[0] + z_k = miracle.decoder(i + itr * n, k, d, 2**coding_cost) + x_modified_miracle[:, i] = z_k / m + x_modified_miracle = np.mean(x_modified_miracle, axis=1, keepdims=True) + modified_miracle_mse[itr, step] = np.linalg.norm( + np.mean(x, axis=1, keepdims=True) - x_modified_miracle)**2 + + print(time.time() - start_time) + + print("--------------") + if config.run_modified_miracle: + print("approx miracle mse:") + print(np.mean(modified_miracle_mse, axis=0)) + if config.run_privunit: + print("privunit mse:") + print(np.mean(privunit_mse, axis=0)) + if config.run_sqkr: + print("sqkr mse:") + print(np.mean(sqkr_mse, axis=0)) + + plt.figure(figsize=((8, 5)), dpi=80) + plt.axes((.15, .2, .83, .75)) + if config.run_modified_miracle: + plt.errorbar( + vary_space, + np.mean(modified_miracle_mse, axis=0), + yerr=np.std(modified_miracle_mse, axis=0)/np.sqrt(config.num_itr), + linewidth = 3.0, + label="MMRC") + if config.run_privunit: + plt.errorbar(vary_space, + [np.mean(privunit_mse, axis=0)[0]]*len(vary_space), + yerr 
= [np.std(privunit_mse, axis=0)[0]/np.sqrt(config.num_itr)]*len(vary_space), + ls='--', + linewidth = 3.0, + label="PrivUnit$_{2}$") + if config.run_sqkr: + plt.errorbar(vary_space, + [np.mean(sqkr_mse, axis=0)[0]]*len(vary_space), + yerr = [np.std(sqkr_mse, axis=0)[0]/np.sqrt(config.num_itr)]*len(vary_space), + ls='--', + linewidth = 3.0, + label="SQKR") + plt.xticks(fontsize=28) + plt.yticks(fontsize=28) + plt.ylabel("$\ell_{2}$ error", fontsize=28) + plt.xlabel(r"$\#$ bits", fontsize=28) + plt.yticks([0.04,0.05,0.06,0.07,0.08]) + plt.legend(fontsize=24, loc='center right') + + with file_open(work_path + "/rcc_dp_mse_vs_coding_cost.png", "wb") as f: + plt.savefig(f, format="png") + + with file_open(work_path + "/time.txt", "w") as f: + np.savetxt(f, np.array(time.time() - start_time).reshape(-1, 1)) + + if config.run_modified_miracle: + with file_open(work_path + "/modified_miracle_mse.csv", "w") as f: + np.savetxt(f, modified_miracle_mse, delimiter=",") + + if config.run_privunit: + with file_open(work_path + "/privunit_mse.csv", "w") as f: + np.savetxt(f, privunit_mse, delimiter=",") + + if config.run_sqkr: + with file_open(work_path + "/sqkr_mse.csv", "w") as f: + np.savetxt(f, sqkr_mse, delimiter=",") diff --git a/rcc_dp/experiment_test.py b/rcc_dp/mean_estimation/experiment_test.py similarity index 67% rename from rcc_dp/experiment_test.py rename to rcc_dp/mean_estimation/experiment_test.py index a03bc032..ac71d55d 100644 --- a/rcc_dp/experiment_test.py +++ b/rcc_dp/mean_estimation/experiment_test.py @@ -14,8 +14,9 @@ """Tests for experiment definitions.""" from absl.testing import absltest -from rcc_dp import config as defaults -from rcc_dp import experiment +from rcc_dp.mean_estimation import config as defaults +from rcc_dp.mean_estimation import experiment +from rcc_dp.mean_estimation import experiment_coding_cost class ExperimentTest(absltest.TestCase): @@ -24,12 +25,17 @@ def test_evaluate_does_not_fail(self): work_path = self.create_tempdir().full_path 
config = defaults.get_config() config.num_itr = 1 - config.eps_space = [1] - config.t = 2 config.d = 100 config.n = 200 - experiment.evaluate(work_path, config) + if config.vary == "cc": + config.epsilon_target = 1 + config.cc_space = [4] + experiment_coding_cost.evaluate(work_path, config) + else: + config.eps_space = [1] + config.t = 2 + experiment.evaluate(work_path, config) if __name__ == "__main__": - absltest.main() + absltest.main() \ No newline at end of file diff --git a/rcc_dp/get_parameters.py b/rcc_dp/mean_estimation/get_parameters.py similarity index 97% rename from rcc_dp/get_parameters.py rename to rcc_dp/mean_estimation/get_parameters.py index 4ddc5c67..bba1d220 100644 --- a/rcc_dp/get_parameters.py +++ b/rcc_dp/mean_estimation/get_parameters.py @@ -14,8 +14,8 @@ """Code to obtain parameters of various miracle methods.""" import numpy as np -from rcc_dp import optimize_unbias -from rcc_dp import privunit +from rcc_dp.mean_estimation import optimize_unbias +from rcc_dp.mean_estimation import privunit def get_parameters_unbiased_approx_miracle(epsilon_target, d, diff --git a/rcc_dp/get_parameters_test.py b/rcc_dp/mean_estimation/get_parameters_test.py similarity index 96% rename from rcc_dp/get_parameters_test.py rename to rcc_dp/mean_estimation/get_parameters_test.py index 00c2c4ec..b5e0db8b 100644 --- a/rcc_dp/get_parameters_test.py +++ b/rcc_dp/mean_estimation/get_parameters_test.py @@ -15,9 +15,9 @@ from absl.testing import absltest import numpy as np -from rcc_dp import get_parameters -from rcc_dp import miracle -from rcc_dp import modify_pi +from rcc_dp.mean_estimation import get_parameters +from rcc_dp.mean_estimation import miracle +from rcc_dp.mean_estimation import modify_pi class GetParametersTest(absltest.TestCase): diff --git a/rcc_dp/miracle.py b/rcc_dp/mean_estimation/miracle.py similarity index 100% rename from rcc_dp/miracle.py rename to rcc_dp/mean_estimation/miracle.py diff --git a/rcc_dp/miracle_test.py 
b/rcc_dp/mean_estimation/miracle_test.py similarity index 94% rename from rcc_dp/miracle_test.py rename to rcc_dp/mean_estimation/miracle_test.py index 9f0f65a8..c2710bf0 100644 --- a/rcc_dp/miracle_test.py +++ b/rcc_dp/mean_estimation/miracle_test.py @@ -15,8 +15,8 @@ from absl.testing import absltest import numpy as np -from rcc_dp import get_parameters -from rcc_dp import miracle +from rcc_dp.mean_estimation import get_parameters +from rcc_dp.mean_estimation import miracle class MiracleTest(absltest.TestCase): diff --git a/rcc_dp/modify_pi.py b/rcc_dp/mean_estimation/modify_pi.py similarity index 100% rename from rcc_dp/modify_pi.py rename to rcc_dp/mean_estimation/modify_pi.py diff --git a/rcc_dp/modify_pi_test.py b/rcc_dp/mean_estimation/modify_pi_test.py similarity index 96% rename from rcc_dp/modify_pi_test.py rename to rcc_dp/mean_estimation/modify_pi_test.py index 75e39bcc..963c71e5 100644 --- a/rcc_dp/modify_pi_test.py +++ b/rcc_dp/mean_estimation/modify_pi_test.py @@ -15,9 +15,9 @@ from absl.testing import absltest import numpy as np -from rcc_dp import get_parameters -from rcc_dp import miracle -from rcc_dp import modify_pi +from rcc_dp.mean_estimation import get_parameters +from rcc_dp.mean_estimation import miracle +from rcc_dp.mean_estimation import modify_pi class ModifyPiTest(absltest.TestCase): diff --git a/rcc_dp/optimize_unbias.py b/rcc_dp/mean_estimation/optimize_unbias.py similarity index 99% rename from rcc_dp/optimize_unbias.py rename to rcc_dp/mean_estimation/optimize_unbias.py index 9cf0fa41..c70625d8 100644 --- a/rcc_dp/optimize_unbias.py +++ b/rcc_dp/mean_estimation/optimize_unbias.py @@ -28,7 +28,7 @@ import numpy as np from scipy import stats -from rcc_dp import privunit +from rcc_dp.mean_estimation import privunit def get_unbiased_p_hat(number_candidates, c1, c2, p): diff --git a/rcc_dp/privunit.py b/rcc_dp/mean_estimation/privunit.py similarity index 100% rename from rcc_dp/privunit.py rename to rcc_dp/mean_estimation/privunit.py 
diff --git a/rcc_dp/privunit_test.py b/rcc_dp/mean_estimation/privunit_test.py similarity index 98% rename from rcc_dp/privunit_test.py rename to rcc_dp/mean_estimation/privunit_test.py index 5e5296de..821d2604 100644 --- a/rcc_dp/privunit_test.py +++ b/rcc_dp/mean_estimation/privunit_test.py @@ -15,7 +15,7 @@ from absl.testing import absltest import numpy as np -from rcc_dp import privunit +from rcc_dp.mean_estimation import privunit class PrivunitTest(absltest.TestCase): diff --git a/rcc_dp/sqkr.py b/rcc_dp/mean_estimation/sqkr.py similarity index 100% rename from rcc_dp/sqkr.py rename to rcc_dp/mean_estimation/sqkr.py diff --git a/rcc_dp/sqkr_test.py b/rcc_dp/mean_estimation/sqkr_test.py similarity index 98% rename from rcc_dp/sqkr_test.py rename to rcc_dp/mean_estimation/sqkr_test.py index e806647d..8545eae6 100644 --- a/rcc_dp/sqkr_test.py +++ b/rcc_dp/mean_estimation/sqkr_test.py @@ -17,7 +17,7 @@ from absl.testing import parameterized import numpy as np from scipy import stats -from rcc_dp import sqkr +from rcc_dp.mean_estimation import sqkr class SqkrTest(parameterized.TestCase):