From 682023a25a1ffc5e13491a09f541cf85a3f1b36b Mon Sep 17 00:00:00 2001 From: Abhin Shah Date: Tue, 22 Feb 2022 16:34:33 -0500 Subject: [PATCH 1/4] Adding code for frequency estimation and updating code for mean estimation --- rcc_dp/frequency_estimation/config.py | 58 ++++ rcc_dp/frequency_estimation/experiment.py | 302 ++++++++++++++++++ .../experiment_coding_cost.py | 198 ++++++++++++ .../frequency_estimation/experiment_test.py | 41 +++ rcc_dp/frequency_estimation/miracle.py | 177 ++++++++++ rcc_dp/frequency_estimation/modify_pi.py | 151 +++++++++ rcc_dp/frequency_estimation/modify_pi_test.py | 82 +++++ rcc_dp/frequency_estimation/rhr.py | 141 ++++++++ rcc_dp/frequency_estimation/ss.py | 82 +++++ rcc_dp/frequency_estimation/unbias.py | 112 +++++++ rcc_dp/{ => mean_estimation}/BUILD | 0 rcc_dp/{ => mean_estimation}/config.py | 11 +- rcc_dp/{ => mean_estimation}/experiment.py | 67 ++-- .../mean_estimation/experiment_coding_cost.py | 183 +++++++++++ .../{ => mean_estimation}/experiment_test.py | 18 +- .../{ => mean_estimation}/get_parameters.py | 4 +- .../get_parameters_test.py | 6 +- rcc_dp/{ => mean_estimation}/miracle.py | 0 rcc_dp/{ => mean_estimation}/miracle_test.py | 4 +- rcc_dp/{ => mean_estimation}/modify_pi.py | 0 .../{ => mean_estimation}/modify_pi_test.py | 6 +- .../{ => mean_estimation}/optimize_unbias.py | 2 +- rcc_dp/{ => mean_estimation}/privunit.py | 0 rcc_dp/{ => mean_estimation}/privunit_test.py | 2 +- rcc_dp/{ => mean_estimation}/sqkr.py | 0 rcc_dp/{ => mean_estimation}/sqkr_test.py | 2 +- 26 files changed, 1600 insertions(+), 49 deletions(-) create mode 100644 rcc_dp/frequency_estimation/config.py create mode 100644 rcc_dp/frequency_estimation/experiment.py create mode 100644 rcc_dp/frequency_estimation/experiment_coding_cost.py create mode 100644 rcc_dp/frequency_estimation/experiment_test.py create mode 100644 rcc_dp/frequency_estimation/miracle.py create mode 100644 rcc_dp/frequency_estimation/modify_pi.py create mode 100644 
rcc_dp/frequency_estimation/modify_pi_test.py create mode 100644 rcc_dp/frequency_estimation/rhr.py create mode 100644 rcc_dp/frequency_estimation/ss.py create mode 100644 rcc_dp/frequency_estimation/unbias.py rename rcc_dp/{ => mean_estimation}/BUILD (100%) rename rcc_dp/{ => mean_estimation}/config.py (88%) rename rcc_dp/{ => mean_estimation}/experiment.py (86%) create mode 100644 rcc_dp/mean_estimation/experiment_coding_cost.py rename rcc_dp/{ => mean_estimation}/experiment_test.py (67%) rename rcc_dp/{ => mean_estimation}/get_parameters.py (97%) rename rcc_dp/{ => mean_estimation}/get_parameters_test.py (96%) rename rcc_dp/{ => mean_estimation}/miracle.py (100%) rename rcc_dp/{ => mean_estimation}/miracle_test.py (94%) rename rcc_dp/{ => mean_estimation}/modify_pi.py (100%) rename rcc_dp/{ => mean_estimation}/modify_pi_test.py (96%) rename rcc_dp/{ => mean_estimation}/optimize_unbias.py (99%) rename rcc_dp/{ => mean_estimation}/privunit.py (100%) rename rcc_dp/{ => mean_estimation}/privunit_test.py (98%) rename rcc_dp/{ => mean_estimation}/sqkr.py (100%) rename rcc_dp/{ => mean_estimation}/sqkr_test.py (98%) diff --git a/rcc_dp/frequency_estimation/config.py b/rcc_dp/frequency_estimation/config.py new file mode 100644 index 00000000..e6b89dd7 --- /dev/null +++ b/rcc_dp/frequency_estimation/config.py @@ -0,0 +1,58 @@ +# Copyright 2021, Google LLC. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Base configuration.""" + +from ml_collections.config_dict import config_dict + + +def get_config(): + """Returns config dictionary for model.""" + config = dict( + name="defaults", + # Either use geometric, zipf, or uniform i.e., data variable + # can take one of "geometric", "zipf", "uniform". + distribution="zipf", + lbd_geometric=0.8, + degree_zipf=1.0, + # Flags to indicate which methods to compare. + run_approx_miracle=False, + run_miracle=False, + run_modified_miracle=True, + run_ss=True, + run_rhr=True, + encoding_type="fast", # Can take either fast or normal + # Common parameters. + num_itr=1, + coding_cost=14, + coding_cost_multiplier=1, + approx_coding_cost_multiplier=3, + approx_t=6, + # Specific parameters (leave them as they are for now). + delta=10**(-6), + alpha=1.0, + # Variation. + vary="eps", # Can take one of "cc", "k", "n", "eps". + cc_space=[6, 8, 10, 12], + k_space=[200, 400, 600, 800, 1000], + n_space=[2000, 4000, 6000, 8000, 10000], + eps_space=list(range(1, 9)), + # Defaults. + n=5000, + k=500, + t=3, + epsilon_target=6, + ) + config = config_dict.ConfigDict(config) + config.lock() # Prevent addition of new fields. + return config \ No newline at end of file diff --git a/rcc_dp/frequency_estimation/experiment.py b/rcc_dp/frequency_estimation/experiment.py new file mode 100644 index 00000000..3ab43d91 --- /dev/null +++ b/rcc_dp/frequency_estimation/experiment.py @@ -0,0 +1,302 @@ +# Copyright 2021, Google LLC. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +"""Experiment definitions (i.e., evaluation of miracle, rhr, subset selection +methods when either the data dimension k, the number of users n, or the +privacy parameter epsilon is varied).""" + +import json +import math +import time +import matplotlib +matplotlib.rcParams['ps.useafm'] = True +matplotlib.rcParams['pdf.use14corefonts'] = True +matplotlib.rcParams['text.usetex'] = True +import matplotlib.pyplot as plt +import matplotlib.lines as mlines +import numpy as np +from rcc_dp.frequency_estimation import miracle +from rcc_dp.frequency_estimation import modify_pi +from rcc_dp.frequency_estimation import rhr +from rcc_dp.frequency_estimation import ss +from rcc_dp.frequency_estimation import unbias + + +def generate_geometric_distribution(k,lbd): + elements = range(0,k) + prob = [(1-lbd)*math.pow(lbd,x)/(1-math.pow(lbd,k)) for x in elements] + return prob + + +def generate_uniform_distribution(k): + raw_distribution = [1] * k + sum_raw = sum(raw_distribution) + prob = [float(y)/float(sum_raw) for y in raw_distribution] + return prob + + +def generate_zipf_distribution(k,degree): + raw_distribution = [1/(float(i)**(degree)) for i in range(1,k+1)] + sum_raw = sum(raw_distribution) + prob = [float(y)/float(sum_raw) for y in raw_distribution] + return prob + + +def evaluate(work_path, config, file_open=open): + """Evaluates miracle, rhr, ss methods.""" + with file_open(work_path + "/config.json", "w") as f: + json.dump(config.to_dict(), f) + + start_time = time.time() + + delta = config.delta + alpha = config.alpha + # Get default values. 
+ k = config.k + n = config.n + epsilon_target = config.epsilon_target + + if config.vary == "k": + vary_space = config.k_space + print("k space = " + str(vary_space)) + elif config.vary == "n": + vary_space = config.n_space + print("n space = " + str(vary_space)) + elif config.vary == "eps": + vary_space = config.eps_space + print("eps space = " + str(vary_space)) + else: + raise ValueError("vary should be either be k, n or eps.") + + approx_miracle_error = np.zeros((config.num_itr, len(vary_space))) + miracle_error = np.zeros((config.num_itr, len(vary_space))) + modified_miracle_error = np.zeros((config.num_itr, len(vary_space))) + rhr_error = np.zeros((config.num_itr, len(vary_space))) + ss_error = np.zeros((config.num_itr, len(vary_space))) + + for step, vary_parameter in enumerate(vary_space): + if config.vary == "k": + k = vary_parameter + coding_cost = config.coding_cost + elif config.vary == "n": + n = vary_parameter + coding_cost = config.coding_cost + elif config.vary == "eps": + epsilon_target = vary_parameter + coding_cost = config.coding_cost_multiplier * epsilon_target / np.log(2) + coding_cost += config.t + coding_cost = max(int(np.ceil(coding_cost)), 8) + print("epsilon target = " + str(epsilon_target)) + print("n = " + str(n)) + print("k = %d" % k) + print("coding cost = %d" % coding_cost) + + if config.run_modified_miracle: + eta = epsilon_target / 2.0 + print("eta = " + str(eta)) + print("alpha = " + str(alpha)) + + for itr in range(config.num_itr): + print("itr = %d" % itr) + if config.distribution == "geometric": + lbd = config.lbd_geometric + prob = generate_geometric_distribution(k,lbd) + elif config.distribution == "zipf": + degree = config.degree_zipf + prob = generate_zipf_distribution(k,degree) + elif config.distribution == "uniform": + prob = generate_uniform_distribution(k) + else: + raise ValueError( + "distribution should be either be geometric, zipf, uniform.") + x = np.random.choice(k, n, p=prob) + + if config.run_miracle: + 
x_miracle = np.zeros((k,n)) + for i in range(n): + if config.encoding_type == "fast": + x_miracle[:,i] = miracle.encode_decode_miracle_fast(i+itr*n, x[i], + k, epsilon_target/2, 2**coding_cost) + else: + _, _, index = miracle.encoder(i+itr*n, x[i], k, epsilon_target/2, + 2**coding_cost) + x_miracle[:,i] = miracle.decoder(i+itr*n, index, k, epsilon_target/2, + 2**coding_cost) + prob_miracle = unbias.unbias_miracle(k, epsilon_target/2, 2**coding_cost, + x_miracle.T, n, normalization = 1) + miracle_error[itr, step] = np.linalg.norm([p_i - phat_i for p_i, phat_i + in zip(prob, prob_miracle)], ord=1) + + if config.run_modified_miracle: + x_modified_miracle = np.zeros((k, n)) + for i in range(n): + if config.encoding_type == "fast": + x_modified_miracle[:,i] = miracle.encode_decode_modified_miracle_fast( + i+itr*n, x[i], k, alpha*epsilon_target, 2**coding_cost) + else: + _, pi, _ = miracle.encoder(i+itr*n, x[i], k, alpha*epsilon_target, + 2**coding_cost) + expected_beta = np.ceil(k/(np.exp(epsilon_target)+1))/k + pi_all = modify_pi.modify_pi(pi, eta, epsilon_target, + (np.exp(epsilon_target/2))/(1+expected_beta*(np.exp(epsilon_target)-1))) + index = np.random.choice(2**coding_cost, 1, p=pi_all[-1])[0] + x_modified_miracle[:,i] = miracle.decoder(i+itr*n, index, k, + alpha*epsilon_target, 2**coding_cost) + prob_modified_miracle = unbias.unbias_modified_miracle(k, alpha*epsilon_target, + 2**coding_cost, x_modified_miracle.T, n, normalization = 1) + modified_miracle_error[itr, step] = np.linalg.norm([p_i - phat_i for p_i, phat_i + in zip(prob, prob_modified_miracle)], ord=1) + + if config.run_approx_miracle: + x_approx_miracle = np.zeros((k,n)) + approx_coding_cost = int(np.ceil( + config.approx_coding_cost_multiplier*epsilon_target/np.log(2) + config.approx_t)) + epsilon_approx = miracle.get_approx_epsilon(epsilon_target, k, + 2**approx_coding_cost, delta) + for i in range(n): + if config.encoding_type == "fast": + x_approx_miracle[:,i] = 
miracle.encode_decode_miracle_fast(i+itr*n, x[i], + k, epsilon_approx, 2**coding_cost) + else: + _, _, index = miracle.encoder(i+itr*n, x[i], k, epsilon_approx, 2**coding_cost) + x_approx_miracle[:,i] = miracle.decoder(i+itr*n, index, k, epsilon_approx, + 2**coding_cost) + prob_approx_miracle = unbias.unbias_miracle(k, epsilon_approx, 2**coding_cost, + x_approx_miracle.T, n, normalization = 1) + approx_miracle_error[itr, step] = np.linalg.norm([p_i - phat_i for p_i, phat_i + in zip(prob, prob_approx_miracle)], ord=1) + + if config.run_ss: + x_ss = ss.encode_string_fast(k, epsilon_target, x) + prob_ss = ss.decode_string(k, epsilon_target, x_ss, n, normalization = 1) + ss_error[itr, step] = np.linalg.norm([p_i - phat_i for p_i, phat_i + in zip(prob, prob_ss)], ord=1) + + if config.run_rhr: + x_rhr = rhr.encode_string(k, epsilon_target, coding_cost, x) + prob_rhr = rhr.decode_string_fast(k, epsilon_target, coding_cost, x_rhr, + normalization = 1) # estimate the original underlying distribution + rhr_error[itr, step] = np.linalg.norm([p_i - phat_i for p_i, phat_i + in zip(prob, prob_rhr)], ord=1) + + if config.run_approx_miracle: + print("approx miracle error: " + + str(approx_miracle_error[:, step])) + if config.run_miracle: + print("miracle error: " + str(miracle_error[:, step])) + if config.run_modified_miracle: + print("modified miracle error: " + + str(modified_miracle_error[:, step])) + if config.run_ss: + print("ss error: " + str(ss_error[:, step])) + if config.run_rhr: + print("rhr error: " + str(rhr_error[:, step])) + print(time.time() - start_time) + + print("--------------") + if config.run_approx_miracle: + print("approx miracle error:") + print(np.mean(approx_miracle_error, axis=0)) + if config.run_miracle: + print("miracle error:") + print(np.mean(miracle_error, axis=0)) + if config.run_modified_miracle: + print("modified miracle error:") + print(np.mean(modified_miracle_error, axis=0)) + if config.run_ss: + print("ss error:") + print(np.mean(ss_error, 
axis=0)) + if config.run_rhr: + print("rhr error:") + print(np.mean(rhr_error, axis=0)) + + plt.figure(figsize=((8, 5)), dpi=80) + plt.axes((.15, .2, .83, .75)) + if config.run_approx_miracle: + plt.errorbar( + vary_space, + np.mean(approx_miracle_error, axis=0), + yerr=np.std(approx_miracle_error, axis=0)/np.sqrt(config.num_itr), + linewidth = 3.0, + label="Approx MRC") + if config.run_miracle: + plt.errorbar( + vary_space, + np.mean(miracle_error, axis=0), + yerr=np.std(miracle_error, axis=0)/np.sqrt(config.num_itr), + linewidth = 3.0, + label="MRC") + if config.run_modified_miracle: + plt.errorbar( + vary_space, + np.mean(modified_miracle_error, axis=0), + yerr=np.std(modified_miracle_error, axis=0)/np.sqrt(config.num_itr), + linewidth = 3.0, + label="MMRC") + if config.run_ss: + plt.errorbar( + vary_space, + np.mean(ss_error, axis=0), + yerr=np.std(ss_error, axis=0)/np.sqrt(config.num_itr), + ls='--', + linewidth = 3.0, + label="Subset Selection") + if config.run_rhr: + plt.errorbar( + vary_space, + np.mean(rhr_error, axis=0), + yerr=np.std(rhr_error, axis=0)/np.sqrt(config.num_itr), + ls='--', + linewidth = 3.0, + label="RHR") + plt.xticks(fontsize=28) + plt.yticks(fontsize=28) + plt.ylabel("$\ell_{2}$ error", fontsize=28) + if config.vary == "k": + plt.xlabel(r"$d$", fontsize=28) + plt.xticks([200,400,600,800,1000]) + plt.yticks([0.2,0.4,0.6,0.8]) + plt.legend(fontsize=24, loc='upper left') + elif config.vary == "n": + plt.xlabel(r"$n$", fontsize=28) + plt.ticklabel_format(style='sci', axis='x', scilimits=(3,3)) + plt.gca().xaxis.get_offset_text().set_fontsize(16) + plt.yticks([0.3,0.4,0.5,0.6,0.7]) + plt.legend(fontsize=24, loc='upper right') + elif config.vary == "eps": + plt.xlabel(r"$\varepsilon$", fontsize=28) + plt.yticks([0.3,0.6,0.9,1.2,1.5]) + plt.legend(fontsize=24, loc='upper right') + + with file_open(work_path + "/rcc_dp_ss_comparison.png", "wb") as f: + plt.savefig(f, format="png") + + if config.run_approx_miracle: + with file_open(work_path + "/approx_miracle_error.csv", "w") as f: + 
np.savetxt(f, approx_miracle_error, delimiter=",") + + if config.run_miracle: + with file_open(work_path + "/miracle_error.csv", "w") as f: + np.savetxt(f, miracle_error, delimiter=",") + + if config.run_modified_miracle: + with file_open(work_path + "/modified_miracle_error.csv", "w") as f: + np.savetxt(f, modified_miracle_error, delimiter=",") + + if config.run_ss: + with file_open(work_path + "/ss_error.csv", "w") as f: + np.savetxt(f, ss_error, delimiter=",") + + if config.run_rhr: + with file_open(work_path + "/rhr_error.csv", "w") as f: + np.savetxt(f, rhr_error, delimiter=",") \ No newline at end of file diff --git a/rcc_dp/frequency_estimation/experiment_coding_cost.py b/rcc_dp/frequency_estimation/experiment_coding_cost.py new file mode 100644 index 00000000..41804c56 --- /dev/null +++ b/rcc_dp/frequency_estimation/experiment_coding_cost.py @@ -0,0 +1,198 @@ +# Copyright 2021, Google LLC. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Experiment definitions (i.e., evaluation of miracle, rhr, subset selection +methods when the coding cost is varied).""" + +import json +import math +import time +import matplotlib +matplotlib.rcParams['ps.useafm'] = True +matplotlib.rcParams['pdf.use14corefonts'] = True +matplotlib.rcParams['text.usetex'] = True +import matplotlib.pyplot as plt +import matplotlib.lines as mlines +import numpy as np +from rcc_dp.frequency_estimation import miracle +from rcc_dp.frequency_estimation import modify_pi +from rcc_dp.frequency_estimation import rhr +from rcc_dp.frequency_estimation import ss +from rcc_dp.frequency_estimation import unbias + + +def generate_geometric_distribution(k,lbd): + elements = range(0,k) + prob = [(1-lbd)*math.pow(lbd,x)/(1-math.pow(lbd,k)) for x in elements] + return prob + + +def generate_uniform_distribution(k): + raw_distribution = [1] * k + sum_raw = sum(raw_distribution) + prob = [float(y)/float(sum_raw) for y in raw_distribution] + return prob + + +def generate_zipf_distribution(k,degree): + raw_distribution = [1/(float(i)**(degree)) for i in range(1,k+1)] + sum_raw = sum(raw_distribution) + prob = [float(y)/float(sum_raw) for y in raw_distribution] + return prob + + +def evaluate(work_path, config, file_open=open): + """Evaluates miracle, rhr, ss methods.""" + with file_open(work_path + "/config.json", "w") as f: + json.dump(config.to_dict(), f) + + start_time = time.time() + + delta = config.delta + alpha = config.alpha + # Get default values. 
+ k = config.k + n = config.n + epsilon_target = config.epsilon_target + + vary_space = config.cc_space + + print("coding space = " + str(vary_space)) + + modified_miracle_error = np.zeros((config.num_itr, len(vary_space))) + rhr_error = np.zeros((config.num_itr, 1)) + ss_error = np.zeros((config.num_itr, 1)) + + rhr_coding_cost = epsilon_target + + for itr in range(config.num_itr): + print("itr = %d" % itr) + print("epsilon target = " + str(epsilon_target)) + print("n = " + str(n)) + print("k = %d" % k) + + if config.run_modified_miracle: + eta = epsilon_target / 2.0 + print("eta = " + str(eta)) + print("alpha = " + str(alpha)) + + if config.distribution == "geometric": + lbd = config.lbd_geometric + prob = generate_geometric_distribution(k,lbd) + elif config.distribution == "zipf": + degree = config.degree_zipf + prob = generate_zipf_distribution(k,degree) + elif config.distribution == "uniform": + prob = generate_uniform_distribution(k) + else: + raise ValueError( + "distribution should be either be geometric, zipf, uniform.") + x = np.random.choice(k, n, p=prob) + + + if config.run_ss: + x_ss = ss.encode_string_fast(k, epsilon_target, x) + prob_ss = ss.decode_string(k, epsilon_target, x_ss, n, normalization = 1) + ss_error[itr, 0] = np.linalg.norm([p_i - phat_i for p_i, phat_i + in zip(prob, prob_ss)], ord=1) + + if config.run_rhr: + x_rhr = rhr.encode_string(k, epsilon_target, rhr_coding_cost, x) + prob_rhr = rhr.decode_string_fast(k, epsilon_target, rhr_coding_cost, x_rhr, + normalization = 1) # estimate the original underlying distribution + rhr_error[itr, 0] = np.linalg.norm([p_i - phat_i for p_i, phat_i + in zip(prob, prob_rhr)], ord=1) + + for step, vary_parameter in enumerate(vary_space): + coding_cost = vary_parameter + print("coding cost = %d" % coding_cost) + + if config.run_modified_miracle: + x_modified_miracle = np.zeros((k, n)) + for i in range(n): + if config.encoding_type == "fast": + x_modified_miracle[:,i] = miracle.encode_decode_modified_miracle_fast( + i+itr*n, 
x[i], k, alpha*epsilon_target, 2**coding_cost) + else: + _, pi, _ = miracle.encoder(i+itr*n, x[i], k, alpha*epsilon_target, + 2**coding_cost) + expected_beta = np.ceil(k/(np.exp(epsilon_target)+1))/k + pi_all = modify_pi.modify_pi(pi, eta, epsilon_target, + (np.exp(epsilon_target/2))/(1+expected_beta*(np.exp(epsilon_target)-1))) + index = np.random.choice(2**coding_cost, 1, p=pi_all[-1])[0] + x_modified_miracle[:,i] = miracle.decoder(i+itr*n, index, k, + alpha*epsilon_target, 2**coding_cost) + prob_modified_miracle = unbias.unbias_modified_miracle(k, alpha*epsilon_target, + 2**coding_cost, x_modified_miracle.T, n, normalization = 1) + modified_miracle_error[itr, step] = np.linalg.norm([p_i - phat_i for p_i, phat_i + in zip(prob, prob_modified_miracle)], ord=1) + + print(time.time() - start_time) + + print("--------------") + if config.run_modified_miracle: + print("modified miracle error:") + print(np.mean(modified_miracle_error, axis=0)) + if config.run_ss: + print("ss error:") + print(np.mean(ss_error, axis=0)) + if config.run_rhr: + print("rhr error:") + print(np.mean(rhr_error, axis=0)) + + plt.figure(figsize=(10, 8), dpi=80) + if config.run_modified_miracle: + plt.errorbar( + vary_space, + np.mean(modified_miracle_error, axis=0), + yerr=np.std(modified_miracle_error, axis=0), + linewidth = 3.0, + label="MMRC") + if config.run_ss: + line1 = plt.errorbar(vary_space, + [np.mean(ss_error, axis=0)[0]]*len(vary_space), + yerr = [np.std(ss_error, axis=0)[0]/np.sqrt(config.num_itr)]*len(vary_space), + ls='--', + linewidth = 3.0, + label="Subset Selection") + if config.run_rhr: + line2 = plt.errorbar(vary_space, + [np.mean(rhr_error, axis=0)[0]]*len(vary_space), + yerr = [np.std(rhr_error, axis=0)[0]/np.sqrt(config.num_itr)]*len(vary_space), + ls='--', + linewidth = 3.0, + label="RHR") + plt.xticks(fontsize=28) + plt.yticks(fontsize=28) + plt.ylabel("$\ell_{2}$ error", fontsize=28) + plt.xlabel(r"$\#$ bits", fontsize=28) + plt.yticks([0.4,0.5,0.6,0.7,0.8,0.9]) + 
plt.legend(fontsize=24, loc='upper right') + + with file_open(work_path + "/rcc_dp_ss_mse_vs_coding_cost.png", "wb") as f: + plt.savefig(f, format="png") + + with file_open(work_path + "/time.txt", "w") as f: + np.savetxt(f, np.array(time.time() - start_time).reshape(-1, 1)) + + if config.run_modified_miracle: + with file_open(work_path + "/modified_miracle_error.csv", "w") as f: + np.savetxt(f, modified_miracle_error, delimiter=",") + + if config.run_ss: + with file_open(work_path + "/ss_error.csv", "w") as f: + np.savetxt(f, ss_error, delimiter=",") + + if config.run_rhr: + with file_open(work_path + "/rhr_error.csv", "w") as f: + np.savetxt(f, rhr_error, delimiter=",") \ No newline at end of file diff --git a/rcc_dp/frequency_estimation/experiment_test.py b/rcc_dp/frequency_estimation/experiment_test.py new file mode 100644 index 00000000..af731c32 --- /dev/null +++ b/rcc_dp/frequency_estimation/experiment_test.py @@ -0,0 +1,41 @@ +# Copyright 2021, Google LLC. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Tests for experiment definitions.""" + +from absl.testing import absltest +from rcc_dp.frequency_estimation import config as defaults +from rcc_dp.frequency_estimation import experiment +from rcc_dp.frequency_estimation import experiment_coding_cost + + +class ExperimentTest(absltest.TestCase): + + def test_evaluate_does_not_fail(self): + work_path = self.create_tempdir().full_path + config = defaults.get_config() + config.num_itr = 1 + config.k = 100 + config.n = 200 + if config.vary == "cc": + config.epsilon_target = 1 + config.cc_space = [4] + experiment_coding_cost.evaluate(work_path, config) + else: + config.eps_space = [1] + config.t = 2 + experiment.evaluate(work_path, config) + + +if __name__ == "__main__": + absltest.main() \ No newline at end of file diff --git a/rcc_dp/frequency_estimation/miracle.py b/rcc_dp/frequency_estimation/miracle.py new file mode 100644 index 00000000..5f748b54 --- /dev/null +++ b/rcc_dp/frequency_estimation/miracle.py @@ -0,0 +1,177 @@ +# Copyright 2021, Google LLC. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Minimal random coding (likelihood decoder) definitions. + +This is the code to be used to simulate the minimum random coding algorithm +(tailored to subset selection). The algorithm was introduced by Havasi et al. +in "Minimal Random Code Learning: Getting Bits Back from Compressed Model +Parameters" - https://arxiv.org/pdf/1810.00440.pdf. 
+ +For brevity, we may refer to it as 'MIRACLE', although technically this refers +to the complete model compression pipeline of Havasi et al. +""" + +import numpy as np +import scipy.special as sc + +def encoder(seed, x, k, epsilon, number_candidates): + """This is the encoder used by the miracle algorithm. + + Args: + seed: The random seed to be used by the encoder. + x: The user input data. + k: The dimension of the one-hot encoded x. + epsilon: The privacy parameter epsilon. + number_candidates: The number of candidates to be sampled. + + Returns: + z: The set of candidates sampled at the encoder. + index: The index sampled by the encoder. + pi: The distribution over the candidates for the given input data x. + """ + d = int(np.ceil(k/(np.exp(epsilon)+1))) + + c1 = np.exp(epsilon)/(sc.comb(k-1, d-1)* np.exp(epsilon) + sc.comb(k-1, d)) + c2 = 1/(sc.comb(k-1, d-1)* np.exp(epsilon) + sc.comb(k-1, d)) + + # Proposal distribution is chosen to be uniform on all d-hot encoded vectors. + z = np.zeros((k, number_candidates)) + z[:d, :] = 1 + rs = np.random.RandomState(seed) + for i in range(number_candidates): + rs.shuffle(z[:, i]) + + pi = np.where(z[x, :] == 1, c1, c2) + pi /= np.sum(pi) + index = np.random.choice(number_candidates, 1, p=pi)[0] + return z, pi, index + + +def decoder(seed, index, k, epsilon, number_candidates): + """This is the decoder used by the miracle algorithm. + + Args: + seed: The random seed to be used by the decoder (this seed should be the + same as the one used by the encoder). + index: The index transmitted by the encoder. + k: The dimension of the data. + epsilon: The privacy parameter epsilon. + number_candidates: The number of candidates to be sampled. + + Returns: + z_k: The candidate corresponding to the index (This is the candidate that + is distributed according to the conditional distribution of privunit). + """ + d = int(np.ceil(k/(np.exp(epsilon)+1))) + + # Proposal distribution should be the same as the one used by the encoder. 
+ z = np.zeros((k, number_candidates)) + z[:d, :] = 1 + rs = np.random.RandomState(seed) + for i in range(number_candidates): + rs.shuffle(z[:, i]) + z_k = z[:, index] + return z_k + + +def get_approx_epsilon(epsilon_target, k, number_candidates, delta): + """Get the effective epsilon for miracle with approxmiate-DP guarantees. + + Args: + epsilon_target: The privacy guarantee we desire. + k: The number of dimensions. + number_candidates: The number of candidates. + delta: The delta in the differential privacy guarantee. + + Returns: + epsilon_approx: The resulting epsilon that this version of miracle ends + up using. + """ + epsilon_search_space = np.linspace(0,epsilon_target,100) + epsilon_search_space = epsilon_search_space[:-1] + epsilon_approx = 0 + # Find the largest epsilon for SS so that MIRACLE meets epsilon_target + for _, epsilon in enumerate(epsilon_search_space): + d = int(np.ceil(k/(np.exp(epsilon)+1))) + cupper = (k*np.exp(epsilon))/(k-d+d*np.exp(epsilon)) + clower = k/(k-d+d*np.exp(epsilon)) + t = np.abs(cupper-clower)*np.sqrt((np.log(2/delta))/(2*number_candidates)) + if -1 < t < 1: + if epsilon + np.log(1+t) - np.log(1-t) <= epsilon_target: + epsilon_approx = epsilon + return epsilon_approx + + +def encode_decode_miracle_fast(seed, x, k, epsilon, number_candidates): + """A fast implementation of the miracle protocol -- instead of + generating number_candidates samples, generate one sample with + true expectation. 
+ """ + d = int(np.ceil(k/(np.exp(epsilon)+1))) + num_cand_in_cap = np.random.binomial(number_candidates, d/k, size=None) + pi_in = np.exp(epsilon)/(num_cand_in_cap*np.exp(epsilon) + + (number_candidates-num_cand_in_cap)) + prob_sample_from_cap = num_cand_in_cap*pi_in + + if np.random.uniform(0,1) <= prob_sample_from_cap: + # Generate a sample uniformly from the cap + z = [1]*(d-1)+[0]*(k-d) + rs = np.random.RandomState(seed) + rs.shuffle(z) + z = z[:x] + [1] + z[x:] + else: + # Generate a sample uniformly from outside the cap + z = [1]*(d)+[0]*(k-d-1) + rs = np.random.RandomState(seed) + rs.shuffle(z) + z = z[:x] + [0] + z[x:] + return np.array(z) + + +def encode_decode_modified_miracle_fast(seed, x, k, epsilon, number_candidates): + """A fast implementation of the modified miracle protocol -- instead of + generating number_candidates samples, generate one sample with + true expectation. + """ + d = int(np.ceil(k/(np.exp(epsilon)+1))) + c1 = np.exp(epsilon)/(sc.comb(k-1,d-1)* np.exp(epsilon) + sc.comb(k-1,d)) + c2 = 1/(sc.comb(k-1,d-1)* np.exp(epsilon) + sc.comb(k-1,d)) + + num_cand_in_cap = np.random.binomial(number_candidates, d/k, size=None) + + beta = num_cand_in_cap/number_candidates + expected_beta = np.ceil(k/(np.exp(epsilon)+1))/k + + pi_in = c1/(num_cand_in_cap*c1+(number_candidates-num_cand_in_cap)*c2) + tilde_pi_in = pi_in*(1+beta*(np.exp(epsilon)-1))/((1 + + expected_beta*(np.exp(epsilon)-1))) + if beta > expected_beta: + tilde_pi_in = tilde_pi_in*(beta+expected_beta*(np.exp(epsilon) + - 1))/(beta*np.exp(epsilon)) + + prob_sample_from_cap = num_cand_in_cap*tilde_pi_in + + if np.random.uniform(0,1) <= prob_sample_from_cap: + # Generate a sample uniformly from the cap + z = [1]*(d-1)+[0]*(k-d) + rs = np.random.RandomState(seed) + rs.shuffle(z) + z = z[:x] + [1] + z[x:] + else: + # Generate a sample uniformly from outside the cap + z = [1]*(d)+[0]*(k-d-1) + rs = np.random.RandomState(seed) + rs.shuffle(z) + z = z[:x] + [0] + z[x:] + return np.array(z) \ 
No newline at end of file diff --git a/rcc_dp/frequency_estimation/modify_pi.py b/rcc_dp/frequency_estimation/modify_pi.py new file mode 100644 index 00000000..6e55865f --- /dev/null +++ b/rcc_dp/frequency_estimation/modify_pi.py @@ -0,0 +1,151 @@ +# Copyright 2021, Google LLC. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Code to modify the distribution pi to tilde_pi. + +The distribution tilde_pi (which is equal to pi_all[-1]) is 2 * eta-DP. +""" + +import numpy as np + + +def modify_pi(pi, eta, epsilon, multiplicative_factor): + """This function modifies the distribution pi to make it 2eta-LDP. + + The function essentially ensures that the new distribution lies between the + upper threshold `exp(eta) * multiplicative_factor * p` and the lower threshold + `exp(-eta) * multiplicative_factor * p`. It first checks if the distribution + already lies inside the thresholds. If not, it trades as mass beyond the upper + threshold with mass beyond the lower threshold. In other words, it ensures + that at least one of the constraint is satisfied (the one which is violated + less severely). This is done with the help of the helper function + `trade_mass`. Next, it iteratively enforces the constraint that is still + violated and renormalizes the distribution. This is done with the help of the + helper function `normalize`. + + Args: + pi: The input distribution to be modified + eta: The privacy parameter that is half times the desired privacy guarantee. 
+ epsilon: The privacy parameter epsilon. + multiplicative_factor: The factor by which the uniform distribution over the + candidates is scaled with. + + Returns: + pi_all: The container containing how the distrbution pi evolved from pi to + tilde_pi (which is equal to pi_all[-1]). Further, tilde_pi is 2eta-LDP. + """ + if eta < epsilon / 2: + raise ValueError('eta should be larger than epsilon/2.') + + number_candidates = len(pi) + p = np.zeros(number_candidates) + 1.0 / number_candidates + p = p * multiplicative_factor + # The container used to track changes in the distribution. + pi_all = [pi] + + # Calculate how much mass lies above the upper threshold. + mass_above = np.sum(np.maximum(pi - np.exp(eta) * p, 0)) + # Calculate how much mass lies below the lower threshold. + mass_below = np.sum(np.maximum(np.exp(-eta) * p - pi, 0)) + + if mass_above == 0 and mass_below == 0: + return pi_all + elif mass_above > mass_below: + # Since the lower threshold is violated less severely, correct all + # probabilities which are too low. + below = pi < np.exp(-eta) * p + pi_new = pi.copy() + pi_new[below] = np.exp(-eta) * p[below] + + # Sort by distance from upper threshold (biggest offenders first). The + # indices above the upper threshold are first in order. + indices = np.argsort(np.exp(eta) * p - pi_all[-1]) + + # Trade mass above the upper threshold against mass below the lower + # threshold as much as possible. + budget = mass_below + for i in indices: + # Correct probability above threshold as much as possible. + diff = pi_new[i] - np.exp(eta) * p[i] + if diff > budget: + pi_new[i] -= budget + break + elif diff > 0: + pi_new[i] -= diff + budget -= diff + pi_all.append(pi_new.copy()) + + # Now, pi_new disobeys at-most one constraint i.e., some mass is either + # above the upper threshold or all the mass is between the thresholds. + + # Calculate the remaining mass above the upper threshold. 
+ mass_above = np.sum(np.maximum(pi_new - np.exp(eta) * p, 0)) + + while mass_above > 0: + # Correct probabilities above the upper threshold. + above = pi_new >= np.exp(eta) * p + pi_new[above] = np.exp(eta) * p[above] + + # Renormalize distribution. + not_above = ~above + pi_new[not_above] += mass_above * pi[not_above] / np.sum(pi[not_above]) + pi_all.append(pi_new.copy()) + + # Calculate the remaining mass above the upper threshold. + mass_above = np.sum(np.maximum(pi_new - np.exp(eta) * p, 0)) + + else: + # Since the upper threshold is violated less severely, correct all + # probabilities which are too high. + above = pi > np.exp(eta) * p + pi_new = pi.copy() + pi_new[above] = np.exp(eta) * p[above] + + # Sort by distance from lower threshold (biggest offenders first). The + # indices below the lower threshold are first in order. + indices = np.argsort(pi_all[-1] - np.exp(-eta) * p) + + # Trade mass below the lower threshold against mass above the upper + # threshold as much as possible. + budget = mass_above + for i in indices: + # Correct probability below threshold as much as possible. + diff = np.exp(-eta) * p[i] - pi_new[i] + if diff > budget: + pi_new[i] += budget + break + elif diff > 0: + pi_new[i] += diff + budget -= diff + pi_all.append(pi_new.copy()) + + # Now, pi_new disobeys at-most one constraint i.e., some mass is either + # below the lower threshold or all the mass is between the thresholds. + + # Calculate the remaining mass below the lower threshold. + mass_below = np.sum(np.maximum(np.exp(-eta) * p - pi_new, 0)) + + while mass_below > 0: + # Correct probabilities below the lower threshold. + below = pi_new <= np.exp(-eta) * p + pi_new[below] = np.exp(-eta) * p[below] + + # Renormalize distribution. + not_below = ~below + pi_new[not_below] -= mass_below * pi[not_below] / np.sum(pi[not_below]) + pi_all.append(pi_new.copy()) + + # Calculate the remaining mass below the lower threshold. 
+ mass_below = np.sum(np.maximum(np.exp(-eta) * p - pi_new, 0)) + + return pi_all \ No newline at end of file diff --git a/rcc_dp/frequency_estimation/modify_pi_test.py b/rcc_dp/frequency_estimation/modify_pi_test.py new file mode 100644 index 00000000..bc8d7491 --- /dev/null +++ b/rcc_dp/frequency_estimation/modify_pi_test.py @@ -0,0 +1,82 @@ +# Copyright 2021, Google LLC. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tests for modify pi.""" + +from absl.testing import absltest +import numpy as np +from rcc_dp.frequency_estimation import get_parameters +from rcc_dp.frequency_estimation import miracle +from rcc_dp.frequency_estimation import modify_pi + + +class ModifyPiTest(absltest.TestCase): + + def test_tilde_pi_is_a_distribution(self): + """Test whether every distribution generated by modify_all sums to 1.""" + budget = 0.5 + for d in [100, 200]: + for number_candidates in [2**4, 2**5]: + for epsilon in [2, 3]: + x = np.random.normal(0, 1, (d, 1)) + x = np.divide(x, np.linalg.norm(x, axis=0).reshape(1, -1)) + c1, c2, _, gamma = get_parameters.get_parameters_unbiased_miracle( + epsilon, d, number_candidates, budget) + _, _, pi = miracle.encoder(0, x[:, 0], number_candidates, c1, c2, + gamma) + eta = epsilon / 2 + pi_all = modify_pi.modify_pi(pi, eta, epsilon, + c1 / (np.exp(epsilon / 2))) + self.assertLessEqual(len(pi_all), 3) + for distribution in pi_all: + self.assertLessEqual(np.abs(np.sum(distribution)-1), 0.0001) + + def test_tilde_pi_is_private(self): + 
"""Test whether tilde pi satisfies the DP constraint.""" + budget = 0.5 + for d in [100, 200]: + for number_candidates in [2**4, 2**5]: + for epsilon in [2, 3]: + x = np.random.normal(0, 1, (d, 1)) + x = np.divide(x, np.linalg.norm(x, axis=0).reshape(1, -1)) + c1, c2, _, gamma = get_parameters.get_parameters_unbiased_miracle( + epsilon, d, number_candidates, budget) + _, _, pi = miracle.encoder(0, x[:, 0], number_candidates, c1, c2, + gamma) + eta = epsilon / 2 + pi_all = modify_pi.modify_pi(pi, eta, epsilon, + c1 / (np.exp(epsilon / 2))) + self.assertLessEqual(np.max(pi_all[-1]), c1 / number_candidates) + self.assertLessEqual(c1 * np.exp(-epsilon) / number_candidates, + np.min(pi_all[-1]) + 1e-6) + + def test_convergence(self): + """Test whether modify_pi converges in at-most 3 iterations as expected.""" + budget = 0.5 + for d in [100, 200]: + for number_candidates in [2**4, 2**5]: + for epsilon in [2, 3]: + x = np.random.normal(0, 1, (d, 1)) + x = np.divide(x, np.linalg.norm(x, axis=0).reshape(1, -1)) + c1, c2, _, gamma = get_parameters.get_parameters_unbiased_miracle( + epsilon, d, number_candidates, budget) + _, _, pi = miracle.encoder(0, x[:, 0], number_candidates, c1, c2, + gamma) + eta = epsilon / 2 + pi_all = modify_pi.modify_pi(pi, eta, epsilon, + c1 / (np.exp(epsilon / 2))) + self.assertLessEqual(len(pi_all), 3) + + +if __name__ == "__main__": + absltest.main() \ No newline at end of file diff --git a/rcc_dp/frequency_estimation/rhr.py b/rcc_dp/frequency_estimation/rhr.py new file mode 100644 index 00000000..dad7a0df --- /dev/null +++ b/rcc_dp/frequency_estimation/rhr.py @@ -0,0 +1,141 @@ +# Copyright 2021, Google LLC. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""RHR definitions. + +This is the code to be used to simulate RHR. + +RHR was introduced by Chen et al in "Breaking the Communication-Privacy- +Accuracy Trilemma" - https://arxiv.org/pdf/2007.11707.pdf +""" + +import math +import numpy as np +from rcc_dp.frequency_estimation import ss + + +def encode_string(dim, epsilon, comm, x): + """An implementation of the RHR protocol to privatize x.""" + n = len(x) + + # Pad dim to the power of 2. + padded_d = int(math.pow(2, math.ceil(math.log(dim, 2)))) + # Calculate the effective communication cost + # i.e., min(comm, log(e^\epsilon)). + eff_comm = min(comm, math.ceil(epsilon*math.log(math.e, 2)), + math.ceil(math.log(dim, 2))+1) + # Set the block size. + block_size = int(padded_d/math.pow(2, eff_comm-1)) + + loc_sign = np.zeros(n) + for idx in range(n): + j = int(idx % block_size) + x_individual = int(x[idx]) + loc_sign[idx] = 2*int(x_individual/block_size) + if get_hadamard_entry(padded_d, j, x_individual) == -1: + loc_sign[idx] = loc_sign[idx] + 1 + + z = rr_encode_string(int(math.pow(2, eff_comm)), epsilon, + np.array(loc_sign)) + return z + + +def rr_encode_string(alphabet_size, epsilon, samples): + """An implementation of the RR protocol to privatize x.""" + n = len(samples) + # Start by setting private_samples = samples. + private_samples_rr = np.copy(samples) + + # Determine which samples need to be noised (i.e., flipped). 
+ flip = np.random.random_sample(n) < (alphabet_size + - 1)/(math.exp(epsilon) + alphabet_size - 1) + flip_samples = samples[flip] + + # Select new samples uniformly at random to replace the original ones. + rand_samples = np.random.randint(0, alphabet_size - 1, len(flip_samples)) + + # Shift the samples if needed to avoid sampling the orginal samples. + rand_samples[rand_samples >= flip_samples] += 1 + + # Replace the original samples by the randomly selected ones. + private_samples_rr[flip] = rand_samples + return private_samples_rr + + +def decode_string_fast(dim, epsilon, comm, z, normalization = 1): + """Learn the original distribution from the privatized strings when + the input is a matrix consisting of all the bit vectors. + """ + l = len(z) + # Pad dim to the power of 2. + padded_d = int(math.pow(2, math.ceil(math.log(dim,2)))) + # Calculate the effective communication cost + # i.e., min(comm, log(e^\epsilon)). + eff_comm = min(comm, math.ceil(epsilon*math.log(math.e, 2)), + math.ceil(math.log(dim, 2))+1) + # Set the block size. + block_size = int(padded_d/math.pow(2, eff_comm-1)) + n = int(l/block_size)*block_size + + group_list = np.array(z[:n]).reshape(int(n/block_size), block_size).T + + # Create histograms to specify the empirical distributions of each group. + histograms = np.zeros((block_size, int(padded_d/block_size))) + for g_idx in range(block_size): + g_count,g_axis = np.histogram(group_list[g_idx], + range(int(math.pow(2, eff_comm))+1)) + histograms[g_idx] = g_count[::2] - g_count[1::2] + histograms[g_idx] = histograms[g_idx] * (math.exp(epsilon) + + math.pow(2, eff_comm)-1)/(math.exp(epsilon)-1)*(block_size/n) + + # Obtain estimator of q. 
+ q = np.zeros((block_size, int(padded_d/block_size))) + for j in range(block_size): + q[j, :] = fast_inverse_hadamard_transform(int(padded_d/block_size), + histograms[j, :]) + + q = q.reshape((padded_d, ), order = 'F') + + # Perform inverse Hadamard transform to get p + p_estimate = fast_inverse_hadamard_transform(padded_d, q)/padded_d + p_estimate = p_estimate[:dim] + + if normalization == 0: + # Clip and normalize. + p_estimate = ss.probability_normalize(p_estimate) + if normalization == 1: + # Project on the simplex. + p_estimate = ss.probability_project_simplex(p_estimate) + return p_estimate + + +def get_hadamard_entry(d, x, y): + """Get (H_d)_{x,y}.""" + z = x & y + z_bit = bin(z)[2:].zfill(int(math.log(d, 2))) + check = 0 + for i in range(0, int(math.log(d, 2))): + check = check^int(z_bit[i]) + return (-1)**check + + +def fast_inverse_hadamard_transform(k, dist): + """ Performs inverse Hadamard transform.""" + if k == 1: + return dist + dist1 = dist[0 : k//2] + dist2 = dist[k//2 : k] + trans1 = fast_inverse_hadamard_transform(k//2, dist1) + trans2 = fast_inverse_hadamard_transform(k//2, dist2) + trans = np.concatenate((trans1+ trans2, trans1 - trans2)) + return trans \ No newline at end of file diff --git a/rcc_dp/frequency_estimation/ss.py b/rcc_dp/frequency_estimation/ss.py new file mode 100644 index 00000000..e3763741 --- /dev/null +++ b/rcc_dp/frequency_estimation/ss.py @@ -0,0 +1,82 @@ +# Copyright 2021, Google LLC. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +"""Subset selection definitions. + +This is the code to be used to simulate the subset selection algorithm. + +The subset selection algorithm was introduced by Ye et al in +"Optimal Schemes for Discrete Distribution Estimation under +Locally Differential Privacy" - https://arxiv.org/pdf/1702.00610.pdf. +""" + +import numpy as np + + +def probability_project_simplex(p): + k = len(p) # Infer the size of the alphabet. + p_sorted = np.sort(p) + p_sorted[:] = p_sorted[::-1] + p_sorted_cumsum = np.cumsum(p_sorted) + i = 1 + while i < k: + if p_sorted[i] + (1.0 / (i + 1)) * (1 - p_sorted_cumsum[i]) < 0: + break + i += 1 + lmd = (1.0 / i) * (1 - p_sorted_cumsum[i - 1]) + return np.maximum(p + lmd, 0) + + +def probability_normalize(p): + p = np.maximum(p,0) # Map it to be positive. + norm = np.sum(p) + p = np.true_divide(p,norm) # Ensure the l_1 norm is one. + return p + + +def encode_string_fast(k, epsilon, x): + """A fast implementation of the subset selection protocol -- instead + of selecting exactly d ones, set each bit to be one independently + with true expectation. + """ + d = int(np.ceil(k/(np.exp(epsilon)+1))) + p = d*np.exp(epsilon)/(d*np.exp(epsilon)+k-d) + q = (d-p)/(k-1) + + n = len(x) + z = np.zeros((n, k)) + flip = np.random.random_sample((n, k)) + + for i in range(n): + z[i,x[i]] = np.logical_or(0,flip[i,x[i]] < p) + return np.logical_or(z, flip < q) + +def decode_string(k, epsilon, z, length, normalization = 1): + """Learn the original distribution from the privatized strings when + the input is a matrix consisting of all the bit vectors. + """ + d = int(np.ceil(k/(np.exp(epsilon)+1))) + + temp1 = ((k-1)*np.exp(epsilon) + +1.0*(k-1)*(k-d)/d)/((k-d)*(np.exp(epsilon)-1)) + temp2 = ((d-1)*np.exp(epsilon)+k-d) / (1.0*(k-d)*(np.exp(epsilon)-1)) + + p_estimate = (1.0*np.sum(z, axis=0)*temp1/length)-temp2 + + if normalization == 0: + # Clip and normalize. 
+ p_estimate = probability_normalize(p_estimate) + if normalization == 1: + # Project on the simplex. + p_estimate = probability_project_simplex(p_estimate) + return p_estimate \ No newline at end of file diff --git a/rcc_dp/frequency_estimation/unbias.py b/rcc_dp/frequency_estimation/unbias.py new file mode 100644 index 00000000..b6b199aa --- /dev/null +++ b/rcc_dp/frequency_estimation/unbias.py @@ -0,0 +1,112 @@ +# Copyright 2021, Google LLC. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Code to un-bias miracle & modified miracle for ss. + +Please note that this unbiasing techniques are specific to ss + miracle +and are not general strategies for miracle. The un-biasing happens by +modifying the corresponding m and b parameters. +""" + +import numpy as np +from scipy import stats +import scipy.special as sc +from rcc_dp.frequency_estimation import ss + + +def unbias_miracle(k, epsilon, number_candidates, z, n, normalization = 1): + """Get the unbiased estimate for miracle. + + Args: + k: The dimension of the one-hot encoded x. + epsilon: The privacy parameter epsilon. + number_candidates: The number of candidates to be sampled. + z: The privatized data. + n: The number of users. 
+ normalization: Indicator whether to clip and normalize or + project on the simplex + + Returns: + p_estimate: The unbiased estimate for miracle + """ + d = int(np.ceil(k/(np.exp(epsilon)+1))) + c1 = np.exp(epsilon)/(sc.comb(k-1,d-1)* np.exp(epsilon) + sc.comb(k-1,d)) + c2 = 1/(sc.comb(k-1,d-1)* np.exp(epsilon) + sc.comb(k-1,d)) + beta = np.array(range(number_candidates+1))/number_candidates + # The probability with which you choose a candidate inside the cap. + pi_in = 1/number_candidates*(c1/(beta*c1+(1-beta)*c2)) + expectation = np.sum(stats.binom.pmf(range(number_candidates+1), + number_candidates, d/k)*range(number_candidates+1)*pi_in) + b_hat = (d - expectation)/(k-1) + m_hat = k*expectation/(k-1) - d/(k-1) + + p_estimate = (1.0*np.sum(z, axis=0)/(n*m_hat))-b_hat/m_hat + + if normalization == 0: + # Clip and normalize. + p_estimate = ss.probability_normalize(p_estimate) + elif normalization == 1: + # Project on the simplex. + p_estimate = ss.probability_project_simplex(p_estimate) + return p_estimate + + +def unbias_modified_miracle(k, epsilon, number_candidates, z, n, + normalization = 1): + """Get the unbiased estimate for modified miracle. + + Args: + k: The dimension of the one-hot encoded x. + epsilon: The privacy parameter epsilon. + number_candidates: The number of candidates to be sampled. + z: The privatized data. + n: The number of users. + normalization: Indicator whether to clip and normalize or + project on the simplex + + Returns: + p_estimate: The unbiased estimate for modified miracle + """ + expected_beta = np.ceil(k/(np.exp(epsilon)+1))/k + d = int(np.ceil(k/(np.exp(epsilon)+1))) + c1 = np.exp(epsilon)/(sc.comb(k-1,d-1)* np.exp(epsilon) + sc.comb(k-1,d)) + c2 = 1/(sc.comb(k-1,d-1)* np.exp(epsilon) + sc.comb(k-1,d)) + # The fraction of candidates that lie inside the cap. 
+ beta = np.array(range(number_candidates + 1)) / number_candidates + + pi_in = 1 / number_candidates * (c1 / (beta * c1 + (1 - beta) * c2)) + tilde_pi_in_1 = ( + pi_in * (1 + beta * (np.exp(epsilon) - 1)) / (1 + expected_beta * + (np.exp(epsilon) - 1))) + tilde_pi_in_2 = ( + tilde_pi_in_1 * (beta + expected_beta * (np.exp(epsilon) - 1)) / + (beta * np.exp(epsilon))) + tilde_pi_in_2[0,] = 0 + indicator = beta <= expected_beta + # The probability with which you choose a candidate inside the cap. + tilde_pi_in = indicator * tilde_pi_in_1 + (1 - indicator) * tilde_pi_in_2 + expectation = np.sum( + stats.binom.pmf(range(number_candidates + 1), number_candidates, + d / k) * range(number_candidates + 1) * tilde_pi_in) + b_tilde = (d - expectation)/(k-1) + m_tilde = k*expectation/(k-1) - d/(k-1) + + p_estimate = (1.0*np.sum(z, axis=0)/(n*m_tilde))-b_tilde/m_tilde + + if normalization == 0: + # Clip and normalize. + p_estimate = ss.probability_normalize(p_estimate) + elif normalization == 1: + # Project on the simplex. + p_estimate = ss.probability_project_simplex(p_estimate) + return p_estimate \ No newline at end of file diff --git a/rcc_dp/BUILD b/rcc_dp/mean_estimation/BUILD similarity index 100% rename from rcc_dp/BUILD rename to rcc_dp/mean_estimation/BUILD diff --git a/rcc_dp/config.py b/rcc_dp/mean_estimation/config.py similarity index 88% rename from rcc_dp/config.py rename to rcc_dp/mean_estimation/config.py index 772f4ae7..a40d033e 100644 --- a/rcc_dp/config.py +++ b/rcc_dp/mean_estimation/config.py @@ -24,14 +24,14 @@ def get_config(): # can take one of "biased_data", "unbiased_data", "same_data". data="biased_data", # Flags to indicate which methods to compare. - run_approx_miracle=True, + run_approx_miracle=False, run_miracle=False, - run_modified_miracle=False, + run_modified_miracle=True, run_privunit=True, run_sqkr=True, # Common parameters. 
num_itr=1, - coding_cost=8, + coding_cost=11, coding_cost_multiplier=1, approx_coding_cost_multiplier=3, approx_t=6, @@ -40,7 +40,8 @@ def get_config(): budget=0.5, alpha=1.0, # Variation. - vary="eps", # Can take one of "d", "n", "eps". + vary="eps", # Can take one of "cc", "d", "n", "eps". + cc_space=[6, 8, 10, 12], d_space=[200, 400, 600, 800, 1000], n_space=[2000, 4000, 6000, 8000, 10000], eps_space=list(range(1, 9)), @@ -48,7 +49,7 @@ def get_config(): n=5000, d=500, t=2, - epsilon_target=4, + epsilon_target=6, ) config = config_dict.ConfigDict(config) config.lock() # Prevent addition of new fields. diff --git a/rcc_dp/experiment.py b/rcc_dp/mean_estimation/experiment.py similarity index 86% rename from rcc_dp/experiment.py rename to rcc_dp/mean_estimation/experiment.py index a8866ded..4dd401b7 100644 --- a/rcc_dp/experiment.py +++ b/rcc_dp/mean_estimation/experiment.py @@ -11,19 +11,26 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-"""Experiment definitions (i.e., evaluation of miracle, sqkr, privunit methods).""" +"""Experiment definitions (i.e., evaluation of miracle, sqkr, privunit methods +when either the data dimension d, the number of users n, or the privacy +parameter epsilon is varied).""" import json import math import time +import matplotlib +matplotlib.rcParams['ps.useafm'] = True +matplotlib.rcParams['pdf.use14corefonts'] = True +matplotlib.rcParams['text.usetex'] = True import matplotlib.pyplot as plt +import matplotlib.lines as mlines import numpy as np from scipy import stats -from rcc_dp import get_parameters -from rcc_dp import miracle -from rcc_dp import modify_pi -from rcc_dp import privunit -from rcc_dp import sqkr +from rcc_dp.mean_estimation import get_parameters +from rcc_dp.mean_estimation import miracle +from rcc_dp.mean_estimation import modify_pi +from rcc_dp.mean_estimation import privunit +from rcc_dp.mean_estimation import sqkr def evaluate(work_path, config, file_open=open): @@ -196,54 +203,64 @@ def evaluate(work_path, config, file_open=open): print("sqkr mse:") print(np.mean(sqkr_mse, axis=0)) - plt.figure(figsize=(10, 8), dpi=80) + plt.figure(figsize=((8, 5)), dpi=80) + plt.axes((.15, .2, .83, .75)) if config.run_approx_miracle: plt.errorbar( vary_space, np.mean(approx_miracle_mse, axis=0), yerr=np.std(approx_miracle_mse, axis=0) / np.sqrt(config.num_itr), - label="Approx-DP Miracle") + linewidth = 3.0, + label="MRC") if config.run_miracle: plt.errorbar( vary_space, np.mean(miracle_mse, axis=0), yerr=np.std(miracle_mse, axis=0) / np.sqrt(config.num_itr), - label="Miracle") + linewidth = 3.0, + label="MRC") if config.run_modified_miracle: plt.errorbar( vary_space, np.mean(modified_miracle_mse, axis=0), yerr=np.std(modified_miracle_mse, axis=0) / np.sqrt(config.num_itr), - label="Modified Miracle") + linewidth = 3.0, + label="MMRC") if config.run_privunit: plt.errorbar( vary_space, np.mean(privunit_mse, axis=0), yerr=np.std(privunit_mse, axis=0) / 
np.sqrt(config.num_itr), - label="PrivUnit") + ls='--', + linewidth = 3.0, + label="PrivUnit$_{2}$") if config.run_sqkr: plt.errorbar( vary_space, np.mean(sqkr_mse, axis=0), yerr=np.std(sqkr_mse, axis=0) / np.sqrt(config.num_itr), + ls='--', + linewidth = 3.0, label="SQKR") - plt.legend(fontsize=18) - plt.xticks(fontsize=18) - plt.yticks(fontsize=18) - plt.ylabel("mse", fontsize=18) + plt.legend(fontsize=24) + plt.xticks(fontsize=28) + plt.yticks(fontsize=28) + plt.ylabel("$\ell_{2}$ error", fontsize=28) if config.vary == "d": - plt.xlabel(r"$d$", fontsize=18) - plt.title( - "n = " + str(n) + ", " + r"$\epsilon = $" + str(epsilon_target), - fontsize=18) + plt.xlabel(r"$d$", fontsize=28) + plt.xticks([200,400,600,800,1000]) + plt.yticks([0.00,0.04,0.08,0.12,0.16]) + plt.legend(fontsize=24, loc='upper left') elif config.vary == "n": - plt.xlabel(r"$n$", fontsize=18) - plt.title( - "d = " + str(d) + ", " + r"$\epsilon = $" + str(epsilon_target), - fontsize=18) + plt.xlabel(r"$n$", fontsize=28) + plt.ticklabel_format(style='sci', axis='x', scilimits=(3,3)) + plt.gca().xaxis.get_offset_text().set_fontsize(16) + plt.yticks([0.00,0.05,0.10,0.15,0.20]) + plt.legend(fontsize=24, loc='upper right') elif config.vary == "eps": - plt.xlabel(r"$\epsilon$", fontsize=18) - plt.title("d = " + str(d) + ", $n = $" + str(n), fontsize=18) + plt.xlabel(r"$\varepsilon$", fontsize=28) + plt.yticks([0.3,0.6,0.9,1.2,1.5]) + plt.legend(fontsize=24, loc='upper right') with file_open(work_path + "/rcc_dp_comparison.png", "wb") as f: plt.savefig(f, format="png") diff --git a/rcc_dp/mean_estimation/experiment_coding_cost.py b/rcc_dp/mean_estimation/experiment_coding_cost.py new file mode 100644 index 00000000..21dceb28 --- /dev/null +++ b/rcc_dp/mean_estimation/experiment_coding_cost.py @@ -0,0 +1,183 @@ +# Copyright 2021, Google LLC. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Experiment definitions (i.e., evaluation of miracle, sqkr, privunit methods +when the coding cost is varied).""" + +import json +import math +import time +import matplotlib +matplotlib.rcParams['ps.useafm'] = True +matplotlib.rcParams['pdf.use14corefonts'] = True +matplotlib.rcParams['text.usetex'] = True +import matplotlib.pyplot as plt +import matplotlib.lines as mlines +import numpy as np +from scipy import stats +from rcc_dp.mean_estimation import get_parameters +from rcc_dp.mean_estimation import miracle +from rcc_dp.mean_estimation import modify_pi +from rcc_dp.mean_estimation import privunit +from rcc_dp.mean_estimation import sqkr + + +def evaluate(work_path, config, file_open=open): + """Evaluates miracle, sqkr, privunit methods.""" + with file_open(work_path + "/config.json", "w") as f: + json.dump(config.to_dict(), f) + + start_time = time.time() + + delta = config.delta + budget = config.budget + alpha = config.alpha + # Get default values. 
+  d = config.d
+  n = config.n
+  epsilon_target = config.epsilon_target
+
+  vary_space = config.cc_space
+
+  print("coding space = " + str(vary_space))
+
+  modified_miracle_mse = np.zeros((config.num_itr, len(vary_space)))
+  sqkr_mse = np.zeros((config.num_itr, 1))
+  privunit_mse = np.zeros((config.num_itr, 1))
+
+  sqkr_coding_cost = epsilon_target
+
+  for itr in range(config.num_itr):
+    print("itr = %d" % itr)
+    print("epsilon target = " + str(epsilon_target))
+    print("n = " + str(n))
+    print("d = %d" % d)
+    if config.run_modified_miracle:
+      eta = epsilon_target / 2.0
+      print("eta = " + str(eta))
+      print("alpha = " + str(alpha))
+
+    if config.data == "unbiased_data":
+      x = np.random.normal(0, 1, (d, n))
+      x /= np.linalg.norm(x, axis=0)
+    elif config.data == "biased_data":
+      x = np.zeros((d, n))
+      x[:, 0::2] = np.random.normal(10, 1, (d, (n + 1) // 2))
+      x[:, 1::2] = np.random.normal(1, 1, (d, n // 2))
+      x /= np.linalg.norm(x, axis=0)
+    elif config.data == "same_data":
+      x = np.random.normal(0, 1, (d, 1))
+      x /= np.linalg.norm(x, axis=0)
+      x = np.repeat(x, n, axis=1)
+    else:
+      raise ValueError(
+          "data should be either be biased_data, unbiased_data, same_data.")
+
+    if config.run_privunit:
+      x_privunit, _ = privunit.apply_privunit(x, epsilon_target, budget)
+      x_privunit = np.mean(np.array(x_privunit), axis=1, keepdims=True)
+      privunit_mse[itr, 0] = np.linalg.norm(
+          np.mean(x, axis=1, keepdims=True) - x_privunit)**2
+
+    if config.run_sqkr:
+      # Generate a random tight frame satisfying UP -- for sqkr.
+      frame = 2**int(math.ceil(math.log(d, 2)) + 1)
+      u = stats.ortho_group.rvs(dim=frame).T[:, 0:d]
+
+      k_equiv = min(epsilon_target, sqkr_coding_cost)
+      [_, _, q_perturb] = sqkr.kashin_encode(u, x, k_equiv, epsilon_target)
+      x_kashin = sqkr.kashin_decode(u, k_equiv, epsilon_target, q_perturb)
+      sqkr_mse[itr, 0] = np.linalg.norm(
+          np.mean(x, axis=1, keepdims=True) - x_kashin)**2
+
+    for step, vary_parameter in enumerate(vary_space):
+      coding_cost = vary_parameter
+      print("coding cost = %d" % coding_cost)
+
+      if config.run_modified_miracle:
+        x_modified_miracle = np.zeros((d, n))
+        c1, c2, m, gamma = (
+            get_parameters.get_parameters_unbiased_modified_miracle(
+                alpha * epsilon_target, d, 2**coding_cost, budget))
+        for i in range(n):
+          _, _, pi = miracle.encoder(i + itr * n, x[:, i], 2**coding_cost, c1,
+                                     c2, gamma)
+          pi_all = modify_pi.modify_pi(pi, eta, epsilon_target,
+                                       c1 / (np.exp(epsilon_target / 2)))
+          k = np.random.choice(2**coding_cost, 1, p=pi_all[-1])[0]
+          z_k = miracle.decoder(i + itr * n, k, d, 2**coding_cost)
+          x_modified_miracle[:, i] = z_k / m
+        x_modified_miracle = np.mean(x_modified_miracle, axis=1, keepdims=True)
+        modified_miracle_mse[itr, step] = np.linalg.norm(
+            np.mean(x, axis=1, keepdims=True) - x_modified_miracle)**2
+
+  print(time.time() - start_time)
+
+  print("--------------")
+  if config.run_modified_miracle:
+    print("modified miracle mse:")
+    print(np.mean(modified_miracle_mse, axis=0))
+  if config.run_privunit:
+    print("privunit mse:")
+    print(np.mean(privunit_mse, axis=0))
+  if config.run_sqkr:
+    print("sqkr mse:")
+    print(np.mean(sqkr_mse, axis=0))
+
+  plt.figure(figsize=((8, 5)), dpi=80)
+  plt.axes((.15, .2, .83, .75))
+  if config.run_modified_miracle:
+    plt.errorbar(
+        vary_space,
+        np.mean(modified_miracle_mse, axis=0),
+        yerr=np.std(modified_miracle_mse, axis=0)/np.sqrt(config.num_itr),
+        linewidth = 3.0,
+        label="MMRC")
+  if config.run_privunit:
+    line1 = plt.errorbar(vary_space,
+        [np.mean(privunit_mse, axis=0)[0]]*len(vary_space),
+        yerr =
[np.std(privunit_mse, axis=0)[0]/np.sqrt(num_itr)]*len(vary_space), + ls='--', + linewidth = 3.0, + label="PrivUnit$_{2}$") + if config.run_sqkr: + line2 = plt.errorbar(vary_space, + [np.mean(sqkr_mse, axis=0)[0]]*len(vary_space), + yerr = [np.std(sqkr_mse, axis=0)[0]/np.sqrt(num_itr)]*len(vary_space), + ls='--', + linewidth = 3.0, + label="SQKR") + plt.xticks(fontsize=28) + plt.yticks(fontsize=28) + plt.ylabel("$\ell_{2}$ error", fontsize=28) + plt.xlabel(r"$\#$ bits", fontsize=28) + plt.yticks([0.04,0.05,0.06,0.07,0.08]) + plt.legend(fontsize=24, loc='center right') + + with file_open(work_path + "/rcc_dp_mse_vs_coding_cost.png", "wb") as f: + plt.savefig(f, format="png") + + with file_open(work_path + "/time.txt", "w") as f: + np.savetxt(f, np.array(time.time() - start_time).reshape(-1, 1)) + + if config.run_modified_miracle: + with file_open(work_path + "/modified_miracle_mse.csv", "w") as f: + np.savetxt(f, modified_miracle_mse, delimiter=",") + + if config.run_privunit: + with file_open(work_path + "/privunit_mse.csv", "w") as f: + np.savetxt(f, privunit_mse, delimiter=",") + + if config.run_sqkr: + with file_open(work_path + "/sqkr_mse.csv", "w") as f: + np.savetxt(f, sqkr_mse, delimiter=",") \ No newline at end of file diff --git a/rcc_dp/experiment_test.py b/rcc_dp/mean_estimation/experiment_test.py similarity index 67% rename from rcc_dp/experiment_test.py rename to rcc_dp/mean_estimation/experiment_test.py index a03bc032..ac71d55d 100644 --- a/rcc_dp/experiment_test.py +++ b/rcc_dp/mean_estimation/experiment_test.py @@ -14,8 +14,9 @@ """Tests for experiment definitions.""" from absl.testing import absltest -from rcc_dp import config as defaults -from rcc_dp import experiment +from rcc_dp.mean_estimation import config as defaults +from rcc_dp.mean_estimation import experiment +from rcc_dp.mean_estimation import experiment_coding_cost class ExperimentTest(absltest.TestCase): @@ -24,12 +25,17 @@ def test_evaluate_does_not_fail(self): work_path = 
self.create_tempdir().full_path config = defaults.get_config() config.num_itr = 1 - config.eps_space = [1] - config.t = 2 config.d = 100 config.n = 200 - experiment.evaluate(work_path, config) + if config.vary == "cc": + config.epsilon_target = 1 + config.cc_space = [4] + experiment_coding_cost.evaluate(work_path, config) + else: + config.eps_space = [1] + config.t = 2 + experiment.evaluate(work_path, config) if __name__ == "__main__": - absltest.main() + absltest.main() \ No newline at end of file diff --git a/rcc_dp/get_parameters.py b/rcc_dp/mean_estimation/get_parameters.py similarity index 97% rename from rcc_dp/get_parameters.py rename to rcc_dp/mean_estimation/get_parameters.py index 4ddc5c67..bba1d220 100644 --- a/rcc_dp/get_parameters.py +++ b/rcc_dp/mean_estimation/get_parameters.py @@ -14,8 +14,8 @@ """Code to obtain parameters of various miracle methods.""" import numpy as np -from rcc_dp import optimize_unbias -from rcc_dp import privunit +from rcc_dp.mean_estimation import optimize_unbias +from rcc_dp.mean_estimation import privunit def get_parameters_unbiased_approx_miracle(epsilon_target, d, diff --git a/rcc_dp/get_parameters_test.py b/rcc_dp/mean_estimation/get_parameters_test.py similarity index 96% rename from rcc_dp/get_parameters_test.py rename to rcc_dp/mean_estimation/get_parameters_test.py index 00c2c4ec..b5e0db8b 100644 --- a/rcc_dp/get_parameters_test.py +++ b/rcc_dp/mean_estimation/get_parameters_test.py @@ -15,9 +15,9 @@ from absl.testing import absltest import numpy as np -from rcc_dp import get_parameters -from rcc_dp import miracle -from rcc_dp import modify_pi +from rcc_dp.mean_estimation import get_parameters +from rcc_dp.mean_estimation import miracle +from rcc_dp.mean_estimation import modify_pi class GetParametersTest(absltest.TestCase): diff --git a/rcc_dp/miracle.py b/rcc_dp/mean_estimation/miracle.py similarity index 100% rename from rcc_dp/miracle.py rename to rcc_dp/mean_estimation/miracle.py diff --git 
a/rcc_dp/miracle_test.py b/rcc_dp/mean_estimation/miracle_test.py similarity index 94% rename from rcc_dp/miracle_test.py rename to rcc_dp/mean_estimation/miracle_test.py index 9f0f65a8..c2710bf0 100644 --- a/rcc_dp/miracle_test.py +++ b/rcc_dp/mean_estimation/miracle_test.py @@ -15,8 +15,8 @@ from absl.testing import absltest import numpy as np -from rcc_dp import get_parameters -from rcc_dp import miracle +from rcc_dp.mean_estimation import get_parameters +from rcc_dp.mean_estimation import miracle class MiracleTest(absltest.TestCase): diff --git a/rcc_dp/modify_pi.py b/rcc_dp/mean_estimation/modify_pi.py similarity index 100% rename from rcc_dp/modify_pi.py rename to rcc_dp/mean_estimation/modify_pi.py diff --git a/rcc_dp/modify_pi_test.py b/rcc_dp/mean_estimation/modify_pi_test.py similarity index 96% rename from rcc_dp/modify_pi_test.py rename to rcc_dp/mean_estimation/modify_pi_test.py index 75e39bcc..963c71e5 100644 --- a/rcc_dp/modify_pi_test.py +++ b/rcc_dp/mean_estimation/modify_pi_test.py @@ -15,9 +15,9 @@ from absl.testing import absltest import numpy as np -from rcc_dp import get_parameters -from rcc_dp import miracle -from rcc_dp import modify_pi +from rcc_dp.mean_estimation import get_parameters +from rcc_dp.mean_estimation import miracle +from rcc_dp.mean_estimation import modify_pi class ModifyPiTest(absltest.TestCase): diff --git a/rcc_dp/optimize_unbias.py b/rcc_dp/mean_estimation/optimize_unbias.py similarity index 99% rename from rcc_dp/optimize_unbias.py rename to rcc_dp/mean_estimation/optimize_unbias.py index 9cf0fa41..c70625d8 100644 --- a/rcc_dp/optimize_unbias.py +++ b/rcc_dp/mean_estimation/optimize_unbias.py @@ -28,7 +28,7 @@ import numpy as np from scipy import stats -from rcc_dp import privunit +from rcc_dp.mean_estimation import privunit def get_unbiased_p_hat(number_candidates, c1, c2, p): diff --git a/rcc_dp/privunit.py b/rcc_dp/mean_estimation/privunit.py similarity index 100% rename from rcc_dp/privunit.py rename to 
rcc_dp/mean_estimation/privunit.py diff --git a/rcc_dp/privunit_test.py b/rcc_dp/mean_estimation/privunit_test.py similarity index 98% rename from rcc_dp/privunit_test.py rename to rcc_dp/mean_estimation/privunit_test.py index 5e5296de..821d2604 100644 --- a/rcc_dp/privunit_test.py +++ b/rcc_dp/mean_estimation/privunit_test.py @@ -15,7 +15,7 @@ from absl.testing import absltest import numpy as np -from rcc_dp import privunit +from rcc_dp.mean_estimation import privunit class PrivunitTest(absltest.TestCase): diff --git a/rcc_dp/sqkr.py b/rcc_dp/mean_estimation/sqkr.py similarity index 100% rename from rcc_dp/sqkr.py rename to rcc_dp/mean_estimation/sqkr.py diff --git a/rcc_dp/sqkr_test.py b/rcc_dp/mean_estimation/sqkr_test.py similarity index 98% rename from rcc_dp/sqkr_test.py rename to rcc_dp/mean_estimation/sqkr_test.py index e806647d..8545eae6 100644 --- a/rcc_dp/sqkr_test.py +++ b/rcc_dp/mean_estimation/sqkr_test.py @@ -17,7 +17,7 @@ from absl.testing import parameterized import numpy as np from scipy import stats -from rcc_dp import sqkr +from rcc_dp.mean_estimation import sqkr class SqkrTest(parameterized.TestCase): From 78a8657187ee79021e99d473b4181a12aa1f62df Mon Sep 17 00:00:00 2001 From: Abhin Shah Date: Tue, 22 Feb 2022 19:52:18 -0500 Subject: [PATCH 2/4] Updating readme and a parameter for freq estimation --- rcc_dp/README.md | 155 +++++++++++++++++++++++++- rcc_dp/frequency_estimation/config.py | 2 +- 2 files changed, 154 insertions(+), 3 deletions(-) diff --git a/rcc_dp/README.md b/rcc_dp/README.md index 864c6836..9d5ffc3b 100644 --- a/rcc_dp/README.md +++ b/rcc_dp/README.md @@ -1,3 +1,154 @@ -# RCC/DP project +# Source code for "Optimal Compression of Locally Differentially Private Mechanisms" -This is work in progress. Links to relevant publications will be added later. 
+Reference: Abhin Shah, Wei-Ning Chen, Johannes Balle, Peter Kairouz, Lucas Theis, +"Optimal Compression of Locally Differentially Private Mechanisms," +The 25th International Conference on Artificial Intelligence and Statistics (AISTATS), 2022 + +Contact: abhin@mit.edu, jballe@google.com + +Arxiv: [https://arxiv.org/pdf/2111.00092.pdf](https://arxiv.org/pdf/2111.00092.pdf) + +### Dependencies: + +In order to successfully execute the code, the following libraries must be installed: + +Python --- json, math, time, matplotlib, numpy, scipy, [absl](https://github.com/abseil/abseil-py), [ml_collections](https://github.com/google/ml_collections) + +### Wrapper functions: + +This repository contains the code for (a) mean estimation and (b) frequency estimation. To run the code, a wrapper function needs to be written. For example, to run the mean estimation code, the following could be used: + +``` +from mean_estimation import config as defaults +from mean_estimation import experiment +from mean_estimation import experiment_coding_cost + +def main(): + config = defaults.get_config() + experiment.evaluate('path-to-the-mean-estimation-code', config) + +if __name__ == "__main__": + main() +``` + +Similarly, to run the frequency estimation code, the following could be used: + +``` +from frequency_estimation import config as defaults +from frequency_estimation import experiment +from frequency_estimation import experiment_coding_cost + +def main(): + config = defaults.get_config() + experiment.evaluate('path-to-the-frequency-estimation-code', config) + +if __name__ == "__main__": + main() +``` + +### Reproducing the figures + +1. To reproduce Figure 1(Top), make the following changes in mean_estimation/config.py: +``` +num_itr = 10 +vary = "cc" +``` +and add the following commands in the main function of the wrapper: +``` +config = defaults.get_config() +experiment_coding_cost.evaluate('path-to-the-mean-estimation-code', config) +``` +2. 
To reproduce Figure 1(Bottom), make the following changes in mean_estimation/config.py: +``` +num_itr = 10 +vary = "eps" +``` +and add the following commands in the main function of the wrapper: +``` +config = defaults.get_config() +experiment.evaluate('path-to-the-mean-estimation-code', config) +``` +3. To reproduce Figure 2(Top), make the following changes in frequency_estimation/config.py: +``` +num_itr = 10 +vary = "cc" +``` +and add the following commands in the main function of the wrapper: +``` +config = defaults.get_config() +experiment_coding_cost.evaluate('path-to-the-frequency-estimation-code', config) +``` +4. To reproduce Figure 2(Bottom), make the following changes in frequency_estimation/config.py: +``` +num_itr = 10 +vary = "eps" +``` +and add the following commands in the main function of the wrapper: +``` +config = defaults.get_config() +experiment.evaluate('path-to-the-frequency-estimation-code', config) +``` +5. To reproduce Figure 3, make the following changes in mean_estimation/config.py: +``` +run_approx_miracle=True, +run_modified_miracle=False, +num_itr = 10 +vary = "eps" +``` +and add the following commands in the main function of the wrapper: +``` +config = defaults.get_config() +experiment.evaluate('path-to-the-mean-estimation-code', config) +``` +6. To reproduce Figure 4(Left), make the following changes in mean_estimation/config.py: +``` +num_itr = 10 +vary = "d" +``` +and add the following commands in the main function of the wrapper: +``` +config = defaults.get_config() +experiment.evaluate('path-to-the-mean-estimation-code', config) +``` +7. To reproduce Figure 4(Right), make the following changes in mean_estimation/config.py: +``` +num_itr = 10 +vary = "n" +``` +and add the following commands in the main function of the wrapper: +``` +config = defaults.get_config() +experiment.evaluate('path-to-the-mean-estimation-code', config) +``` +8. 
To reproduce Figure 5, make the following changes in frequency_estimation/config.py: +``` +run_approx_miracle=True, +run_modified_miracle=False, +num_itr = 10 +vary = "eps" +``` +and add the following commands in the main function of the wrapper: +``` +config = defaults.get_config() +experiment.evaluate('path-to-the-frequency-estimation-code', config) +``` +9. To reproduce Figure 6(Left), make the following changes in frequency_estimation/config.py: +``` +num_itr = 10 +vary = "d" +``` +and add the following commands in the main function of the wrapper: +``` +config = defaults.get_config() +experiment.evaluate('path-to-the-frequency-estimation-code', config) +``` +10. To reproduce Figure 6(Right), make the following changes in frequency_estimation/config.py: +``` +num_itr = 10 +vary = "n" +``` +and add the following commands in the main function of the wrapper: +``` +config = defaults.get_config() +experiment.evaluate('path-to-the-frequency-estimation-code', config) +``` \ No newline at end of file diff --git a/rcc_dp/frequency_estimation/config.py b/rcc_dp/frequency_estimation/config.py index e6b89dd7..4969de7d 100644 --- a/rcc_dp/frequency_estimation/config.py +++ b/rcc_dp/frequency_estimation/config.py @@ -43,7 +43,7 @@ def get_config(): alpha=1.0, # Variation. vary="eps", # Can take one of "cc", "k", "n", "eps". 
- cc_space=[6, 8, 10, 12], + cc_space=[6, 8, 10, 12, 14], k_space=[200, 400, 600, 800, 1000], n_space=[2000, 4000, 6000, 8000, 10000], eps_space=list(range(1, 9)), From 8a8263dc4ece1a4b46180b721e77afa47c33ad7d Mon Sep 17 00:00:00 2001 From: Abhin Shah Date: Tue, 22 Feb 2022 21:13:52 -0500 Subject: [PATCH 3/4] Fixing pylint issues --- rcc_dp/frequency_estimation/config.py | 2 +- rcc_dp/frequency_estimation/experiment.py | 83 ++++++++++--------- .../experiment_coding_cost.py | 70 ++++++++-------- .../frequency_estimation/experiment_test.py | 2 +- rcc_dp/frequency_estimation/miracle.py | 22 ++--- rcc_dp/frequency_estimation/rhr.py | 20 ++--- rcc_dp/frequency_estimation/ss.py | 10 ++- rcc_dp/frequency_estimation/unbias.py | 12 +-- rcc_dp/mean_estimation/experiment.py | 10 +-- .../mean_estimation/experiment_coding_cost.py | 61 +++++++------- 10 files changed, 149 insertions(+), 143 deletions(-) diff --git a/rcc_dp/frequency_estimation/config.py b/rcc_dp/frequency_estimation/config.py index 4969de7d..47e8fef4 100644 --- a/rcc_dp/frequency_estimation/config.py +++ b/rcc_dp/frequency_estimation/config.py @@ -55,4 +55,4 @@ def get_config(): ) config = config_dict.ConfigDict(config) config.lock() # Prevent addition of new fields. - return config \ No newline at end of file + return config diff --git a/rcc_dp/frequency_estimation/experiment.py b/rcc_dp/frequency_estimation/experiment.py index 3ab43d91..c1e3ac60 100644 --- a/rcc_dp/frequency_estimation/experiment.py +++ b/rcc_dp/frequency_estimation/experiment.py @@ -11,17 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-"""Experiment definitions (i.e., evaluation of miracle, rhr, subset selection -methods when either the data dimension k, the number of users n, or the +"""Experiment definitions (i.e., evaluation of miracle, rhr, subset selection +methods when either the data dimension k, the number of users n, or the privacy parameter epsilon is varied).""" import json import math import time import matplotlib -matplotlib.rcParams['ps.useafm'] = True -matplotlib.rcParams['pdf.use14corefonts'] = True -matplotlib.rcParams['text.usetex'] = True import matplotlib.pyplot as plt import matplotlib.lines as mlines import numpy as np @@ -30,26 +27,32 @@ from rcc_dp.frequency_estimation import rhr from rcc_dp.frequency_estimation import ss from rcc_dp.frequency_estimation import unbias +matplotlib.rcParams['ps.useafm'] = True +matplotlib.rcParams['pdf.use14corefonts'] = True +matplotlib.rcParams['text.usetex'] = True def generate_geometric_distribution(k,lbd): - elements = range(0,k) - prob = [(1-lbd)*math.pow(lbd,x)/(1-math.pow(lbd,k)) for x in elements] - return prob + """Generate the discrete geometric distribution.""" + elements = range(0,k) + prob = [(1-lbd)*math.pow(lbd,x)/(1-math.pow(lbd,k)) for x in elements] + return prob def generate_uniform_distribution(k): - raw_distribution = [1] * k - sum_raw = sum(raw_distribution) - prob = [float(y)/float(sum_raw) for y in raw_distribution] - return prob + """Generate the discrete uniform distribution.""" + raw_distribution = [1] * k + sum_raw = sum(raw_distribution) + prob = [float(y)/float(sum_raw) for y in raw_distribution] + return prob def generate_zipf_distribution(k,degree): - raw_distribution = [1/(float(i)**(degree)) for i in range(1,k+1)] - sum_raw = sum(raw_distribution) - prob = [float(y)/float(sum_raw) for y in raw_distribution] - return prob + """Generate the discrete zipf distribution.""" + raw_distribution = [1/(float(i)**(degree)) for i in range(1,k+1)] + sum_raw = sum(raw_distribution) + prob = [float(y)/float(sum_raw) for 
y in raw_distribution] + return prob def evaluate(work_path, config, file_open=open): @@ -125,16 +128,16 @@ def evaluate(work_path, config, file_open=open): x_miracle = np.zeros((k,n)) for i in range(n): if config.encoding_type == "fast": - x_miracle[:,i] = miracle.encode_decode_miracle_fast(i+itr*n, x[i], + x_miracle[:,i] = miracle.encode_decode_miracle_fast(i+itr*n, x[i], k, epsilon_target/2, 2**coding_cost) else: - _, _, index = miracle.encoder(i+itr*n, x[i], k, epsilon_target/2, + _, _, index = miracle.encoder(i+itr*n, x[i], k, epsilon_target/2, 2**coding_cost) - x_miracle[:,i] = miracle.decoder(i+itr*n, index, k, epsilon_target/2, + x_miracle[:,i] = miracle.decoder(i+itr*n, index, k, epsilon_target/2, 2**coding_cost) - prob_miracle = unbias.unbias_miracle(k, epsilon_target/2, 2**coding_cost, + prob_miracle = unbias.unbias_miracle(k, epsilon_target/2, 2**coding_cost, x_miracle.T, n, normalization = 1) - miracle_error[itr, step] = np.linalg.norm([p_i - phat_i for p_i, phat_i + miracle_error[itr, step] = np.linalg.norm([p_i - phat_i for p_i, phat_i in zip(prob, prob_miracle)], ord=1) if config.run_modified_miracle: @@ -144,49 +147,49 @@ def evaluate(work_path, config, file_open=open): x_modified_miracle[:,i] = miracle.encode_decode_modified_miracle_fast( i+itr*n, x[i], k, alpha*epsilon_target, 2**coding_cost) else: - _, pi, _ = miracle.encoder(i+itr*n, x[i], k, alpha*epsilon_target, + _, pi, _ = miracle.encoder(i+itr*n, x[i], k, alpha*epsilon_target, 2**coding_cost) expected_beta = np.ceil(k/(np.exp(epsilon_target)+1))/k - pi_all = modify_pi.modify_pi(pi, eta, epsilon_target, + pi_all = modify_pi.modify_pi(pi, eta, epsilon_target, (np.exp(epsilon_target/2))/(1+expected_beta*(np.exp(epsilon_target)-1))) index = np.random.choice(2**coding_cost, 1, p=pi_all[-1])[0] - x_modified_miracle[:,i] = miracle.decoder(i+itr*n, index, k, + x_modified_miracle[:,i] = miracle.decoder(i+itr*n, index, k, alpha*epsilon_target, 2**coding_cost) - prob_modified_miracle = 
unbias.unbias_modified_miracle(k, alpha*epsilon_target, + prob_modified_miracle = unbias.unbias_modified_miracle(k, alpha*epsilon_target, 2**coding_cost, x_modified_miracle.T, n, normalization = 1) - modified_miracle_error[itr, step] = np.linalg.norm([p_i - phat_i for p_i, phat_i + modified_miracle_error[itr, step] = np.linalg.norm([p_i - phat_i for p_i, phat_i in zip(prob, prob_modified_miracle)], ord=1) if config.run_approx_miracle: x_approx_miracle = np.zeros((k,n)) approx_coding_cost = int(np.ceil( config.approx_coding_cost_multiplier*epsilon_target/np.log(2) + config.approx_t)) - epsilon_approx = miracle.get_approx_epsilon(epsilon_target, k, + epsilon_approx = miracle.get_approx_epsilon(epsilon_target, k, 2**approx_coding_cost, delta) for i in range(n): if config.encoding_type == "fast": - x_approx_miracle[:,i] = miracle.encode_decode_miracle_fast(i+itr*n, x[i], + x_approx_miracle[:,i] = miracle.encode_decode_miracle_fast(i+itr*n, x[i], k, epsilon_approx, 2**coding_cost) else: _, _, index = miracle.encoder(i+itr*n, x[i], k, epsilon_approx, 2**coding_cost) - x_approx_miracle[:,i] = miracle.decoder(i+itr*n, index, k, epsilon_approx, + x_approx_miracle[:,i] = miracle.decoder(i+itr*n, index, k, epsilon_approx, 2**coding_cost) - prob_approx_miracle = unbias.unbias_miracle(k, epsilon_approx, 2**coding_cost, + prob_approx_miracle = unbias.unbias_miracle(k, epsilon_approx, 2**coding_cost, x_approx_miracle.T, n, normalization = 1) - approx_miracle_error[itr, step] = np.linalg.norm([p_i - phat_i for p_i, phat_i + approx_miracle_error[itr, step] = np.linalg.norm([p_i - phat_i for p_i, phat_i in zip(prob, prob_approx_miracle)], ord=1) if config.run_ss: x_ss = ss.encode_string_fast(k, epsilon_target, x) prob_ss = ss.decode_string(k, epsilon_target, x_ss, n, normalization = 1) - ss_error[itr, step] = np.linalg.norm([p_i - phat_i for p_i, phat_i + ss_error[itr, step] = np.linalg.norm([p_i - phat_i for p_i, phat_i in zip(prob, prob_ss)], ord=1) if config.run_rhr: x_rhr = 
rhr.encode_string(k, epsilon_target, coding_cost, x) - prob_rhr = rhr.decode_string_fast(k, epsilon_target, coding_cost, x_rhr, + prob_rhr = rhr.decode_string_fast(k, epsilon_target, coding_cost, x_rhr, normalization = 1) # estimate the original underlying distribution - rhr_error[itr, step] = np.linalg.norm([p_i - phat_i for p_i, phat_i + rhr_error[itr, step] = np.linalg.norm([p_i - phat_i for p_i, phat_i in zip(prob, prob_rhr)], ord=1) if config.run_approx_miracle: @@ -226,28 +229,28 @@ def evaluate(work_path, config, file_open=open): plt.errorbar( vary_space, np.mean(approx_miracle_error, axis=0), - yerr=np.std(approx_miracle_error, axis=0)/np.sqrt(num_itr), + yerr=np.std(approx_miracle_error, axis=0)/np.sqrt(config.num_itr), linewidth = 3.0, label="MRC") if config.run_miracle: plt.errorbar( vary_space, np.mean(miracle_error, axis=0), - yerr=np.std(miracle_error, axis=0)/np.sqrt(num_itr), + yerr=np.std(miracle_error, axis=0)/np.sqrt(config.num_itr), linewidth = 3.0, label="MRC") if config.run_modified_miracle: plt.errorbar( vary_space, np.mean(modified_miracle_error, axis=0), - yerr=np.std(modified_miracle_error, axis=0)/np.sqrt(num_itr), + yerr=np.std(modified_miracle_error, axis=0)/np.sqrt(config.num_itr), linewidth = 3.0, label="MMRC") if config.run_ss: plt.errorbar( vary_space, np.mean(ss_error, axis=0), - yerr=np.std(ss_error, axis=0)/np.sqrt(num_itr), + yerr=np.std(ss_error, axis=0)/np.sqrt(config.num_itr), ls='--', linewidth = 3.0, label="Subset Selection") @@ -255,7 +258,7 @@ def evaluate(work_path, config, file_open=open): plt.errorbar( vary_space, np.mean(rhr_error, axis=0), - yerr=np.std(rhr_error, axis=0)/np.sqrt(num_itr), + yerr=np.std(rhr_error, axis=0)/np.sqrt(config.num_itr), ls='--', linewidth = 3.0, label="RHR") @@ -299,4 +302,4 @@ def evaluate(work_path, config, file_open=open): if config.run_rhr: with file_open(work_path + "/rhr_error.csv", "w") as f: - np.savetxt(f, rhr_error, delimiter=",") \ No newline at end of file + np.savetxt(f, 
rhr_error, delimiter=",") diff --git a/rcc_dp/frequency_estimation/experiment_coding_cost.py b/rcc_dp/frequency_estimation/experiment_coding_cost.py index 41804c56..b017245b 100644 --- a/rcc_dp/frequency_estimation/experiment_coding_cost.py +++ b/rcc_dp/frequency_estimation/experiment_coding_cost.py @@ -11,16 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Experiment definitions (i.e., evaluation of miracle, rhr, subset selection +"""Experiment definitions (i.e., evaluation of miracle, rhr, subset selection methods when the coding cost is varied).""" import json import math import time import matplotlib -matplotlib.rcParams['ps.useafm'] = True -matplotlib.rcParams['pdf.use14corefonts'] = True -matplotlib.rcParams['text.usetex'] = True import matplotlib.pyplot as plt import matplotlib.lines as mlines import numpy as np @@ -29,26 +26,32 @@ from rcc.frequency_estimation import rhr from rcc.frequency_estimation import ss from rcc.frequency_estimation import unbias +matplotlib.rcParams['ps.useafm'] = True +matplotlib.rcParams['pdf.use14corefonts'] = True +matplotlib.rcParams['text.usetex'] = True def generate_geometric_distribution(k,lbd): - elements = range(0,k) - prob = [(1-lbd)*math.pow(lbd,x)/(1-math.pow(lbd,k)) for x in elements] - return prob + """Generate the discrete geometric distribution.""" + elements = range(0,k) + prob = [(1-lbd)*math.pow(lbd,x)/(1-math.pow(lbd,k)) for x in elements] + return prob def generate_uniform_distribution(k): - raw_distribution = [1] * k - sum_raw = sum(raw_distribution) - prob = [float(y)/float(sum_raw) for y in raw_distribution] - return prob + """Generate the discrete uniform distribution.""" + raw_distribution = [1] * k + sum_raw = sum(raw_distribution) + prob = [float(y)/float(sum_raw) for y in raw_distribution] + return prob def generate_zipf_distribution(k,degree): - raw_distribution 
= [1/(float(i)**(degree)) for i in range(1,k+1)] - sum_raw = sum(raw_distribution) - prob = [float(y)/float(sum_raw) for y in raw_distribution] - return prob + """Generate the discrete zipf distribution.""" + raw_distribution = [1/(float(i)**(degree)) for i in range(1,k+1)] + sum_raw = sum(raw_distribution) + prob = [float(y)/float(sum_raw) for y in raw_distribution] + return prob def evaluate(work_path, config, file_open=open): @@ -58,7 +61,6 @@ def evaluate(work_path, config, file_open=open): start_time = time.time() - delta = config.delta alpha = config.alpha # Get default values. k = config.k @@ -75,7 +77,7 @@ def evaluate(work_path, config, file_open=open): rhr_coding_cost = epsilon_target - for itr in range(num_itr): + for itr in range(config.num_itr): print("itr = %d" % itr) print("epsilon target = " + str(epsilon_target)) print("n = " + str(n)) @@ -99,18 +101,18 @@ def evaluate(work_path, config, file_open=open): "distribution should be either be geometric, zipf, uniform.") x = np.random.choice(k, n, p=prob) - + if config.run_ss: x_ss = ss.encode_string_fast(k, epsilon_target, x) prob_ss = ss.decode_string(k, epsilon_target, x_ss, n, normalization = 1) - ss_error[itr, 0] = np.linalg.norm([p_i - phat_i for p_i, phat_i + ss_error[itr, 0] = np.linalg.norm([p_i - phat_i for p_i, phat_i in zip(prob, prob_ss)], ord=1) if config.run_rhr: - x_rhr = rhr.encode_string(k, epsilon_target, coding_cost, x) - prob_rhr = rhr.decode_string_fast(k, epsilon_target, coding_cost, x_rhr, + x_rhr = rhr.encode_string(k, epsilon_target, rhr_coding_cost, x) + prob_rhr = rhr.decode_string_fast(k, epsilon_target, rhr_coding_cost, x_rhr, normalization = 1) # estimate the original underlying distribution - rhr_error[itr, 0] = np.linalg.norm([p_i - phat_i for p_i, phat_i + rhr_error[itr, 0] = np.linalg.norm([p_i - phat_i for p_i, phat_i in zip(prob, prob_rhr)], ord=1) for step, vary_parameter in enumerate(vary_space): @@ -124,17 +126,17 @@ def evaluate(work_path, config, file_open=open): 
x_modified_miracle[:,i] = miracle.encode_decode_modified_miracle_fast( i+itr*n, x[i], k, alpha*epsilon_target, 2**coding_cost) else: - _, pi, _ = miracle.encoder(i+itr*n, x[i], k, alpha*epsilon_target, + _, pi, _ = miracle.encoder(i+itr*n, x[i], k, alpha*epsilon_target, 2**coding_cost) expected_beta = np.ceil(k/(np.exp(epsilon_target)+1))/k - pi_all = modify_pi.modify_pi(pi, eta, epsilon_target, + pi_all = modify_pi.modify_pi(pi, eta, epsilon_target, (np.exp(epsilon_target/2))/(1+expected_beta*(np.exp(epsilon_target)-1))) index = np.random.choice(2**coding_cost, 1, p=pi_all[-1])[0] - x_modified_miracle[:,i] = miracle.decoder(i+itr*n, index, k, + x_modified_miracle[:,i] = miracle.decoder(i+itr*n, index, k, alpha*epsilon_target, 2**coding_cost) - prob_modified_miracle = unbias.unbias_modified_miracle(k, alpha*epsilon_target, + prob_modified_miracle = unbias.unbias_modified_miracle(k, alpha*epsilon_target, 2**coding_cost, x_modified_miracle.T, n, normalization = 1) - modified_miracle_error[itr, step] = np.linalg.norm([p_i - phat_i for p_i, phat_i + modified_miracle_error[itr, step] = np.linalg.norm([p_i - phat_i for p_i, phat_i in zip(prob, prob_modified_miracle)], ord=1) print(time.time() - start_time) @@ -159,16 +161,16 @@ def evaluate(work_path, config, file_open=open): linewidth = 3.0, label="MMRC") if config.run_ss: - line1 = plt.errorbar(vary_space, - [np.mean(ss_error, axis=0)[0]]*len(vary_space), - yerr = [np.std(ss_error, axis=0)[0]/np.sqrt(num_itr)]*len(vary_space), + line1 = plt.errorbar(vary_space, + [np.mean(ss_error, axis=0)[0]]*len(vary_space), + yerr = [np.std(ss_error, axis=0)[0]/np.sqrt(config.num_itr)]*len(vary_space), ls='--', linewidth = 3.0, label="Subset Selection") if config.run_rhr: - line2 = plt.errorbar(vary_space, - [np.mean(rhr_error, axis=0)[0]]*len(vary_space), - yerr = [np.std(rhr_error, axis=0)[0]/np.sqrt(num_itr)]*len(vary_space), + line2 = plt.errorbar(vary_space, + [np.mean(rhr_error, axis=0)[0]]*len(vary_space), + yerr = 
[np.std(rhr_error, axis=0)[0]/np.sqrt(config.num_itr)]*len(vary_space), ls='--', linewidth = 3.0, label="RHR") @@ -195,4 +197,4 @@ def evaluate(work_path, config, file_open=open): if config.run_rhr: with file_open(work_path + "/rhr_error.csv", "w") as f: - np.savetxt(f, rhr_error, delimiter=",") \ No newline at end of file + np.savetxt(f, rhr_error, delimiter=",") diff --git a/rcc_dp/frequency_estimation/experiment_test.py b/rcc_dp/frequency_estimation/experiment_test.py index af731c32..32abd110 100644 --- a/rcc_dp/frequency_estimation/experiment_test.py +++ b/rcc_dp/frequency_estimation/experiment_test.py @@ -38,4 +38,4 @@ def test_evaluate_does_not_fail(self): if __name__ == "__main__": - absltest.main() \ No newline at end of file + absltest.main() diff --git a/rcc_dp/frequency_estimation/miracle.py b/rcc_dp/frequency_estimation/miracle.py index 5f748b54..84c9432c 100644 --- a/rcc_dp/frequency_estimation/miracle.py +++ b/rcc_dp/frequency_estimation/miracle.py @@ -13,9 +13,9 @@ # limitations under the License. """Minimal random coding (likelihood decoder) definitions. -This is the code to be used to simulate the minimum random coding algorithm -(tailored to subset selection). The algorithm was introduced by Havasi et al. -in "Minimal Random Code Learning: Getting Bits Back from Compressed Model +This is the code to be used to simulate the minimum random coding algorithm +(tailored to subset selection). The algorithm was introduced by Havasi et al. +in "Minimal Random Code Learning: Getting Bits Back from Compressed Model Parameters" - https://arxiv.org/pdf/1810.00440.pdf. 
For brevity, we may refer to it as 'MIRACLE', although technically this refers @@ -114,13 +114,13 @@ def get_approx_epsilon(epsilon_target, k, number_candidates, delta): def encode_decode_miracle_fast(seed, x, k, epsilon, number_candidates): - """A fast implementation of the miracle protocol -- instead of - generating number_candidates samples, generate one sample with + """A fast implementation of the miracle protocol -- instead of + generating number_candidates samples, generate one sample with true expectation. """ d = int(np.ceil(k/(np.exp(epsilon)+1))) num_cand_in_cap = np.random.binomial(number_candidates, d/k, size=None) - pi_in = np.exp(epsilon)/(num_cand_in_cap*np.exp(epsilon) + pi_in = np.exp(epsilon)/(num_cand_in_cap*np.exp(epsilon) + (number_candidates-num_cand_in_cap)) prob_sample_from_cap = num_cand_in_cap*pi_in @@ -140,8 +140,8 @@ def encode_decode_miracle_fast(seed, x, k, epsilon, number_candidates): def encode_decode_modified_miracle_fast(seed, x, k, epsilon, number_candidates): - """A fast implementation of the modified miracle protocol -- instead of - generating number_candidates samples, generate one sample with + """A fast implementation of the modified miracle protocol -- instead of + generating number_candidates samples, generate one sample with true expectation. 
""" d = int(np.ceil(k/(np.exp(epsilon)+1))) @@ -154,10 +154,10 @@ def encode_decode_modified_miracle_fast(seed, x, k, epsilon, number_candidates): expected_beta = np.ceil(k/(np.exp(epsilon)+1))/k pi_in = c1/(num_cand_in_cap*c1+(number_candidates-num_cand_in_cap)*c2) - tilde_pi_in = pi_in*(1+beta*(np.exp(epsilon)-1))/((1 + tilde_pi_in = pi_in*(1+beta*(np.exp(epsilon)-1))/((1 + expected_beta*(np.exp(epsilon)-1))) if beta > expected_beta: - tilde_pi_in = tilde_pi_in*(beta+expected_beta*(np.exp(epsilon) + tilde_pi_in = tilde_pi_in*(beta+expected_beta*(np.exp(epsilon) - 1))/(beta*np.exp(epsilon)) prob_sample_from_cap = num_cand_in_cap*tilde_pi_in @@ -174,4 +174,4 @@ def encode_decode_modified_miracle_fast(seed, x, k, epsilon, number_candidates): rs = np.random.RandomState(seed) rs.shuffle(z) z = z[:x] + [0] + z[x:] - return np.array(z) \ No newline at end of file + return np.array(z) diff --git a/rcc_dp/frequency_estimation/rhr.py b/rcc_dp/frequency_estimation/rhr.py index dad7a0df..8e021f5d 100644 --- a/rcc_dp/frequency_estimation/rhr.py +++ b/rcc_dp/frequency_estimation/rhr.py @@ -30,9 +30,9 @@ def encode_string(dim, epsilon, comm, x): # Pad dim to the power of 2. padded_d = int(math.pow(2, math.ceil(math.log(dim, 2)))) - # Calculate the effective communication cost + # Calculate the effective communication cost # i.e., min(comm, log(e^\epsilon)). - eff_comm = min(comm, math.ceil(epsilon*math.log(math.e, 2)), + eff_comm = min(comm, math.ceil(epsilon*math.log(math.e, 2)), math.ceil(math.log(dim, 2))+1) # Set the block size. 
block_size = int(padded_d/math.pow(2, eff_comm-1)) @@ -45,7 +45,7 @@ def encode_string(dim, epsilon, comm, x): if get_hadamard_entry(padded_d, j, x_individual) == -1: loc_sign[idx] = loc_sign[idx] + 1 - z = rr_encode_string(int(math.pow(2, eff_comm)), epsilon, + z = rr_encode_string(int(math.pow(2, eff_comm)), epsilon, np.array(loc_sign)) return z @@ -57,7 +57,7 @@ def rr_encode_string(alphabet_size, epsilon, samples): private_samples_rr = np.copy(samples) # Determine which samples need to be noised (i.e., flipped). - flip = np.random.random_sample(n) < (alphabet_size + flip = np.random.random_sample(n) < (alphabet_size - 1)/(math.exp(epsilon) + alphabet_size - 1) flip_samples = samples[flip] @@ -79,9 +79,9 @@ def decode_string_fast(dim, epsilon, comm, z, normalization = 1): l = len(z) # Pad dim to the power of 2. padded_d = int(math.pow(2, math.ceil(math.log(dim,2)))) - # Calculate the effective communication cost + # Calculate the effective communication cost # i.e., min(comm, log(e^\epsilon)). - eff_comm = min(comm, math.ceil(epsilon*math.log(math.e, 2)), + eff_comm = min(comm, math.ceil(epsilon*math.log(math.e, 2)), math.ceil(math.log(dim, 2))+1) # Set the block size. block_size = int(padded_d/math.pow(2, eff_comm-1)) @@ -92,16 +92,16 @@ def decode_string_fast(dim, epsilon, comm, z, normalization = 1): # Create histograms to specify the empirical distributions of each group. histograms = np.zeros((block_size, int(padded_d/block_size))) for g_idx in range(block_size): - g_count,g_axis = np.histogram(group_list[g_idx], + g_count, _ = np.histogram(group_list[g_idx], range(int(math.pow(2, eff_comm))+1)) histograms[g_idx] = g_count[::2] - g_count[1::2] - histograms[g_idx] = histograms[g_idx] * (math.exp(epsilon) + histograms[g_idx] = histograms[g_idx] * (math.exp(epsilon) + math.pow(2, eff_comm)-1)/(math.exp(epsilon)-1)*(block_size/n) # Obtain estimator of q. 
q = np.zeros((block_size, int(padded_d/block_size))) for j in range(block_size): - q[j, :] = fast_inverse_hadamard_transform(int(padded_d/block_size), + q[j, :] = fast_inverse_hadamard_transform(int(padded_d/block_size), histograms[j, :]) q = q.reshape((padded_d, ), order = 'F') @@ -138,4 +138,4 @@ def fast_inverse_hadamard_transform(k, dist): trans1 = fast_inverse_hadamard_transform(k//2, dist1) trans2 = fast_inverse_hadamard_transform(k//2, dist2) trans = np.concatenate((trans1+ trans2, trans1 - trans2)) - return trans \ No newline at end of file + return trans diff --git a/rcc_dp/frequency_estimation/ss.py b/rcc_dp/frequency_estimation/ss.py index e3763741..86ac8e6d 100644 --- a/rcc_dp/frequency_estimation/ss.py +++ b/rcc_dp/frequency_estimation/ss.py @@ -24,6 +24,7 @@ def probability_project_simplex(p): + """Project the probabilities on the simplex.""" k = len(p) # Infer the size of the alphabet. p_sorted = np.sort(p) p_sorted[:] = p_sorted[::-1] @@ -38,15 +39,16 @@ def probability_normalize(p): + """Normalize the probabilities so that they sum to 1.""" p = np.maximum(p,0) # Map it to be positive. norm = np.sum(p) - p = np.true_divide(p,norm) # Ensure the l_1 norm is one. + p = np.true_divide(p,norm) return p def encode_string_fast(k, epsilon, x): - """A fast implementation of the subset selection protocol -- instead - of selecting exactly d ones, set each bit to be one independently + """A fast implementation of the subset selection protocol -- instead + of selecting exactly d ones, set each bit to be one independently with true expectation. """ d = int(np.ceil(k/(np.exp(epsilon)+1))) @@ -79,4 +81,4 @@ def decode_string(k, epsilon, z, length, normalization = 1): if normalization == 1: # Project on the simplex.
p_estimate = probability_project_simplex(p_estimate) - return p_estimate \ No newline at end of file + return p_estimate diff --git a/rcc_dp/frequency_estimation/unbias.py b/rcc_dp/frequency_estimation/unbias.py index b6b199aa..f45b2b3a 100644 --- a/rcc_dp/frequency_estimation/unbias.py +++ b/rcc_dp/frequency_estimation/unbias.py @@ -33,7 +33,7 @@ def unbias_miracle(k, epsilon, number_candidates, z, n, normalization = 1): number_candidates: The number of candidates to be sampled. z: The privatized data. n: The number of users. - normalization: Indicator whether to clip and normalize or + normalization: Indicator whether to clip and normalize or project on the simplex Returns: @@ -45,7 +45,7 @@ def unbias_miracle(k, epsilon, number_candidates, z, n, normalization = 1): beta = np.array(range(number_candidates+1))/number_candidates # The probability with which you choose a candidate inside the cap. pi_in = 1/number_candidates*(c1/(beta*c1+(1-beta)*c2)) - expectation = np.sum(stats.binom.pmf(range(number_candidates+1), + expectation = np.sum(stats.binom.pmf(range(number_candidates+1), number_candidates, d/k)*range(number_candidates+1)*pi_in) b_hat = (d - expectation)/(k-1) m_hat = k*expectation/(k-1) - d/(k-1) @@ -61,7 +61,7 @@ def unbias_miracle(k, epsilon, number_candidates, z, n, normalization = 1): return p_estimate -def unbias_modified_miracle(k, epsilon, number_candidates, z, n, +def unbias_modified_miracle(k, epsilon, number_candidates, z, n, normalization = 1): """Get the unbiased estimate for modified miracle. @@ -71,7 +71,7 @@ def unbias_modified_miracle(k, epsilon, number_candidates, z, n, number_candidates: The number of candidates to be sampled. z: The privatized data. n: The number of users. 
- normalization: Indicator whether to clip and normalize or + normalization: Indicator whether to clip and normalize or project on the simplex Returns: @@ -96,7 +96,7 @@ def unbias_modified_miracle(k, epsilon, number_candidates, z, n, # The probability with which you choose a candidate inside the cap. tilde_pi_in = indicator * tilde_pi_in_1 + (1 - indicator) * tilde_pi_in_2 expectation = np.sum( - stats.binom.pmf(range(number_candidates + 1), number_candidates, + stats.binom.pmf(range(number_candidates + 1), number_candidates, d / k) * range(number_candidates + 1) * tilde_pi_in) b_tilde = (d - expectation)/(k-1) m_tilde = k*expectation/(k-1) - d/(k-1) @@ -109,4 +109,4 @@ def unbias_modified_miracle(k, epsilon, number_candidates, z, n, elif normalization == 1: # Project on the simplex. p_estimate = ss.probability_project_simplex(p_estimate) - return p_estimate \ No newline at end of file + return p_estimate diff --git a/rcc_dp/mean_estimation/experiment.py b/rcc_dp/mean_estimation/experiment.py index 4dd401b7..5d0c01e6 100644 --- a/rcc_dp/mean_estimation/experiment.py +++ b/rcc_dp/mean_estimation/experiment.py @@ -11,17 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-"""Experiment definitions (i.e., evaluation of miracle, sqkr, privunit methods -when either the data dimension d, the number of users n, or the privacy +"""Experiment definitions (i.e., evaluation of miracle, sqkr, privunit methods +when either the data dimension d, the number of users n, or the privacy parameter epsilon is varied).""" import json import math import time import matplotlib -matplotlib.rcParams['ps.useafm'] = True -matplotlib.rcParams['pdf.use14corefonts'] = True -matplotlib.rcParams['text.usetex'] = True import matplotlib.pyplot as plt import matplotlib.lines as mlines import numpy as np @@ -31,6 +28,9 @@ from rcc_dp.mean_estimation import modify_pi from rcc_dp.mean_estimation import privunit from rcc_dp.mean_estimation import sqkr +matplotlib.rcParams['ps.useafm'] = True +matplotlib.rcParams['pdf.use14corefonts'] = True +matplotlib.rcParams['text.usetex'] = True def evaluate(work_path, config, file_open=open): diff --git a/rcc_dp/mean_estimation/experiment_coding_cost.py b/rcc_dp/mean_estimation/experiment_coding_cost.py index 21dceb28..fc037ced 100644 --- a/rcc_dp/mean_estimation/experiment_coding_cost.py +++ b/rcc_dp/mean_estimation/experiment_coding_cost.py @@ -11,16 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-"""Experiment definitions (i.e., evaluation of miracle, sqkr, privunit methods +"""Experiment definitions (i.e., evaluation of miracle, sqkr, privunit methods when the coding cost is varied).""" import json import math import time import matplotlib -matplotlib.rcParams['ps.useafm'] = True -matplotlib.rcParams['pdf.use14corefonts'] = True -matplotlib.rcParams['text.usetex'] = True import matplotlib.pyplot as plt import matplotlib.lines as mlines import numpy as np @@ -30,6 +27,9 @@ from rcc_dp.mean_estimation import modify_pi from rcc_dp.mean_estimation import privunit from rcc_dp.mean_estimation import sqkr +matplotlib.rcParams['ps.useafm'] = True +matplotlib.rcParams['pdf.use14corefonts'] = True +matplotlib.rcParams['text.usetex'] = True def evaluate(work_path, config, file_open=open): @@ -39,7 +39,6 @@ def evaluate(work_path, config, file_open=open): start_time = time.time() - delta = config.delta budget = config.budget alpha = config.alpha # Get default values. @@ -57,7 +56,7 @@ def evaluate(work_path, config, file_open=open): sqkr_coding_cost = epsilon_target - for itr in range(num_itr): + for itr in range(config.num_itr): print("itr = %d" % itr) print("epsilon target = " + str(epsilon_target)) print("n = " + str(n)) @@ -68,20 +67,20 @@ def evaluate(work_path, config, file_open=open): print("alpha = " + str(alpha)) if config.data == "unbiased_data": - x = np.random.normal(0, 1, (d, n)) - x /= np.linalg.norm(x, axis=0) - elif config.data == "biased_data": - x = np.zeros((d, n)) - x[:, 0::2] = np.random.normal(10, 1, (d, (n + 1) // 2)) - x[:, 1::2] = np.random.normal(1, 1, (d, n // 2)) - x /= np.linalg.norm(x, axis=0) - elif config.data == "same_data": - x = np.random.normal(0, 1, (d, 1)) - x /= np.linalg.norm(x, axis=0) - x = np.repeat(x, n, axis=1) - else: - raise ValueError( - "data should be either be biased_data, unbiased_data, same_data.") + x = np.random.normal(0, 1, (d, n)) + x /= np.linalg.norm(x, axis=0) + elif config.data == "biased_data": + x = 
np.zeros((d, n)) + x[:, 0::2] = np.random.normal(10, 1, (d, (n + 1) // 2)) + x[:, 1::2] = np.random.normal(1, 1, (d, n // 2)) + x /= np.linalg.norm(x, axis=0) + elif config.data == "same_data": + x = np.random.normal(0, 1, (d, 1)) + x /= np.linalg.norm(x, axis=0) + x = np.repeat(x, n, axis=1) + else: + raise ValueError( + "data should be either be biased_data, unbiased_data, same_data.") if config.run_privunit: x_privunit, _ = privunit.apply_privunit(x, epsilon_target, budget) @@ -102,7 +101,7 @@ def evaluate(work_path, config, file_open=open): for step, vary_parameter in enumerate(vary_space): coding_cost = vary_parameter - print("coding cost = %d" % coding_cost) + print("coding cost = %d" % coding_cost) if config.run_modified_miracle: x_modified_miracle = np.zeros((d, n)) @@ -124,9 +123,9 @@ def evaluate(work_path, config, file_open=open): print(time.time() - start_time) print("--------------") - if config.run_approx_miracle: + if config.run_modified_miracle: print("approx miracle mse:") - print(np.mean(approx_miracle_mse, axis=0)) + print(np.mean(modified_miracle_mse, axis=0)) if config.run_privunit: print("privunit mse:") print(np.mean(privunit_mse, axis=0)) @@ -140,20 +139,20 @@ def evaluate(work_path, config, file_open=open): plt.errorbar( vary_space, np.mean(modified_miracle_mse, axis=0), - yerr=np.std(modified_miracle_mse, axis=0)/np.sqrt(num_itr), + yerr=np.std(modified_miracle_mse, axis=0)/np.sqrt(config.num_itr), linewidth = 3.0, label="MMRC") if config.run_privunit: - line1 = plt.errorbar(vary_space, - [np.mean(privunit_mse, axis=0)[0]]*len(vary_space), - yerr = [np.std(privunit_mse, axis=0)[0]/np.sqrt(num_itr)]*len(vary_space), + line1 = plt.errorbar(vary_space, + [np.mean(privunit_mse, axis=0)[0]]*len(vary_space), + yerr = [np.std(privunit_mse, axis=0)[0]/np.sqrt(config.num_itr)]*len(vary_space), ls='--', linewidth = 3.0, label="PrivUnit$_{2}$") if config.run_sqkr: - line2 = plt.errorbar(vary_space, - [np.mean(sqkr_mse, axis=0)[0]]*len(vary_space), - 
yerr = [np.std(sqkr_mse, axis=0)[0]/np.sqrt(num_itr)]*len(vary_space), + line2 = plt.errorbar(vary_space, + [np.mean(sqkr_mse, axis=0)[0]]*len(vary_space), + yerr = [np.std(sqkr_mse, axis=0)[0]/np.sqrt(config.num_itr)]*len(vary_space), ls='--', linewidth = 3.0, label="SQKR") @@ -180,4 +179,4 @@ def evaluate(work_path, config, file_open=open): if config.run_sqkr: with file_open(work_path + "/sqkr_mse.csv", "w") as f: - np.savetxt(f, sqkr_mse, delimiter=",") \ No newline at end of file + np.savetxt(f, sqkr_mse, delimiter=",") From 441630e18063ccad5206837fbfeb330318f15235 Mon Sep 17 00:00:00 2001 From: Abhin Shah Date: Wed, 23 Feb 2022 20:43:31 -0500 Subject: [PATCH 4/4] Fixing more pylint issues --- .../experiment_coding_cost.py | 7 +++-- rcc_dp/frequency_estimation/miracle.py | 26 +++++++++++++++++-- rcc_dp/frequency_estimation/rhr.py | 7 +++-- rcc_dp/frequency_estimation/ss.py | 12 ++++----- .../mean_estimation/experiment_coding_cost.py | 7 +++-- 5 files changed, 38 insertions(+), 21 deletions(-) diff --git a/rcc_dp/frequency_estimation/experiment_coding_cost.py b/rcc_dp/frequency_estimation/experiment_coding_cost.py index b017245b..5f4dfbfa 100644 --- a/rcc_dp/frequency_estimation/experiment_coding_cost.py +++ b/rcc_dp/frequency_estimation/experiment_coding_cost.py @@ -11,8 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-"""Experiment definitions (i.e., evaluation of miracle, rhr, subset selection -methods when the coding cost is varied).""" +"""Evaluation of miracle, rhr, and subset selection when coding cost is varied.""" import json import math @@ -161,14 +160,14 @@ def evaluate(work_path, config, file_open=open): linewidth = 3.0, label="MMRC") if config.run_ss: - line1 = plt.errorbar(vary_space, + plt.errorbar(vary_space, [np.mean(ss_error, axis=0)[0]]*len(vary_space), yerr = [np.std(ss_error, axis=0)[0]/np.sqrt(config.num_itr)]*len(vary_space), ls='--', linewidth = 3.0, label="Subset Selection") if config.run_rhr: - line2 = plt.errorbar(vary_space, + plt.errorbar(vary_space, [np.mean(rhr_error, axis=0)[0]]*len(vary_space), yerr = [np.std(rhr_error, axis=0)[0]/np.sqrt(config.num_itr)]*len(vary_space), ls='--', diff --git a/rcc_dp/frequency_estimation/miracle.py b/rcc_dp/frequency_estimation/miracle.py index 84c9432c..89c8a648 100644 --- a/rcc_dp/frequency_estimation/miracle.py +++ b/rcc_dp/frequency_estimation/miracle.py @@ -117,6 +117,16 @@ def encode_decode_miracle_fast(seed, x, k, epsilon, number_candidates): """A fast implementation of the miracle protocol -- instead of generating number_candidates samples, generate one sample with true expectation. + + Args: + seed: The random seed to be used by the encoder. + x: The user input data. + k: The dimension of the one-hot encoded x. + epsilon: The privacy parameter epsilon. + number_candidates: The number of candidates to be sampled. + + Returns: + z: The candidate sampled at the decoder. 
""" d = int(np.ceil(k/(np.exp(epsilon)+1))) num_cand_in_cap = np.random.binomial(number_candidates, d/k, size=None) @@ -136,13 +146,24 @@ def encode_decode_miracle_fast(seed, x, k, epsilon, number_candidates): rs = np.random.RandomState(seed) rs.shuffle(z) z = z[:x] + [0] + z[x:] - return np.array(z) + z = np.array(z) + return z def encode_decode_modified_miracle_fast(seed, x, k, epsilon, number_candidates): """A fast implementation of the modified miracle protocol -- instead of generating number_candidates samples, generate one sample with true expectation. + + Args: + seed: The random seed to be used by the encoder. + x: The user input data. + k: The dimension of the one-hot encoded x. + epsilon: The privacy parameter epsilon. + number_candidates: The number of candidates to be sampled. + + Returns: + z: The candidate sampled at the decoder. """ d = int(np.ceil(k/(np.exp(epsilon)+1))) c1 = np.exp(epsilon)/(sc.comb(k-1,d-1)* np.exp(epsilon) + sc.comb(k-1,d)) @@ -174,4 +195,5 @@ def encode_decode_modified_miracle_fast(seed, x, k, epsilon, number_candidates): rs = np.random.RandomState(seed) rs.shuffle(z) z = z[:x] + [0] + z[x:] - return np.array(z) + z = np.array(z) + return z diff --git a/rcc_dp/frequency_estimation/rhr.py b/rcc_dp/frequency_estimation/rhr.py index 8e021f5d..f783f705 100644 --- a/rcc_dp/frequency_estimation/rhr.py +++ b/rcc_dp/frequency_estimation/rhr.py @@ -73,9 +73,7 @@ def rr_encode_string(alphabet_size, epsilon, samples): def decode_string_fast(dim, epsilon, comm, z, normalization = 1): - """Learn the original distribution from the privatized strings when - the input is a matrix consisting of all the bit vectors. - """ + """Learn the original distribution from the privatized strings faster.""" l = len(z) # Pad dim to the power of 2. 
padded_d = int(math.pow(2, math.ceil(math.log(dim,2)))) @@ -87,6 +85,7 @@ def decode_string_fast(dim, epsilon, comm, z, normalization = 1): block_size = int(padded_d/math.pow(2, eff_comm-1)) n = int(l/block_size)*block_size + # The input z is a matrix consisting of all the bit vectors. group_list = np.array(z[:n]).reshape(int(n/block_size), block_size).T # Create histograms to specify the empirical distributions of each group. @@ -130,7 +129,7 @@ def get_hadamard_entry(d, x, y): def fast_inverse_hadamard_transform(k, dist): - """ Performs inverse Hadamard transform.""" + """Performs inverse Hadamard transform.""" if k == 1: return dist dist1 = dist[0 : k//2] diff --git a/rcc_dp/frequency_estimation/ss.py b/rcc_dp/frequency_estimation/ss.py index 86ac8e6d..572f42ad 100644 --- a/rcc_dp/frequency_estimation/ss.py +++ b/rcc_dp/frequency_estimation/ss.py @@ -47,10 +47,7 @@ def probability_normalize(p): def encode_string_fast(k, epsilon, x): - """A fast implementation of the subset selection protocol -- instead - of selecting exactly d ones, set each bit to be one independently - with true expectation. - """ + """A fast implementation of the subset selection protocol.""" d = int(np.ceil(k/(np.exp(epsilon)+1))) p = d*np.exp(epsilon)/(d*np.exp(epsilon)+k-d) q = (d-p)/(k-1) @@ -59,20 +56,21 @@ def encode_string_fast(k, epsilon, x): z = np.zeros((n, k)) flip = np.random.random_sample((n, k)) + # Instead of selecting exactly d ones, set each bit to be one + # independently with true expectation. for i in range(n): z[i,x[i]] = np.logical_or(0,flip[i,x[i]] < p) return np.logical_or(z, flip < q) def decode_string(k, epsilon, z, length, normalization = 1): - """Learn the original distribution from the privatized strings when - the input is a matrix consisting of all the bit vectors. 
- """ + """Learn the original distribution from the privatized strings faster.""" d = int(np.ceil(k/(np.exp(epsilon)+1))) temp1 = ((k-1)*np.exp(epsilon) +1.0*(k-1)*(k-d)/d)/((k-d)*(np.exp(epsilon)-1)) temp2 = ((d-1)*np.exp(epsilon)+k-d) / (1.0*(k-d)*(np.exp(epsilon)-1)) + # The input z is a matrix consisting of all the bit vectors. p_estimate = (1.0*np.sum(z, axis=0)*temp1/length)-temp2 if normalization == 0: diff --git a/rcc_dp/mean_estimation/experiment_coding_cost.py b/rcc_dp/mean_estimation/experiment_coding_cost.py index fc037ced..c8bf74e2 100644 --- a/rcc_dp/mean_estimation/experiment_coding_cost.py +++ b/rcc_dp/mean_estimation/experiment_coding_cost.py @@ -11,8 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Experiment definitions (i.e., evaluation of miracle, sqkr, privunit methods -when the coding cost is varied).""" +"""Evaluation of miracle, sqkr, and privunit when coding cost is varied.""" import json import math @@ -143,14 +142,14 @@ def evaluate(work_path, config, file_open=open): linewidth = 3.0, label="MMRC") if config.run_privunit: - line1 = plt.errorbar(vary_space, + plt.errorbar(vary_space, [np.mean(privunit_mse, axis=0)[0]]*len(vary_space), yerr = [np.std(privunit_mse, axis=0)[0]/np.sqrt(config.num_itr)]*len(vary_space), ls='--', linewidth = 3.0, label="PrivUnit$_{2}$") if config.run_sqkr: - line2 = plt.errorbar(vary_space, + plt.errorbar(vary_space, [np.mean(sqkr_mse, axis=0)[0]]*len(vary_space), yerr = [np.std(sqkr_mse, axis=0)[0]/np.sqrt(config.num_itr)]*len(vary_space), ls='--',