From dd546e9be5de0595a7e6c232f58568ab1594ae79 Mon Sep 17 00:00:00 2001 From: Ananth Raghunathan Date: Wed, 29 Apr 2015 11:17:21 -0700 Subject: [PATCH 01/67] Adding some more testing to assoc analysis. --- tests/analyze_assoc.R | 19 ++++++++- tests/assoc_sim.R | 89 ++++++++++++++++++++++++++++++++++--------- tests/gen_counts.R | 2 +- tests/params.csv | 2 +- tests/uvals.csv | 4 +- 5 files changed, 94 insertions(+), 22 deletions(-) diff --git a/tests/analyze_assoc.R b/tests/analyze_assoc.R index 5d78806f..56b66ea0 100755 --- a/tests/analyze_assoc.R +++ b/tests/analyze_assoc.R @@ -114,9 +114,26 @@ main <- function(opts) { ignore_other = TRUE, params, marginals = NULL, estimate_var = FALSE) + + # Hardcoded place to lookup true distribution + # TODO(pseudorandom): Make this a flag + td <- read.csv(file = "truedist.csv") + ed <- joint_dist$fit + + # L1 distance = 1 - sum(min(td|x, ed|x)) where + # td|x / ed|x projects the distribution to the intersection x of the + # supports of td and ed + rowsi <- intersect(rownames(td), rownames(ed)) + colsi <- intersect(colnames(td), colnames(ed)) + print("L1 DISTANCE") + print(1 - sum(mapply(min, + unlist(td[rowsi, colsi], use.names = FALSE), + unlist(as.data.frame(ed)[rowsi, colsi], use.names = FALSE) + ))) + # TODO(pseudorandom): Export the results to a file for further analysis print("JOINT_DIST$FIT") - print(joint_dist$fit) + print(signif(ed[order(rowSums(ed)),], 4)) print("PROC.TIME") print(proc.time() - ptm) } diff --git a/tests/assoc_sim.R b/tests/assoc_sim.R index 3ff1e5df..6ac2b857 100755 --- a/tests/assoc_sim.R +++ b/tests/assoc_sim.R @@ -41,12 +41,17 @@ if(!interactive()) { help = "Filename for RAPPOR parameters"), make_option(c("--reports", "-r"), default = "reports.csv", help = "Filename for reports"), + make_option(c("--true", "-t"), default = "truedist.csv", + help = "Filename for the true distribution"), make_option(c("--map", "-m"), default = "map", help = "Filename *prefix* for map(s)"), make_option(c("--num", "-n"), default = 1e05, help = "Number of reports"), - make_option(c("--unif", "-u"), default = FALSE, - help = "Run simulation with uniform distribution") + make_option(c("--extras", "-e"), default = TRUE, + help = "Does 1st map have spurious candidates?"), + make_option(c("--distr", "-d"), default = "zipfg", + help = "Type of distribution. Choose between + {unif, poisson, poisson2}") ) opts <- parse_args(OptionParser(option_list = option_list)) } @@ -56,6 +61,7 @@ source("../analysis/R/decode.R") source("../analysis/R/simulation.R") source("../analysis/R/read_input.R") source("../analysis/R/association.R") +source("../tests/gen_counts.R") # Read unique values of reports from a csv file # Inputs: filename. The file is expected to contain two rows of strings @@ -83,28 +89,66 @@ GetUniqueValsFromFile <- function(filename) { # Inputs: N = number of reports # uvals = list containing a list of unique values # params = list with RAPPOR parameters -# unif = whether to replace poisson with uniform +# distr = the type of distribution to use +# {unif, poisson, poisson2, zipfg} +# extras = whether map_1.csv has spurious candidates or not # mapfile = file to write maps into (with .csv suffixes) # reportsfile = file to write reports into (with .csv suffix) -SimulateReports <- function(N, uvals, params, unif, +SimulateReports <- function(N, uvals, params, distr, extras, truefile, mapfile, reportsfile) { # Compute true distribution m <- params$m - if (unif) { + if (distr == "unif") { # Draw uniformly from 1 to 10 v1_samples <- as.integer(runif(N, 1, 10)) - } else { + + # Pr[var2 = N + 1 | var1 = N] = 0.5 + # Pr[var2 = N | var1 = N] = 0.5 + v2_samples <- v1_samples + sample(c(0, 1), N, replace = TRUE) + + } else if(distr == "poisson") { # Draw from a Poisson random variable v1_samples <- rpois(N, 1) + rep(1, N) + + # Pr[var2 = N + 1 | var1 = N] = 0.5 + # Pr[var2 = N | var1 = N] = 0.5 + v2_samples <- v1_samples + sample(c(0, 1), N, replace = TRUE) + } else if (distr == "poisson2") { + + v1_samples <- rpois(N, 1) + rep(1, N) + # supp(var2) = {1, 2} + # Pr[var2 = 1 | var1 = even] = 0.75 + # Pr[var2 = 1 | var1 = odd] = 0.25 + pr25 <- rbinom(N, 1, 0.25) + 1 + pr75 <- rbinom(N, 1, 0.75) + 1 + v2_samples <- rep(1, N) + v2_samples[v1_samples %% 2 == 0] <- pr25[v1_samples %% 2 == 0] + v2_samples[v1_samples %% 2 == 1] <- pr75[v1_samples %% 2 == 1] + } else if (distr == "zipfg") { + + # Zipfian over 25 strings + partition <- RandomPartition(N, ComputePdf("zipf1.5", 25)) + v1_samples <- rep(1:25, partition) # expand partition + # Shuffle values randomly (may take a few sec for > 10^8 inputs) + v1_samples <- sample(v1_samples) + + # supp(var2) = {1, 2, 3, 4, 6} + # We look at two zipfian distributions over supp(var2) + # D1 = zipfian distribution + # D2 = zipfian distr over {6, 5, 4, 3, 2, 1} + # (i.e., D1 in reverse) + # var2 ~ D1 if var1 = even + # var2 ~ D2 if var1 = odd + d1 <- sample(rep(1:6, RandomPartition(N, ComputePdf("zipf1.5", 6)))) + d2 <- c(6, 5, 4, 3, 2, 1)[d1] + v2_samples <- rep(1, N) + v2_samples[v1_samples %% 2 == 0] <- d1[v1_samples %% 2 == 0] + v2_samples[v1_samples %% 2 == 1] <- d2[v1_samples %% 2 == 1] } - - # Pr[var2 = N + 1 | var1 = N] = 0.5 - # Pr[var2 = N | var1 = N] = 0.5 - v2_samples <- v1_samples + sample(c(0, 1), N, replace = TRUE) - + tmp_samples <- list(v1_samples, v2_samples) - + # Function to pad strings to uval_vec if sample_vec has # larger support than the number of strings in uval_vec # For e.g., if samples have support {1, 2, 3, 4, ...} and uvals @@ -122,21 +166,31 @@ SimulateReports <- function(N, uvals, params, unif, } uval_vec } - + # Pad and update uvals uvals <- lapply(1:2, function(i) PadStrings(tmp_samples[[i]], uvals[[i]])) - # Replace integers in tmp_samples with actual sample strings samples <- lapply(1:2, function(i) uvals[[i]][tmp_samples[[i]]]) + print("TRUE DISTR") + td <- table(samples)/sum(table(samples)) + td <- td[order(rowSums(td), decreasing = TRUE),] + print(td) + write.table(td, file = truefile, sep = ",", col.names = TRUE, + row.names = TRUE, quote = FALSE) # Randomly assign cohorts in each dimension cohorts <- sample(1:m, N, replace = TRUE) # Create and write map into mapfile_1.csv and mapfile_2.csv + if (extras == TRUE) { + # 1000 spurious candidates for mapfile_1.csv + len <- length(uvals[[1]]) + 1000 + uvals[[1]] <- PadStrings(len, uvals[[1]]) + } map <- lapply(uvals, function(u) CreateMap(u, params)) write.table(map[[1]]$map_pos, file = paste(mapfile, "_1.csv", sep = ""), - sep = ",", col.names = FALSE, na = "", quote = FALSE) + sep = ",", col.names = FALSE, na = "", quote = FALSE) write.table(map[[2]]$map_pos, file = paste(mapfile, "_2.csv", sep = ""), sep = ",", col.names = FALSE, na = "", quote = FALSE) @@ -160,8 +214,9 @@ main <- function(opts) { uvals <- GetUniqueValsFromFile(opts$uvals) params <- ReadParameterFile(opts$params) - SimulateReports(opts$num, uvals, params, opts$unif, # inputs - opts$map, opts$reports) # outputs + SimulateReports(opts$num, uvals, params, opts$distr, # inuts + opts$extras, opts$true, # inputs + opts$map, opts$reports) # outputs print("PROC.TIME") print(proc.time() - ptm) diff --git a/tests/gen_counts.R b/tests/gen_counts.R index e947a5cf..4c8359f6 100755 --- a/tests/gen_counts.R +++ b/tests/gen_counts.R @@ -14,7 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -source('analysis/R/read_input.R') +source('../analysis/R/read_input.R') RandomPartition <- function(total, weights) { # Outputs a random partition according to a specified distribution diff --git a/tests/params.csv b/tests/params.csv index a2114c90..0dd2c58c 100644 --- a/tests/params.csv +++ b/tests/params.csv @@ -1,2 +1,2 @@ k, h, m, p, q, f -16, 2, 4, 0.1, 0.9, 0.2 +16, 2, 64, 0.1, 0.9, 0.2 diff --git a/tests/uvals.csv b/tests/uvals.csv index cebc17ec..986e994f 100644 --- a/tests/uvals.csv +++ b/tests/uvals.csv @@ -1,2 +1,2 @@ -google.com,intel.com,yahoo.com -ssl,nossl +str1 +option1,option2,option3,option4,option5,option6 From 62375c6ac4ebed6b6fff5b0484972f1200e0af92 Mon Sep 17 00:00:00 2001 From: Ananth Raghunathan Date: Thu, 30 Apr 2015 13:59:41 -0700 Subject: [PATCH 02/67] Adding assoctest.sh. --- assoctest.sh | 362 +++++++++++++++++++++++++++++++++++++++++++++ tests/gen_counts.R | 2 +- 2 files changed, 363 insertions(+), 1 deletion(-) create mode 100755 assoctest.sh diff --git a/assoctest.sh b/assoctest.sh new file mode 100755 index 00000000..95805201 --- /dev/null +++ b/assoctest.sh @@ -0,0 +1,362 @@ +#!/bin/bash +# +# Run end-to-end tests in parallel. +# +# Usage: +# ./regtest.sh + +# At the end, it will print an HTML summary. +# +# Three main functions are +# run [[ [ []] - run tests matching in +# parallel, each times. The fast +# mode (T/F) shortcuts generation of +# reports. +# run-seq [ [ []] - ditto, except that tests are run +# sequentially +# run-all [] - run all tests, in parallel, each times +# +# Examples: +# $ ./regtest.sh run-seq unif-small-typical # Sequential run, matches 1 case +# $ ./regtest.sh run-seq unif-small- 3 F # Sequential, each test is run three +# times, using slow generation +# $ ./regtest.sh run unif- # Parallel run, matches multiple cases +# $ ./regtest.sh run unif- 5 # Parallel run, matches multiple cases, each test +# is run 5 times +# $ ./regtest.sh run-all # Run all tests once +# +# The argument is a regex in 'grep -E' format. (Detail: Don't +# use $ in the pattern, since it matches the whole spec line and not just the +# test case name.) The number of processors used in a parallel run is one less +# than the number of CPUs on the machine. + + +# Future speedups: +# - Reuse the same input -- come up with naming scheme based on params +# - Reuse the same maps -- ditto, rappor library can cache it + +set -o nounset +set -o pipefail +set -o errexit + +. util.sh + +readonly THIS_DIR=$(dirname $0) +readonly REPO_ROOT=$THIS_DIR +readonly CLIENT_DIR=$REPO_ROOT/client/python +readonly REGTEST_DIR=_tmp/regtest +readonly ASSOCTEST_DIR=_tmp/assoctest + +# All the Python tools need this +export PYTHONPATH=$CLIENT_DIR + +#print-true-inputs() { +# local num_unique_values=$1 +# seq 1 $num_unique_values | awk '{print "v" $1}' +#} + +# Add some more candidates here. We hope these are estimated at 0. +# e.g. if add_start=51, and num_additional is 20, show v51-v70 +#more-candidates() { +# local last_true=$1 +# local num_additional=$2 +# +# local begin +# local end +# begin=$(expr $last_true + 1) +# end=$(expr $last_true + $num_additional) +# +# seq $begin $end | awk '{print "v" $1}' +#} + +# Args: +# true_inputs: File of true inputs +# last_true: last true input, e.g. 50 if we generated "v1" .. "v50". +# num_additional: additional candidates to generate (starting at 'last_true') +# to_remove: Regex of true values to omit from the candidates list, or the +# string 'NONE' if none should be. (Our values look like 'v1', 'v2', etc. so +# there isn't any ambiguity.) +#print-candidates() { +# local true_inputs=$1 +# local last_true=$2 +# local num_additional=$3 +# local to_remove=$4 +# +# if test $to_remove = NONE; then +# cat $true_inputs # include all true inputs +# else +# egrep -v $to_remove $true_inputs # remove some true inputs +# fi +# more-candidates $last_true $num_additional +#} + +# Generate a single test case, specified by a line of the test spec. +# This is a helper function for _run_tests(). +_setup-one-case() { + local test_case=$1 + + # input params + local dist=$2 + local num_unique_values=$3 + local num_clients=$4 + local values_per_client=$5 + + # RAPPOR params + local num_bits=$6 + local num_hashes=$7 + local num_cohorts=$8 + local p=$9 + local q=${10} # need curly braces to get the 10th arg + local f=${11} + + # map params + local num_additional=${12} + local to_remove=${13} + + banner 'Setting up parameters and candidate files for '$test_case + + local case_dir=$REGTEST_DIR/$test_case + mkdir --verbose -p $case_dir + + # Save the "spec" + echo "$@" > $case_dir/spec.txt + + local params_path=$case_dir/case_params.csv + + echo 'k,h,m,p,q,f' > $params_path + echo "$num_bits,$num_hashes,$num_cohorts,$p,$q,$f" >> $params_path + + print-true-inputs $num_unique_values > $case_dir/case_true_inputs.txt + + local true_map_path=$case_dir/case_true_map.csv + + analysis/tools/hash_candidates.py \ + $params_path \ + < $case_dir/case_true_inputs.txt \ + > $true_map_path + + # banner "Constructing candidates" + + print-candidates \ + $case_dir/case_true_inputs.txt $num_unique_values \ + $num_additional "$to_remove" \ + > $case_dir/case_candidates.txt + + # banner "Hashing candidates to get 'map'" + + analysis/tools/hash_candidates.py \ + $case_dir/case_params.csv \ + < $case_dir/case_candidates.txt \ + > $case_dir/case_map.csv +} + +# Run a single test instance, specified by . +# This is a helper function for _run_tests(). +_run-one-instance() { + local test_case=$1 + local test_instance=$2 + local fast_counts=$3 + + local case_dir=$REGTEST_DIR/$test_case + + read -r case_name distr num_unique_values num_clients \ + values_per_client num_bits num_hashes num_cohorts p q f num_additional \ + to_remove < $case_dir/spec.txt + + local instance_dir=$REGTEST_DIR/$test_case/$test_instance + mkdir --verbose -p $instance_dir + + if test $fast_counts = T; then + local params_file=$case_dir/case_params.csv + local true_map_file=$case_dir/case_true_map.csv + + banner "Using gen_counts.R" + + tests/gen_counts.R $distr $num_clients $values_per_client $params_file \ + $true_map_file "$instance_dir/case" + else + banner "Generating input" + + tests/gen_reports.R $distr $num_unique_values $num_clients \ + $values_per_client $instance_dir/case.csv + + banner "Running RAPPOR client" + + # Writes encoded "out" file, true histogram, true inputs to $instance_dir. + tests/rappor_sim.py \ + --num-bits $num_bits \ + --num-hashes $num_hashes \ + --num-cohorts $num_cohorts \ + -p $p \ + -q $q \ + -f $f \ + -i $instance_dir/case.csv \ + --out-prefix "$instance_dir/case" + + banner "Summing bits to get 'counts'" + + analysis/tools/sum_bits.py \ + $case_dir/case_params.csv \ + < $instance_dir/case_out.csv \ + > $instance_dir/case_counts.csv + fi + + local out_dir=${instance_dir}_report + mkdir --verbose -p $out_dir + + # Currently, the summary file shows and aggregates timing of the inference + # engine, which excludes R's loading time and reading of the (possibly + # substantial) map file. Timing below is more inclusive. + TIMEFORMAT='Running analyze.R took %R seconds' + time { + # Input prefix, output dir + tests/analyze.R -t "Test case: $test_case (instance $test_instance)" \ + "$case_dir/case" "$instance_dir/case" $out_dir + } +} + +# Like _run-once-case, but log to a file. +_run-one-instance-logged() { + local test_case_id=$1 + local test_case_run=$2 + + local log_dir=$REGTEST_DIR/$test_case_id/${test_case_run}_report + mkdir --verbose -p $log_dir + + log "Started '$test_case_id' (instance $test_case_run) -- logging to $log_dir/log.txt" + _run-one-instance "$@" >$log_dir/log.txt 2>&1 \ + && log "Test case $test_case_id (instance $test_case_run) done" \ + || log "Test case $test_case_id (instance $test_case_run) failed" +} + +#make-summary() { +# local dir=$1 +# local filename=${2:-results.html} +# +# tests/make_summary.py $dir > $dir/rows.html +# +# pushd $dir >/dev/null +# +# cat ../../tests/regtest.html \ +# | sed -e '/TABLE_ROWS/ r rows.html' \ +# > $filename +# +# popd >/dev/null +# +# log "Wrote $dir/$filename" +# log "URL: file://$PWD/$dir/$filename" +#} + +test-error() { + local spec_regex=${1:-} + log "Some test cases failed" + if test -n "$spec_regex"; then + log "(Perhaps none matched pattern '$spec_regex')" + fi + # don't quit just yet + # exit 1 +} + +# Assuming the spec file, write a list of test case names (first column) with +# the instance ids (second column), where instance ids run from 1 to $1. +# Third column is fast_counts (T/F). +#_setup-test-instances() { +# local instances=$1 +# local fast_counts=$2 +# +# while read line; do +# for i in $(seq 1 $instances); do +# read case_name _ <<< $line # extract the first token +# echo $case_name $i $fast_counts +# done +# done +#} + +# Args: +# regexp: A pattern selecting the subset of tests to run +# instances: A number of times each test case is run +# parallel: Whether the tests are run in parallel (T/F) +# fast_counts: Whether counts are sampled directly (T/F) +# +_run-tests() { + local spec_regex=$1 # grep -E format on the spec + local instances=$2 + local parallel=$3 + local fast_counts=$4 + + rm -r -f --verbose $ASSOCTEST_DIR + + mkdir --verbose -p $ASSOCTEST_DIR + + echo "PARAMS" + echo $spec_regex + echo $instances + echo $parallel + echo $fast_counts + + local func + local processors=1 + + if test $parallel = F; then + func=_run-one-instance output to the console + else + func=_run-one-instance-logged + processors=$(grep -c ^processor /proc/cpuinfo || echo 4) # POSIX-specific + if test $processors -gt 1; then # leave one CPU for the OS + processors=$(expr $processors - 1) + fi + log "Running $processors parallel processes" + fi + + echo "FUNC" + echo $func + + local cases_list=$ASSOCTEST_DIR/test-cases.txt + tests/regtest_spec.py | grep -E $spec_regex > $cases_list + break + + # Generate parameters for all test cases. + cat $cases_list \ + | xargs -l -P $processors -- $0 _setup-one-case \ + || test-error + + log "Done generating parameters for all test cases" + + local instances_list=$REGTEST_DIR/test-instances.txt + _setup-test-instances $instances $fast_counts < $cases_list > $instances_list + + cat $instances_list \ + | xargs -l -P $processors -- $0 $func || test-error + + log "Done running all test instances" + + make-summary $REGTEST_DIR +} + +# Run tests sequentially +#run-seq() { +# local spec_regex=${1:-'^r-'} # grep -E format on the spec +# local instances=${2:-1} +# local fast_counts=${3:-T} +# +# _run-tests $spec_regex $instances F $fast_counts +#} + +# Run tests in parallel +#run() { +# local spec_regex=${1:-'^r-'} # grep -E format on the spec +# local instances=${2:-1} +# local fast_counts=${3:-T} +# +# _run-tests $spec_regex $instances T $fast_counts +#} + +# Run tests in parallel +run-all() { + local instances=${1:-1} + + log "Running all tests. Can take a while." + _run-tests '^r-' $instances T T +} + +"$@" diff --git a/tests/gen_counts.R b/tests/gen_counts.R index 4c8359f6..e947a5cf 100755 --- a/tests/gen_counts.R +++ b/tests/gen_counts.R @@ -14,7 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -source('../analysis/R/read_input.R') +source('analysis/R/read_input.R') RandomPartition <- function(total, weights) { # Outputs a random partition according to a specified distribution From 935309ee56d0f333f0670c4f05cb0abf857a1673 Mon Sep 17 00:00:00 2001 From: Ananth Raghunathan Date: Wed, 6 May 2015 17:47:46 -0700 Subject: [PATCH 03/67] Assoctest.sh test suite. --- assoctest.sh | 154 ++++++++++++++---------------------------- tests/analyze_assoc.R | 77 ++++++++++++--------- tests/assoc_sim.R | 45 ++++++++---- tests/regtest_spec.py | 27 ++++++++ tests/uvals.csv | 2 +- 5 files changed, 156 insertions(+), 149 deletions(-) diff --git a/assoctest.sh b/assoctest.sh index 95805201..7b806ef8 100755 --- a/assoctest.sh +++ b/assoctest.sh @@ -6,11 +6,11 @@ # ./regtest.sh # At the end, it will print an HTML summary. -# -# Three main functions are +# +# Three main functions are # run [[ [ []] - run tests matching in -# parallel, each times. The fast -# mode (T/F) shortcuts generation of +# parallel, each times. The fast +# mode (T/F) shortcuts generation of # reports. # run-seq [ [ []] - ditto, except that tests are run # sequentially @@ -21,7 +21,7 @@ # $ ./regtest.sh run-seq unif-small- 3 F # Sequential, each test is run three # times, using slow generation # $ ./regtest.sh run unif- # Parallel run, matches multiple cases -# $ ./regtest.sh run unif- 5 # Parallel run, matches multiple cases, each test +# $ ./regtest.sh run unif- 5 # Parallel run, matches multiple cases, each test # is run 5 times # $ ./regtest.sh run-all # Run all tests once # @@ -79,7 +79,7 @@ export PYTHONPATH=$CLIENT_DIR #print-candidates() { # local true_inputs=$1 # local last_true=$2 -# local num_additional=$3 +# local num_additional=$3 # local to_remove=$4 # # if test $to_remove = NONE; then @@ -98,8 +98,9 @@ _setup-one-case() { # input params local dist=$2 local num_unique_values=$3 - local num_clients=$4 - local values_per_client=$5 + local num_unique_values2=$4 + local num_clients=$5 + local values_per_client=$6 # RAPPOR params local num_bits=$6 @@ -109,13 +110,9 @@ _setup-one-case() { local q=${10} # need curly braces to get the 10th arg local f=${11} - # map params - local num_additional=${12} - local to_remove=${13} - banner 'Setting up parameters and candidate files for '$test_case - local case_dir=$REGTEST_DIR/$test_case + local case_dir=$ASSOCTEST_DIR/$test_case mkdir --verbose -p $case_dir # Save the "spec" @@ -125,29 +122,6 @@ _setup-one-case() { echo 'k,h,m,p,q,f' > $params_path echo "$num_bits,$num_hashes,$num_cohorts,$p,$q,$f" >> $params_path - - print-true-inputs $num_unique_values > $case_dir/case_true_inputs.txt - - local true_map_path=$case_dir/case_true_map.csv - - analysis/tools/hash_candidates.py \ - $params_path \ - < $case_dir/case_true_inputs.txt \ - > $true_map_path - - # banner "Constructing candidates" - - print-candidates \ - $case_dir/case_true_inputs.txt $num_unique_values \ - $num_additional "$to_remove" \ - > $case_dir/case_candidates.txt - - # banner "Hashing candidates to get 'map'" - - analysis/tools/hash_candidates.py \ - $case_dir/case_params.csv \ - < $case_dir/case_candidates.txt \ - > $case_dir/case_map.csv } # Run a single test instance, specified by . @@ -155,63 +129,44 @@ _setup-one-case() { _run-one-instance() { local test_case=$1 local test_instance=$2 - local fast_counts=$3 - local case_dir=$REGTEST_DIR/$test_case - - read -r case_name distr num_unique_values num_clients \ - values_per_client num_bits num_hashes num_cohorts p q f num_additional \ - to_remove < $case_dir/spec.txt + local case_dir=$ASSOCTEST_DIR/$test_case - local instance_dir=$REGTEST_DIR/$test_case/$test_instance - mkdir --verbose -p $instance_dir + read -r case_name case_descr num_unique_values num_unique_values2 \ + num_clients num_bits num_hashes num_cohorts p q f < $case_dir/spec.txt - if test $fast_counts = T; then - local params_file=$case_dir/case_params.csv - local true_map_file=$case_dir/case_true_map.csv + local instance_dir=$ASSOCTEST_DIR/$test_case/$test_instance + mkdir --verbose -p $instance_dir - banner "Using gen_counts.R" + banner "Running association input simulation" - tests/gen_counts.R $distr $num_clients $values_per_client $params_file \ - $true_map_file "$instance_dir/case" - else - banner "Generating input" - - tests/gen_reports.R $distr $num_unique_values $num_clients \ - $values_per_client $instance_dir/case.csv - - banner "Running RAPPOR client" - - # Writes encoded "out" file, true histogram, true inputs to $instance_dir. - tests/rappor_sim.py \ - --num-bits $num_bits \ - --num-hashes $num_hashes \ - --num-cohorts $num_cohorts \ - -p $p \ - -q $q \ - -f $f \ - -i $instance_dir/case.csv \ - --out-prefix "$instance_dir/case" - - banner "Summing bits to get 'counts'" - - analysis/tools/sum_bits.py \ - $case_dir/case_params.csv \ - < $instance_dir/case_out.csv \ - > $instance_dir/case_counts.csv - fi + tests/assoc_sim.R \ + -p $case_dir/case_params.csv \ + -r $instance_dir/reports.csv \ + -t $instance_dir/truedist.csv \ + -m $instance_dir/map \ + -n $num_clients \ + --var1_num $num_unique_values \ + --var2_num $num_unique_values2 local out_dir=${instance_dir}_report mkdir --verbose -p $out_dir # Currently, the summary file shows and aggregates timing of the inference - # engine, which excludes R's loading time and reading of the (possibly + # engine, which excludes R's loading time and reading of the (possibly # substantial) map file. Timing below is more inclusive. TIMEFORMAT='Running analyze.R took %R seconds' time { + tests/analyze_assoc.R \ + --map1 $instance_dir/map_1.csv \ + --map2 $instance_dir/map_2.csv \ + --reports $instance_dir/reports.csv \ + --truefile $instance_dir/truedist.csv \ + --outdir $out_dir \ + --params $case_dir/case_params.csv # Input prefix, output dir - tests/analyze.R -t "Test case: $test_case (instance $test_instance)" \ - "$case_dir/case" "$instance_dir/case" $out_dir +# tests/analyze.R -t "Test case: $test_case (instance $test_instance)" \ +# "$case_dir/case" "$instance_dir/case" $out_dir } } @@ -220,7 +175,7 @@ _run-one-instance-logged() { local test_case_id=$1 local test_case_run=$2 - local log_dir=$REGTEST_DIR/$test_case_id/${test_case_run}_report + local log_dir=$ASSOCTEST_DIR/$test_case_id/${test_case_run}_report mkdir --verbose -p $log_dir log "Started '$test_case_id' (instance $test_case_run) -- logging to $log_dir/log.txt" @@ -254,23 +209,21 @@ test-error() { log "(Perhaps none matched pattern '$spec_regex')" fi # don't quit just yet - # exit 1 + # exit 1 } # Assuming the spec file, write a list of test case names (first column) with # the instance ids (second column), where instance ids run from 1 to $1. -# Third column is fast_counts (T/F). -#_setup-test-instances() { -# local instances=$1 -# local fast_counts=$2 -# -# while read line; do -# for i in $(seq 1 $instances); do -# read case_name _ <<< $line # extract the first token -# echo $case_name $i $fast_counts -# done -# done -#} +_setup-test-instances() { + local instances=$1 + + while read line; do + for i in $(seq 1 $instances); do + read case_name _ <<< $line # extract the first token + echo $case_name $i + done + done +} # Args: # regexp: A pattern selecting the subset of tests to run @@ -298,7 +251,7 @@ _run-tests() { local processors=1 if test $parallel = F; then - func=_run-one-instance output to the console + func=_run-one-instance # output to the console else func=_run-one-instance-logged processors=$(grep -c ^processor /proc/cpuinfo || echo 4) # POSIX-specific @@ -308,12 +261,8 @@ _run-tests() { log "Running $processors parallel processes" fi - echo "FUNC" - echo $func - local cases_list=$ASSOCTEST_DIR/test-cases.txt tests/regtest_spec.py | grep -E $spec_regex > $cases_list - break # Generate parameters for all test cases. cat $cases_list \ @@ -322,15 +271,16 @@ _run-tests() { log "Done generating parameters for all test cases" - local instances_list=$REGTEST_DIR/test-instances.txt - _setup-test-instances $instances $fast_counts < $cases_list > $instances_list + local instances_list=$ASSOCTEST_DIR/test-instances.txt + _setup-test-instances $instances $fast_counts < $cases_list > $instances_list cat $instances_list \ | xargs -l -P $processors -- $0 $func || test-error log "Done running all test instances" + exit 1 - make-summary $REGTEST_DIR + make-summary $ASSOCTEST_DIR } # Run tests sequentially @@ -356,7 +306,7 @@ run-all() { local instances=${1:-1} log "Running all tests. Can take a while." - _run-tests '^r-' $instances T T + _run-tests '^a-' $instances T T } "$@" diff --git a/tests/analyze_assoc.R b/tests/analyze_assoc.R index 56b66ea0..50f98c33 100755 --- a/tests/analyze_assoc.R +++ b/tests/analyze_assoc.R @@ -1,22 +1,22 @@ #!/usr/bin/env Rscript # # Copyright 2015 Google Inc. All rights reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# Reads map files, report files, and RAPPOR parameters to run +# Reads map files, report files, and RAPPOR parameters to run # an EM algorithm to estimate joint distribution over two or more variables -# +# # Usage: # $ ./analyze_assoc.R -map1 map_1.csv -map2 map_2.csv \ # -reports reports.csv \ @@ -25,7 +25,7 @@ # Outputs: # prints a table with estimated joint probability masses # over candidate strings -# Ex. +# Ex. # ssl nossl # intel 0.1 0.3 # google 0.5 0.1 @@ -43,23 +43,27 @@ if(!interactive()) { help = "Hashed candidates for 2nd variable"), make_option(c("--reports", "-r"), default = "reports.csv", help = "File with raw reports as "), + make_option(c("--truefile", "-t"), default = "truedist.csv", + help = "File with true distribution generated by assoc_sim.R"), + make_option(c("--outdir", "-o"), default = ".", + help = "File where the metrics go"), make_option(c("--params", "-p"), default = "params.csv", help = "Filename for RAPPOR parameters") ) opts <- parse_args(OptionParser(option_list = option_list)) -} +} -source("../analysis/R/encode.R") -source("../analysis/R/decode.R") -source("../analysis/R/simulation.R") -source("../analysis/R/read_input.R") -source("../analysis/R/association.R") +source("analysis/R/encode.R") +source("analysis/R/decode.R") +source("analysis/R/simulation.R") +source("analysis/R/read_input.R") +source("analysis/R/association.R") # This function processes the maps loaded using ReadMapFile # Association analysis requires a map object with a map # field that has the map split into cohorts and an rmap field # that has all the cohorts combined -# Arguments: +# Arguments: # map = map object with cohorts as sparse matrix in # object map$map # This is the expected object from ReadMapFile @@ -81,7 +85,7 @@ ProcessMap <- function(map, params) { main <- function(opts) { ptm <- proc.time() - + params <- ReadParameterFile(opts$params) opts_map <- list(opts$map1, opts$map2) map <- lapply(opts_map, function(o) @@ -89,10 +93,10 @@ main <- function(opts) { params = params)) # Reports must be of the format # cohort no, rappor bitstring 1, rappor bitstring 2 - reportsObj <- read.csv(opts$reports, + reportsObj <- read.csv(opts$reports, colClasses = c("integer", "character", "character"), header = FALSE) - + # Parsing reportsObj # ComputeDistributionEM allows for different sets of cohorts # for each variable. Here, both sets of cohorts are identical @@ -100,44 +104,55 @@ main <- function(opts) { cohorts <- list(co, co) # Parse reports from reportObj cols 2 and 3 reports <- lapply(1:2, function(x) as.list(reportsObj[x + 1])) - + # Split strings into bit arrays (as required by assoc analysis) reports <- lapply(1:2, function(i) { # apply the following function to each of reports[[1]] and reports[[2]] lapply(reports[[i]][[1]], function(x) { - # function splits strings and converts them to numeric values + # function splits strings and converts them to numeric values as.numeric(strsplit(x, split = "")[[1]]) }) }) - - joint_dist <- ComputeDistributionEM(reports, cohorts, map, + + joint_dist <- ComputeDistributionEM(reports, cohorts, map, ignore_other = TRUE, params, marginals = NULL, estimate_var = FALSE) - - # Hardcoded place to lookup true distribution - # TODO(pseudorandom): Make this a flag - td <- read.csv(file = "truedist.csv") + + td <- read.csv(file = opts$truefile) ed <- joint_dist$fit - + print("CHI-SQUARED") + td_chisq <- chisq.test(td) + ed_chisq <- chisq.test(ed) + print(td_chisq) + print(ed_chisq) + # L1 distance = 1 - sum(min(td|x, ed|x)) where # td|x / ed|x projects the distribution to the intersection x of the # supports of td and ed rowsi <- intersect(rownames(td), rownames(ed)) colsi <- intersect(colnames(td), colnames(ed)) print("L1 DISTANCE") - print(1 - sum(mapply(min, + l1d <- 1 - sum(mapply(min, unlist(td[rowsi, colsi], use.names = FALSE), unlist(as.data.frame(ed)[rowsi, colsi], use.names = FALSE) - ))) - - # TODO(pseudorandom): Export the results to a file for further analysis + )) + print(l1d) + print("JOINT_DIST$FIT") print(signif(ed[order(rowSums(ed)),], 4)) print("PROC.TIME") - print(proc.time() - ptm) + time_taken <- proc.time() - ptm + print(time_taken) + + # Write metrics to metrics.csv + metrics <- list(td_chisq = td_chisq[1][[1]][[1]], + ed_chisq = ed_chisq[1][[1]][[1]], + l1d = l1d, time = time_taken[1]) + filename <- file.path(opts$outdir, 'metrics.csv') + write.csv(metrics, file = filename, row.names = FALSE) } if(!interactive()) { main(opts) -} \ No newline at end of file +} diff --git a/tests/assoc_sim.R b/tests/assoc_sim.R index 6ac2b857..61ee822f 100755 --- a/tests/assoc_sim.R +++ b/tests/assoc_sim.R @@ -32,7 +32,7 @@ options(stringsAsFactors = FALSE) if(!interactive()) { option_list <- list( - make_option(c("--uvals", "-v"), default = "uvals.csv", + make_option(c("--uvals", "-v"), help = "Filename for list of values over which distributions are simulated. The file is a list of comma-separated strings each line of which refers @@ -47,6 +47,10 @@ if(!interactive()) { help = "Filename *prefix* for map(s)"), make_option(c("--num", "-n"), default = 1e05, help = "Number of reports"), + make_option(c("--var1_num", "-z"), default = 25, + help = "Number of values for var1"), + make_option(c("--var2_num", "-y"), default = 5, + help = "Number of values for var2"), make_option(c("--extras", "-e"), default = TRUE, help = "Does 1st map have spurious candidates?"), make_option(c("--distr", "-d"), default = "zipfg", @@ -56,12 +60,12 @@ if(!interactive()) { opts <- parse_args(OptionParser(option_list = option_list)) } -source("../analysis/R/encode.R") -source("../analysis/R/decode.R") -source("../analysis/R/simulation.R") -source("../analysis/R/read_input.R") -source("../analysis/R/association.R") -source("../tests/gen_counts.R") +source("analysis/R/encode.R") +source("analysis/R/decode.R") +source("analysis/R/simulation.R") +source("analysis/R/read_input.R") +source("analysis/R/association.R") +source("tests/gen_counts.R") # Read unique values of reports from a csv file # Inputs: filename. The file is expected to contain two rows of strings @@ -92,9 +96,14 @@ GetUniqueValsFromFile <- function(filename) { # distr = the type of distribution to use # {unif, poisson, poisson2, zipfg} # extras = whether map_1.csv has spurious candidates or not +# truefile = name of the file with true distribution +# var1_num = number of var1 candidates +# var2_num = number of var2 candidates +# *** CURRENTLY ONLY USEFUL IF DISTR = ZIPFG *** # mapfile = file to write maps into (with .csv suffixes) # reportsfile = file to write reports into (with .csv suffix) SimulateReports <- function(N, uvals, params, distr, extras, truefile, + var1_num, var2_num, mapfile, reportsfile) { # Compute true distribution m <- params$m @@ -127,21 +136,22 @@ SimulateReports <- function(N, uvals, params, distr, extras, truefile, v2_samples[v1_samples %% 2 == 1] <- pr75[v1_samples %% 2 == 1] } else if (distr == "zipfg") { - # Zipfian over 25 strings - partition <- RandomPartition(N, ComputePdf("zipf1.5", 25)) - v1_samples <- rep(1:25, partition) # expand partition + # Zipfian over var1_num strings + partition <- RandomPartition(N, ComputePdf("zipf1.5", var1_num)) + v1_samples <- rep(1:var1_num, partition) # expand partition # Shuffle values randomly (may take a few sec for > 10^8 inputs) v1_samples <- sample(v1_samples) - # supp(var2) = {1, 2, 3, 4, 6} + # supp(var2) = {1, 2, 3, ..., var2_num} # We look at two zipfian distributions over supp(var2) # D1 = zipfian distribution - # D2 = zipfian distr over {6, 5, 4, 3, 2, 1} + # D2 = zipfian distr over {var2_num, ..., 4, 3, 2, 1} # (i.e., D1 in reverse) # var2 ~ D1 if var1 = even # var2 ~ D2 if var1 = odd - d1 <- sample(rep(1:6, RandomPartition(N, ComputePdf("zipf1.5", 6)))) - d2 <- c(6, 5, 4, 3, 2, 1)[d1] + d1 <- sample(rep(1:var2_num, + RandomPartition(N, ComputePdf("zipf1.5", var2_num)))) + d2 <- (var2_num:1)[d1] v2_samples <- rep(1, N) v2_samples[v1_samples %% 2 == 0] <- d1[v1_samples %% 2 == 0] v2_samples[v1_samples %% 2 == 1] <- d2[v1_samples %% 2 == 1] @@ -212,10 +222,15 @@ SimulateReports <- function(N, uvals, params, distr, extras, truefile, main <- function(opts) { ptm <- proc.time() - uvals <- GetUniqueValsFromFile(opts$uvals) + if(is.null(opts$uvals)) { + uvals = list(var1 = c("str1"), var2 = c("option1")) + } else { + uvals <- GetUniqueValsFromFile(opts$uvals) + } params <- ReadParameterFile(opts$params) SimulateReports(opts$num, uvals, params, opts$distr, # inuts opts$extras, opts$true, # inputs + opts$var1_num, opts$var2_num, # inputs opts$map, opts$reports) # outputs print("PROC.TIME") diff --git a/tests/regtest_spec.py b/tests/regtest_spec.py index 6774e400..0c5798f0 100755 --- a/tests/regtest_spec.py +++ b/tests/regtest_spec.py @@ -41,6 +41,17 @@ ('large', 10000, 100000000, 1), ) +DISTRIBUTION_PARAMS_ASSOC = ( + # name, num unique values 1, + # num unique values 2, num clients, values per client + ('tiny', 100, 2, int(1e03), 1), # test for insufficient data + ('small', 100, 10, int(1e04), 1), + ('medium', 1000, 10, int(1e05), 1), + ('large', 1000, 10, int(1e06), 1), + ('mediumsquared', 1000, 100, int(1e05), 1), + ('largesquared', int(1e04), 100, int(1e06), 1), +) + # 'k, h, m' as in params file. BLOOMFILTER_PARAMS = { '8x16': (8, 2, 16), # 16 cohorts, 8 bits each, 2 bits set in each @@ -102,6 +113,22 @@ def main(argv): for params in DEMO: rows.append(params) + # Association tests + for (distr_params, num_values1, num_values2, num_clients, + num_reports_per_client) in DISTRIBUTION_PARAMS_ASSOC: + for bloom_params in BLOOMFILTER_PARAMS: + for privacy_params in PRIVACY_PARAMS: + test_name = 'a-{}-{}-{}'.format(distr_params, bloom_params, + privacy_params) + + params = (BLOOMFILTER_PARAMS[bloom_name] + + PRIVACY_PARAMS[privacy_params]) + test_case = (test_name, distr_params, num_values1, num_values2, + num_clients) + params + row_str = [str(element) for element in test_case] + rows.append(row_str) + # End of association tests + for row in rows: print ' '.join(row) diff --git a/tests/uvals.csv b/tests/uvals.csv index 986e994f..18600571 100644 --- a/tests/uvals.csv +++ b/tests/uvals.csv @@ -1,2 +1,2 @@ str1 -option1,option2,option3,option4,option5,option6 +option1 From 17d3f1fe2483ebd283844d08a5a9ecb445503c87 Mon Sep 17 00:00:00 2001 From: Ananth Raghunathan Date: Wed, 6 May 2015 18:34:23 -0700 Subject: [PATCH 04/67] Modifying files involved with generating summary. --- assoctest.sh | 34 ++-- tests/assoctest.html | 98 +++++++++++ tests/make_summary_assoc.py | 343 ++++++++++++++++++++++++++++++++++++ 3 files changed, 458 insertions(+), 17 deletions(-) create mode 100644 tests/assoctest.html create mode 100755 tests/make_summary_assoc.py diff --git a/assoctest.sh b/assoctest.sh index 7b806ef8..5bbabc30 100755 --- a/assoctest.sh +++ b/assoctest.sh @@ -184,23 +184,23 @@ _run-one-instance-logged() { || log "Test case $test_case_id (instance $test_case_run) failed" } -#make-summary() { -# local dir=$1 -# local filename=${2:-results.html} -# -# tests/make_summary.py $dir > $dir/rows.html -# -# pushd $dir >/dev/null -# -# cat ../../tests/regtest.html \ -# | sed -e '/TABLE_ROWS/ r rows.html' \ -# > $filename -# -# popd >/dev/null -# -# log "Wrote $dir/$filename" -# log "URL: file://$PWD/$dir/$filename" -#} +make-summary() { + local dir=$1 + local filename=${2:-results.html} + + tests/make_summary_assoc.py $dir > $dir/rows.html + + pushd $dir >/dev/null + + cat ../../tests/assoctest.html \ + | sed -e '/TABLE_ROWS/ r rows.html' \ + > $filename + + popd >/dev/null + + log "Wrote $dir/$filename" + log "URL: file://$PWD/$dir/$filename" +} test-error() { local spec_regex=${1:-} diff --git a/tests/assoctest.html b/tests/assoctest.html new file mode 100644 index 00000000..91ee25be --- /dev/null +++ b/tests/assoctest.html @@ -0,0 +1,98 @@ + + + + RAPPOR assoctest.sh + + + + + + +

RAPPOR regtest.sh

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tests/make_summary_assoc.py b/tests/make_summary_assoc.py new file mode 100755 index 00000000..ac3fb160 --- /dev/null +++ b/tests/make_summary_assoc.py @@ -0,0 +1,343 @@ +#!/usr/bin/python +"""Given a regtest result tree, prints an HTML summary on stdout. + +See HTML skeleton in tests/regtest.html. +""" + +import os +import re +import sys + + +SUMMARY_ROW = """\ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +""" + +# Navigation and links to plot. +DETAILS = """\ +

+ Up +

+ + + +

+ +

+ +

+%(name)s files +

+""" + + +def FormatFloat(x, percent): + """Formats a floating-point number.""" + if percent: + return '{:.1f}%'.format(x * 100.0) + else: + return '{:.3f}'.format(x) + + +def FormatMeanWithSem(m_std_error, percent=False): + """Formats an estimate with standard error.""" + if m_std_error is None: + return '' + m, std_error = m_std_error + if std_error is None: + return FormatFloat(m, percent) + else: + return '{}±{}'.format( + FormatFloat(m, percent), + FormatFloat(std_error, percent)) + + +def Mean(l): + """Computes the mean (average) for a list of numbers.""" + if l: + return float(sum(l)) / len(l) + else: + return None + + +def SampleVar(l): + """Computes the sample variance for a list of numbers.""" + if len(l) > 1: + mean = Mean(l) + var = sum([(x - mean) ** 2 for x in l]) / (len(l) - 1) + return var + else: + return None + + +def StandardErrorEstimate(l): + """Returns the standard error estimate for a list of numbers. + + For a singleton the standard error is assumed to be 10% of its value. + """ + if len(l) > 1: + return (SampleVar(l) / len(l)) ** .5 + elif l: + return l[0] / 10.0 + else: + return None + + +def MeanOfMeans(dict_of_lists): + """Returns the average of averages with the standard error of the estimate. + """ + means = [Mean(dict_of_lists[key]) for key in dict_of_lists + if dict_of_lists[key]] + if means: + # Compute variances of the estimate for each sublist. + se = [StandardErrorEstimate(dict_of_lists[key]) ** 2 for key + in dict_of_lists if dict_of_lists[key]] + return (Mean(means), # Mean over all sublists + sum(se) ** .5 / len(se)) # Standard deviation of the mean + else: + return None + + +def ParseSpecFile(spec_filename): + """Parses the spec (parameters) file. + + Returns: + An integer and a string. The integer is the number of bogus candidates + and the string is parameters in the HTML format. + """ + with open(spec_filename) as s: + spec_row = s.readline().split() + + spec_in_html = ' '.join('' % cell for cell in spec_row[1:]) + + return spec_in_html + + +def ExtractTime(log_filename): + """Extracts the elapsed time information from the log file. + + Returns: + Elapsed time (in seconds) or None in case of failure. + """ + if os.path.isfile(log_filename): + with open(log_filename) as log: + log_str = log.read() + # Matching a line output by analyze.R. + match = re.search(r'Inference took ([0-9.]+) seconds', log_str) + if match: + return float(match.group(1)) + return None + + +def ParseMetrics(metrics_file, log_file): + """Processes the metrics file. + + Args: + report_dir: A directory name containing metrics.csv and log.txt. + num_additional: A number of bogus candidates added to the candidate list. + + Returns a pair: + - A dictionary of metrics (some can be []). + - An HTML-formatted portion of the report row. + """ + with open(metrics_file) as m: + m.readline() + metrics_row = m.readline().split(',') + + (td_chisq, ed_chisq, l1d, rtime) = metrics_row + + td_chisq = float(td_chisq) + ed_chisq = float(ed_chisq) + + l1d = float(l1d) + rtime = float(rtime) + + elapsed_time = ExtractTime(log_file) + + metrics_row_str = [ + str(td_chisq), + str(ed_chisq), + str(l1d), + str(rtime), + ] + + metrics_row_dict = { + 'l1d': [l1d], + 'rtime': [rtime], + 'chisqdiff': [abs(td_chisq - ed_chisq)], + } + + # return metrics formatted as HTML table entries + return (metrics_row_dict, + ' '.join('' % cell for cell in metrics_row_str)) + + +def FormatCell1(test_case, test_instance, metrics_file, log_file, plot_file, + link_to_plots): + """Outputs an HTML table entry for the first cell of the row. + + The row is filled if the metrics file exist. The first cell contains a link + that for short tables points to a plot file inline, for large tables to an + external file. + + If the metrics file is missing, the link points to the log file (if one + exists) + """ + relpath_report = '{}/{}_report'.format(test_case, test_instance) + if os.path.isfile(metrics_file): + external_file = plot_file + if link_to_plots: + link = '#{}_{}'.format(test_case, test_instance) # anchor + else: + link = os.path.join(relpath_report, 'dist.png') + else: # no results likely due to an error, puts a link to the log file + external_file = log_file + link = os.path.join(relpath_report, 'log.txt') + + if os.path.isfile(external_file): + return ''.format(link, test_case) + else: # if no file to link to + return ''.format(test_case) + + +def FormatSummaryRow(metrics_lists): + """Outputs an HTML-formatted summary row.""" + means_with_sem = {} # SEM - standard error of the mean + + for key in metrics_lists: + means_with_sem[key] = MeanOfMeans(metrics_lists[key]) + # If none of the lists is longer than one element, drop the SEM component. + if means_with_sem[key] and max([len(l) for l in metrics_lists[key]]) < 2: + means_with_sem[key] = [means_with_sem[key][0], None] + + summary = { + 'name': 'Means', + 'mean_l1d': FormatMeanWithSem(means_with_sem['l1d'], percent=True), + 'mean_chisqdiff': FormatMeanWithSem(means_with_sem['chisqdiff'], percent=True), + 'mean_rtime': FormatMeanWithSem(means_with_sem['rtime']), + } + return SUMMARY_ROW % summary + + +def FormatPlots(base_dir, test_instances): + """Outputs HTML-formatted plots.""" + result = '' + for instance in test_instances: + # A test instance is identified by the test name and the test run. + test_case, test_instance, _ = instance.split(' ') + instance_dir = test_case + '/' + test_instance + '_report' + if os.path.isfile(os.path.join(base_dir, instance_dir, 'dist.png')): + result += DETAILS % {'anchor': test_case + '_' + test_instance, + 'name': '{} (instance {})'.format(test_case, + test_instance), + 'instance_dir': instance_dir} + return result + + +def main(argv): + base_dir = argv[1] + + # This file has the test case names, in the order that they should be + # displayed. + path = os.path.join(base_dir, 'test-instances.txt') + with open(path) as f: + test_instances = [line.strip() for line in f] + + # Metrics are assembled into a dictionary of dictionaries. The top-level + # key is the metric name ('tv', 'fpr', etc.), the second level key is + # the test case. These keys reference a list of floats, which can be empty. + metrics = { + 'l1d': {}, # l1 distance + 'rtime': {}, # R run time + 'chisqdiff': {}, # abs diff in values for the chisq test between true + # distr and estimated distr. + } + + # If there are too many tests, the plots are not included in the results + # file. Instead, rows' names are links to the corresponding .png files. + include_plots = len(test_instances) < 20 + + for instance in test_instances: + # A test instance is idenfied by the test name and the test run. + test_case, test_instance = instance.split(' ') + + spec_file = os.path.join(base_dir, test_case, 'spec.txt') + if not os.path.isfile(spec_file): + raise RuntimeError('{} is missing'.format(spec_file)) + + spec_html = ParseSpecFile(spec_file) + metrics_html = '' # will be filled in later on, if metrics exist + + report_dir = os.path.join(base_dir, test_case, test_instance + '_report') + + metrics_file = os.path.join(report_dir, 'metrics.csv') + log_file = os.path.join(report_dir, 'log.txt') + plot_file = os.path.join(report_dir, 'dist.png') + + cell1_html = FormatCell1(test_case, test_instance, metrics_file, log_file, + plot_file, include_plots) + + if os.path.isfile(metrics_file): + # ParseMetrics outputs an HTML table row and also updates lists + metrics_dict, metrics_html = ParseMetrics(metrics_file, log_file) + + # Update the metrics structure. Initialize dictionaries if necessary. + for m in metrics: + if not test_case in metrics[m]: + metrics[m][test_case] = metrics_dict[m] + else: + metrics[m][test_case] += metrics_dict[m] + + print '{}{}{}'.format(cell1_html, spec_html, metrics_html) + + print FormatSummaryRow(metrics) + + print '' + print '
+ Test Case + + Input Params + + RAPPOR Params + + Result Metrics +
+ d: distribution drawn from
+ u: total unique values
+ c: clients
+ v: values per client
+
+ k: report bits
+ h: hashes
+ m: cohorts
+ p, q, f: probabilities
+
+ td_chisq: chisq test on true distr.
+ ed_chisq: chisq test on est. distr.
+ l1d: l1 distance
+ rtime: R runtime
+
ducvkhmpqftd_chisqed_chisql1drtime
+ %(name)s + %(mean_l1d)s%(mean_rtime)s%(mean_chisqdiff)s
%s%s{}{}
' + print '

' # vertical space + + # Plot links. + if include_plots: + print FormatPlots(base_dir, test_instances) + else: + print ('

Too many tests to include plots. ' + 'Click links within rows for details.

') + + +if __name__ == '__main__': + try: + main(sys.argv) + except RuntimeError, e: + print >>sys.stderr, 'FATAL: %s' % e + sys.exit(1) From 9ab52bb68259df840cdcaa682921d238e719dcde Mon Sep 17 00:00:00 2001 From: Ananth Raghunathan Date: Thu, 7 May 2015 09:50:40 -0700 Subject: [PATCH 05/67] Cleaning up assoctest.sh --- assoctest.sh | 83 +++++++++---------------------------------- tests/regtest_spec.py | 6 ++-- 2 files changed, 20 insertions(+), 69 deletions(-) diff --git a/assoctest.sh b/assoctest.sh index 5bbabc30..80d9a067 100755 --- a/assoctest.sh +++ b/assoctest.sh @@ -1,28 +1,26 @@ #!/bin/bash # -# Run end-to-end tests in parallel. +# Run and end-to-end association test in parallel. # # Usage: -# ./regtest.sh +# ./assoctest.sh # At the end, it will print an HTML summary. # # Three main functions are -# run [[ [ []] - run tests matching in -# parallel, each times. The fast -# mode (T/F) shortcuts generation of -# reports. -# run-seq [ [ []] - ditto, except that tests are run -# sequentially -# run-all [] - run all tests, in parallel, each times +# run [[ []] - run tests matching in +# parallel, each times. +# +# ## run-seq currently not supported! +# run-seq [ []] - ditto, except that tests are run sequentially +# ## -- +# +# run-all [] - run all tests, in parallel, each times # # Examples: -# $ ./regtest.sh run-seq unif-small-typical # Sequential run, matches 1 case -# $ ./regtest.sh run-seq unif-small- 3 F # Sequential, each test is run three -# times, using slow generation -# $ ./regtest.sh run unif- # Parallel run, matches multiple cases -# $ ./regtest.sh run unif- 5 # Parallel run, matches multiple cases, each test -# is run 5 times +# $ ./regtest.sh run-seq tiny-8x16- # Sequential run, matches 2 cases +# $ ./regtest.sh run-seq tiny-8x16- 3 # Sequential, each test is run three +# times # $ ./regtest.sh run-all # Run all tests once # # The argument is a regex in 'grep -E' format. (Detail: Don't @@ -31,10 +29,6 @@ # than the number of CPUs on the machine. -# Future speedups: -# - Reuse the same input -- come up with naming scheme based on params -# - Reuse the same maps -- ditto, rappor library can cache it - set -o nounset set -o pipefail set -o errexit @@ -44,58 +38,17 @@ set -o errexit readonly THIS_DIR=$(dirname $0) readonly REPO_ROOT=$THIS_DIR readonly CLIENT_DIR=$REPO_ROOT/client/python -readonly REGTEST_DIR=_tmp/regtest readonly ASSOCTEST_DIR=_tmp/assoctest # All the Python tools need this export PYTHONPATH=$CLIENT_DIR -#print-true-inputs() { -# local num_unique_values=$1 -# seq 1 $num_unique_values | awk '{print "v" $1}' -#} - -# Add some more candidates here. We hope these are estimated at 0. -# e.g. if add_start=51, and num_additional is 20, show v51-v70 -#more-candidates() { -# local last_true=$1 -# local num_additional=$2 -# -# local begin -# local end -# begin=$(expr $last_true + 1) -# end=$(expr $last_true + $num_additional) -# -# seq $begin $end | awk '{print "v" $1}' -#} - -# Args: -# true_inputs: File of true inputs -# last_true: last true input, e.g. 50 if we generated "v1" .. "v50". -# num_additional: additional candidates to generate (starting at 'last_true') -# to_remove: Regex of true values to omit from the candidates list, or the -# string 'NONE' if none should be. (Our values look like 'v1', 'v2', etc. so -# there isn't any ambiguity.) -#print-candidates() { -# local true_inputs=$1 -# local last_true=$2 -# local num_additional=$3 -# local to_remove=$4 -# -# if test $to_remove = NONE; then -# cat $true_inputs # include all true inputs -# else -# egrep -v $to_remove $true_inputs # remove some true inputs -# fi -# more-candidates $last_true $num_additional -#} - # Generate a single test case, specified by a line of the test spec. # This is a helper function for _run_tests(). _setup-one-case() { local test_case=$1 - # input params + # Input parameters local dist=$2 local num_unique_values=$3 local num_unique_values2=$4 @@ -164,9 +117,6 @@ _run-one-instance() { --truefile $instance_dir/truedist.csv \ --outdir $out_dir \ --params $case_dir/case_params.csv - # Input prefix, output dir -# tests/analyze.R -t "Test case: $test_case (instance $test_instance)" \ -# "$case_dir/case" "$instance_dir/case" $out_dir } } @@ -297,8 +247,8 @@ _run-tests() { # local spec_regex=${1:-'^r-'} # grep -E format on the spec # local instances=${2:-1} # local fast_counts=${3:-T} -# -# _run-tests $spec_regex $instances T $fast_counts +# +# _run-tests $spec_regex $instances T $fast_counts #} # Run tests in parallel @@ -306,6 +256,7 @@ run-all() { local instances=${1:-1} log "Running all tests. Can take a while." + # a- for assoc tests _run-tests '^a-' $instances T T } diff --git a/tests/regtest_spec.py b/tests/regtest_spec.py index 0c5798f0..3961b39a 100755 --- a/tests/regtest_spec.py +++ b/tests/regtest_spec.py @@ -47,9 +47,9 @@ ('tiny', 100, 2, int(1e03), 1), # test for insufficient data ('small', 100, 10, int(1e04), 1), ('medium', 1000, 10, int(1e05), 1), - ('large', 1000, 10, int(1e06), 1), - ('mediumsquared', 1000, 100, int(1e05), 1), - ('largesquared', int(1e04), 100, int(1e06), 1), +# ('large', 1000, 10, int(1e06), 1), +# ('mediumsquared', 1000, 100, int(1e05), 1), +# ('largesquared', int(1e04), 100, int(1e06), 1), ) # 'k, h, m' as in params file. From 69fe145effe7666e1f53b5a80ea367f8ba6f3242 Mon Sep 17 00:00:00 2001 From: Ananth Raghunathan Date: Thu, 7 May 2015 11:25:30 -0700 Subject: [PATCH 06/67] Cleaning up code and summary HTML. --- assoctest.sh | 1 - tests/assoctest.html | 2 +- tests/make_summary_assoc.py | 13 ++++--------- 3 files changed, 5 insertions(+), 11 deletions(-) diff --git a/assoctest.sh b/assoctest.sh index 80d9a067..e37a4f8c 100755 --- a/assoctest.sh +++ b/assoctest.sh @@ -228,7 +228,6 @@ _run-tests() { | xargs -l -P $processors -- $0 $func || test-error log "Done running all test instances" - exit 1 make-summary $ASSOCTEST_DIR } diff --git a/tests/assoctest.html b/tests/assoctest.html index 91ee25be..80ef6515 100644 --- a/tests/assoctest.html +++ b/tests/assoctest.html @@ -18,7 +18,7 @@ -

RAPPOR regtest.sh

+

RAPPOR assoctest.sh

diff --git a/tests/make_summary_assoc.py b/tests/make_summary_assoc.py index ac3fb160..0558893d 100755 --- a/tests/make_summary_assoc.py +++ b/tests/make_summary_assoc.py @@ -30,16 +30,11 @@ - - - - - + %(mean_chisqdiff)s %(mean_l1d)s %(mean_rtime)s - %(mean_chisqdiff)s """ @@ -243,8 +238,8 @@ def FormatSummaryRow(metrics_lists): summary = { 'name': 'Means', - 'mean_l1d': FormatMeanWithSem(means_with_sem['l1d'], percent=True), - 'mean_chisqdiff': FormatMeanWithSem(means_with_sem['chisqdiff'], percent=True), + 'mean_l1d': FormatMeanWithSem(means_with_sem['l1d'], percent=False), + 'mean_chisqdiff': FormatMeanWithSem(means_with_sem['chisqdiff'], percent=False), 'mean_rtime': FormatMeanWithSem(means_with_sem['rtime']), } return SUMMARY_ROW % summary @@ -279,9 +274,9 @@ def main(argv): # the test case. These keys reference a list of floats, which can be empty. metrics = { 'l1d': {}, # l1 distance - 'rtime': {}, # R run time 'chisqdiff': {}, # abs diff in values for the chisq test between true # distr and estimated distr. + 'rtime': {}, # R run time } # If there are too many tests, the plots are not included in the results From deff9a20bdf2230beb25868e149916817392fd4f Mon Sep 17 00:00:00 2001 From: Ananth Raghunathan Date: Tue, 12 May 2015 22:19:12 +0000 Subject: [PATCH 07/67] Some small changes to help with test rig. --- tests/make_summary_assoc.py | 1 + tests/regtest_spec.py | 61 ++++++++++++++++++++----------------- 2 files changed, 34 insertions(+), 28 deletions(-) diff --git a/tests/make_summary_assoc.py b/tests/make_summary_assoc.py index 0558893d..dc16d3f1 100755 --- a/tests/make_summary_assoc.py +++ b/tests/make_summary_assoc.py @@ -282,6 +282,7 @@ def main(argv): # If there are too many tests, the plots are not included in the results # file. Instead, rows' names are links to the corresponding .png files. include_plots = len(test_instances) < 20 + include_plots = False for instance in test_instances: # A test instance is idenfied by the test name and the test run. diff --git a/tests/regtest_spec.py b/tests/regtest_spec.py index 3961b39a..3f192fe7 100755 --- a/tests/regtest_spec.py +++ b/tests/regtest_spec.py @@ -44,11 +44,12 @@ DISTRIBUTION_PARAMS_ASSOC = ( # name, num unique values 1, # num unique values 2, num clients, values per client - ('tiny', 100, 2, int(1e03), 1), # test for insufficient data - ('small', 100, 10, int(1e04), 1), +# ('tiny', 100, 2, int(1e03), 1), # test for insufficient data +# ('small', 100, 10, int(1e04), 1), ('medium', 1000, 10, int(1e05), 1), -# ('large', 1000, 10, int(1e06), 1), -# ('mediumsquared', 1000, 100, int(1e05), 1), + ('medium2', 1000, 2, int(1e05), 1), +# ('large', 10000, 10, int(1e06), 1), +# ('large2', 10000, 2, int(1e06), 1), # ('largesquared', int(1e04), 100, int(1e06), 1), ) @@ -56,14 +57,17 @@ BLOOMFILTER_PARAMS = { '8x16': (8, 2, 16), # 16 cohorts, 8 bits each, 2 bits set in each '8x32': (8, 2, 32), # 32 cohorts, 8 bits each, 2 bits set in each + '16x32': (16, 2, 32), # 32 cohorts, 16 bits each, 2 bits set in each '8x128': (8, 2, 128), # 128 cohorts, 8 bits each, 2 bits set in each - '128x128': (128, 2, 128), # 8 cohorts, 128 bits each, 2 bits set in each +# '128x128': (128, 2, 128), # 8 cohorts, 128 bits each, 2 bits set in each } # 'p, q, f' as in params file. PRIVACY_PARAMS = { - 'eps_1_1': (0.39, 0.61, 0.45), # eps_1 = 1, eps_inf = 5: - 'eps_1_5': (0.225, 0.775, 0.0), # eps_1 = 5, no eps_inf +# 'eps_1_1': (0.39, 0.61, 0.45), # eps_1 = 1, eps_inf = 5: +# 'eps_1_5': (0.225, 0.775, 0.0), # eps_1 = 5, no eps_inf + 'eps_verysmall': (0.125, 0.875, 0.125), + 'eps_small': (0.125, 0.875, 0.5), } # For deriving candidates from true inputs. @@ -92,26 +96,27 @@ def main(argv): rows = [] test_case = [] - for (distr_params, num_values, num_clients, - num_reports_per_client) in DISTRIBUTION_PARAMS: - for distribution in DISTRIBUTIONS: - for (config_name, bloom_name, privacy_params, fr_extra, - regex_missing) in TEST_CONFIGS: - test_name = 'r-{}-{}-{}'.format(distribution, distr_params, - config_name) - - params = (BLOOMFILTER_PARAMS[bloom_name] - + PRIVACY_PARAMS[privacy_params] - + tuple([int(num_values * fr_extra)]) - + tuple([MAP_REGEX_MISSING[regex_missing]])) - - test_case = (test_name, distribution, num_values, num_clients, - num_reports_per_client) + params - row_str = [str(element) for element in test_case] - rows.append(row_str) - - for params in DEMO: - rows.append(params) + if(False): + for (distr_params, num_values, num_clients, + num_reports_per_client) in DISTRIBUTION_PARAMS: + for distribution in DISTRIBUTIONS: + for (config_name, bloom_name, privacy_params, fr_extra, + regex_missing) in TEST_CONFIGS: + test_name = 'r-{}-{}-{}'.format(distribution, distr_params, + config_name) + + params = (BLOOMFILTER_PARAMS[bloom_name] + + PRIVACY_PARAMS[privacy_params] + + tuple([int(num_values * fr_extra)]) + + tuple([MAP_REGEX_MISSING[regex_missing]])) + + test_case = (test_name, distribution, num_values, num_clients, + num_reports_per_client) + params + row_str = [str(element) for element in test_case] + rows.append(row_str) + + for params in DEMO: + rows.append(params) # Association tests for (distr_params, num_values1, num_values2, num_clients, @@ -121,7 +126,7 @@ def main(argv): test_name = 'a-{}-{}-{}'.format(distr_params, bloom_params, privacy_params) - params = (BLOOMFILTER_PARAMS[bloom_name] + params = (BLOOMFILTER_PARAMS[bloom_params] + PRIVACY_PARAMS[privacy_params]) test_case = (test_name, distr_params, num_values1, num_values2, num_clients) + params From d9831c4e617cb0ec365369aec6806da21600f7b0 Mon Sep 17 00:00:00 2001 From: Ananth Raghunathan Date: Tue, 12 May 2015 16:07:08 -0700 Subject: [PATCH 08/67] Moving from l1 distance to t.v. = l1/2. --- tests/analyze_assoc.R | 8 ++++---- tests/assoctest.html | 12 ++++++------ 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/analyze_assoc.R b/tests/analyze_assoc.R index 50f98c33..c97602b5 100755 --- a/tests/analyze_assoc.R +++ b/tests/analyze_assoc.R @@ -144,15 +144,15 @@ main <- function(opts) { print("PROC.TIME") time_taken <- proc.time() - ptm print(time_taken) - + # Write metrics to metrics.csv metrics <- list(td_chisq = td_chisq[1][[1]][[1]], ed_chisq = ed_chisq[1][[1]][[1]], - l1d = l1d, time = time_taken[1]) + tv = l1d/2, time = time_taken[2]) # report l1 distance / 2 + # to be consistent with + # histogram analysis filename <- file.path(opts$outdir, 'metrics.csv') write.csv(metrics, file = filename, row.names = FALSE) } if(!interactive()) { - main(opts) -} diff --git a/tests/assoctest.html b/tests/assoctest.html index 80ef6515..c5004882 100644 --- a/tests/assoctest.html +++ b/tests/assoctest.html @@ -49,10 +49,10 @@

RAPPOR assoctest.sh

- d: distribution drawn from
+ d: distribution type
u: total unique values
- c: clients
- v: values per client
+ u2: total unique values 2
+ c: number of reports/clients
k: report bits
@@ -63,7 +63,7 @@

RAPPOR assoctest.sh

td_chisq: chisq test on true distr.
ed_chisq: chisq test on est. distr.
- l1d: l1 distance
+ tv: tot. var. distance
rtime: R runtime
@@ -73,8 +73,8 @@

RAPPOR assoctest.sh

d u + u2 c - v k h @@ -85,7 +85,7 @@

RAPPOR assoctest.sh

td_chisq ed_chisq - l1d + tv rtime From 6754f2d9fde44c6a3791d83cf13630c66cdbf26a Mon Sep 17 00:00:00 2001 From: Ananth Raghunathan Date: Mon, 18 May 2015 10:49:15 -0700 Subject: [PATCH 09/67] Pushing small changes. --- tests/assoc_sim.R | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/tests/assoc_sim.R b/tests/assoc_sim.R index 61ee822f..4ead3273 100755 --- a/tests/assoc_sim.R +++ b/tests/assoc_sim.R @@ -1,13 +1,13 @@ #!/usr/bin/env Rscript # # Copyright 2015 Google Inc. All rights reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -17,7 +17,7 @@ # Simulates inputs on which association analysis can be run. # Currently assoc_sim.R only supports 2 variables but can # be easily extended to support more. -# +# # Usage: # $ ./assoc_sim.R -n 1000 # Inputs: uvals, params, reports, map, num, unif @@ -54,11 +54,11 @@ if(!interactive()) { make_option(c("--extras", "-e"), default = TRUE, help = "Does 1st map have spurious candidates?"), make_option(c("--distr", "-d"), default = "zipfg", - help = "Type of distribution. Choose between + help = "Type of distribution. Choose between {unif, poisson, poisson2}") ) opts <- parse_args(OptionParser(option_list = option_list)) -} +} source("analysis/R/encode.R") source("analysis/R/decode.R") @@ -106,12 +106,12 @@ SimulateReports <- function(N, uvals, params, distr, extras, truefile, var1_num, var2_num, mapfile, reportsfile) { # Compute true distribution - m <- params$m + m <- params$m if (distr == "unif") { # Draw uniformly from 1 to 10 v1_samples <- as.integer(runif(N, 1, 10)) - + # Pr[var2 = N + 1 | var1 = N] = 0.5 # Pr[var2 = N | var1 = N] = 0.5 v2_samples <- v1_samples + sample(c(0, 1), N, replace = TRUE) @@ -149,12 +149,12 @@ SimulateReports <- function(N, uvals, params, distr, extras, truefile, # (i.e., D1 in reverse) # var2 ~ D1 if var1 = even # var2 ~ D2 if var1 = odd - d1 <- sample(rep(1:var2_num, + d1 <- sample(rep(1:var2_num, RandomPartition(N, ComputePdf("zipf1.5", var2_num)))) d2 <- (var2_num:1)[d1] v2_samples <- rep(1, N) v2_samples[v1_samples %% 2 == 0] <- d1[v1_samples %% 2 == 0] - v2_samples[v1_samples %% 2 == 1] <- d2[v1_samples %% 2 == 1] + v2_samples[v1_samples %% 2 == 1] <- d2[v1_samples %% 2 == 1] } tmp_samples <- list(v1_samples, v2_samples) @@ -191,7 +191,7 @@ SimulateReports <- function(N, uvals, params, distr, extras, truefile, row.names = TRUE, quote = FALSE) # Randomly assign cohorts in each dimension cohorts <- sample(1:m, N, replace = TRUE) - + # Create and write map into mapfile_1.csv and mapfile_2.csv if (extras == TRUE) { # 1000 spurious candidates for mapfile_1.csv @@ -203,7 +203,7 @@ SimulateReports <- function(N, uvals, params, distr, extras, truefile, sep = ",", col.names = FALSE, na = "", quote = FALSE) write.table(map[[2]]$map_pos, file = paste(mapfile, "_2.csv", sep = ""), sep = ",", col.names = FALSE, na = "", quote = FALSE) - + # Write reports into a csv file # Format: # cohort, bloom filter var1, bloom filter var2 @@ -211,7 +211,7 @@ SimulateReports <- function(N, uvals, params, distr, extras, truefile, EncodeAll(samples[[i]], cohorts, map[[i]]$map, params)) # Organize cohorts and reports into format write_matrix <- cbind(as.matrix(cohorts), - as.matrix(lapply(reports[[1]], + as.matrix(lapply(reports[[1]], function(x) paste(x, collapse = ""))), as.matrix(lapply(reports[[2]], function(x) paste(x, collapse = "")))) @@ -221,7 +221,7 @@ SimulateReports <- function(N, uvals, params, distr, extras, truefile, main <- function(opts) { ptm <- proc.time() - + if(is.null(opts$uvals)) { uvals = list(var1 = c("str1"), var2 = c("option1")) } else { @@ -232,7 +232,7 @@ main <- function(opts) { opts$extras, opts$true, # inputs opts$var1_num, opts$var2_num, # inputs opts$map, opts$reports) # outputs - + print("PROC.TIME") print(proc.time() - ptm) } From 45ee2f80014140a43d3a4bf974f5976825fb335b Mon Sep 17 00:00:00 2001 From: Ananth Raghunathan Date: Mon, 18 May 2015 15:02:43 -0700 Subject: [PATCH 10/67] Fixing inconsistencies in map objects. --- tests/analyze_assoc.R | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/tests/analyze_assoc.R b/tests/analyze_assoc.R index c97602b5..8948e4ac 100755 --- a/tests/analyze_assoc.R +++ b/tests/analyze_assoc.R @@ -71,15 +71,9 @@ source("analysis/R/association.R") # TODO(pseudorandom): move this functionality to ReadMapFile ProcessMap <- function(map, params) { map$rmap <- map$map - split_map <- function(i, map_struct) { - numbits <- params$k - indices <- which(as.matrix( - map_struct[((i - 1) * numbits + 1):(i * numbits),]) == TRUE, - arr.ind = TRUE) - sparseMatrix(indices[, "row"], indices[, "col"], - dims = c(numbits, max(indices[, "col"]))) - } - map$map <- lapply(1:params$m, function(i) split_map(i, map$rmap)) + map$map <- lapply(1:params$m, function(i) + map$rmap[seq(from = (i - 1) * params$k + 1), + length.out = params$k),]) map } @@ -156,3 +150,5 @@ main <- function(opts) { } if(!interactive()) { + main(opts) +} From c9484c57eda478bf2bc30c0ce4332d23d0237795 Mon Sep 17 00:00:00 2001 From: Ananth Raghunathan Date: Mon, 18 May 2015 15:51:58 -0700 Subject: [PATCH 11/67] Re-factoring regtest_spec.py to suit assoc better. --- tests/regtest_spec.py | 94 +++++++++++++++++++++++-------------------- 1 file changed, 50 insertions(+), 44 deletions(-) diff --git a/tests/regtest_spec.py b/tests/regtest_spec.py index 3f192fe7..98ca1fa1 100755 --- a/tests/regtest_spec.py +++ b/tests/regtest_spec.py @@ -41,17 +41,17 @@ ('large', 10000, 100000000, 1), ) -DISTRIBUTION_PARAMS_ASSOC = ( +DISTRIBUTION_PARAMS_ASSOC = { # name, num unique values 1, # num unique values 2, num clients, values per client -# ('tiny', 100, 2, int(1e03), 1), # test for insufficient data -# ('small', 100, 10, int(1e04), 1), - ('medium', 1000, 10, int(1e05), 1), - ('medium2', 1000, 2, int(1e05), 1), -# ('large', 10000, 10, int(1e06), 1), -# ('large2', 10000, 2, int(1e06), 1), -# ('largesquared', int(1e04), 100, int(1e06), 1), -) + 'tiny': (100, 2, int(1e03), 1), # test for insufficient data + 'small': (100, 10, int(1e04), 1), + 'medium': (1000, 10, int(1e05), 1), + 'medium2': (1000, 2, int(1e05), 1), + 'large': (10000, 10, int(1e06), 1), + 'large2': (10000, 2, int(1e06), 1), + 'largesquared': (int(1e04), 100, int(1e06), 1), +} # 'k, h, m' as in params file. BLOOMFILTER_PARAMS = { @@ -59,13 +59,13 @@ '8x32': (8, 2, 32), # 32 cohorts, 8 bits each, 2 bits set in each '16x32': (16, 2, 32), # 32 cohorts, 16 bits each, 2 bits set in each '8x128': (8, 2, 128), # 128 cohorts, 8 bits each, 2 bits set in each -# '128x128': (128, 2, 128), # 8 cohorts, 128 bits each, 2 bits set in each + '128x128': (128, 2, 128), # 8 cohorts, 128 bits each, 2 bits set in each } # 'p, q, f' as in params file. PRIVACY_PARAMS = { -# 'eps_1_1': (0.39, 0.61, 0.45), # eps_1 = 1, eps_inf = 5: -# 'eps_1_5': (0.225, 0.775, 0.0), # eps_1 = 5, no eps_inf + 'eps_1_1': (0.39, 0.61, 0.45), # eps_1 = 1, eps_inf = 5: + 'eps_1_5': (0.225, 0.775, 0.0), # eps_1 = 5, no eps_inf 'eps_verysmall': (0.125, 0.875, 0.125), 'eps_small': (0.125, 0.875, 0.5), } @@ -87,6 +87,17 @@ ('over_x10', '8x128', 'eps_1_1', 10.0, '10%'), # overshoot by x10 ] +# assoc test configuration -> +# (distribution params set, bloomfilter params set, +# privacy params set) +# The test config runs a test suite that is the cross product of all the above +# sets +ASSOC_TEST_CONFIG = { + 'distr': ('small', 'medium'), + 'blooms': ('8x16', '8x32', '16x32'), + 'privacy': ('eps_verysmall', 'eps_small'), +} + # # END TEST CONFIGURATION # @@ -96,40 +107,35 @@ def main(argv): rows = [] test_case = [] - if(False): - for (distr_params, num_values, num_clients, - num_reports_per_client) in DISTRIBUTION_PARAMS: - for distribution in DISTRIBUTIONS: - for (config_name, bloom_name, privacy_params, fr_extra, - regex_missing) in TEST_CONFIGS: - test_name = 'r-{}-{}-{}'.format(distribution, distr_params, - config_name) - - params = (BLOOMFILTER_PARAMS[bloom_name] - + PRIVACY_PARAMS[privacy_params] - + tuple([int(num_values * fr_extra)]) - + tuple([MAP_REGEX_MISSING[regex_missing]])) - - test_case = (test_name, distribution, num_values, num_clients, - num_reports_per_client) + params - row_str = [str(element) for element in test_case] - rows.append(row_str) - - for params in DEMO: - rows.append(params) + for (distr_params, num_values, num_clients, + num_reports_per_client) in DISTRIBUTION_PARAMS: + for distribution in DISTRIBUTIONS: + for (config_name, bloom_name, privacy_params, fr_extra, + regex_missing) in TEST_CONFIGS: + test_name = 'r-{}-{}-{}'.format(distribution, distr_params, + config_name) + + params = (BLOOMFILTER_PARAMS[bloom_name] + + PRIVACY_PARAMS[privacy_params] + + tuple([int(num_values * fr_extra)]) + + tuple([MAP_REGEX_MISSING[regex_missing]])) + + test_case = (test_name, distribution, num_values, num_clients, + num_reports_per_client) + params + row_str = [str(element) for element in test_case] + rows.append(row_str) + + for params in DEMO: + rows.append(params) # Association tests - for (distr_params, num_values1, num_values2, num_clients, - num_reports_per_client) in DISTRIBUTION_PARAMS_ASSOC: - for bloom_params in BLOOMFILTER_PARAMS: - for privacy_params in PRIVACY_PARAMS: - test_name = 'a-{}-{}-{}'.format(distr_params, bloom_params, - privacy_params) - - params = (BLOOMFILTER_PARAMS[bloom_params] - + PRIVACY_PARAMS[privacy_params]) - test_case = (test_name, distr_params, num_values1, num_values2, - num_clients) + params + for distr in ASSOC_TEST_CONFIG['distr']: + for blooms in ASSOC_TEST_CONFIG['blooms']: + for privacy in ASSOC_TEST_CONFIG['privacy']: + test_name = 'a-{}-{}-{}'.format(distr, blooms, privacy) + params = (BLOOMFILTER_PARAMS[blooms] + + PRIVACY_PARAMS[privacy]) + test_case = (test_name,) + DISTRIBUTION_PARAMS_ASSOC[distr] + params row_str = [str(element) for element in test_case] rows.append(row_str) # End of association tests From b61f251582c3fb5861b37794fc7a6909e05d5e81 Mon Sep 17 00:00:00 2001 From: Ananth Raghunathan Date: Mon, 18 May 2015 21:09:23 -0700 Subject: [PATCH 12/67] Modifications to allow the use of the new decode. Quick mode only requires 2 LASSO calls for estimating std dev instead of 5. Minor changes in several places including modifications to the test suite and assoc test params. --- analysis/R/association.R | 4 ++-- analysis/R/decode.R | 21 +++++++++++++-------- assoctest.sh | 24 +++++++++++------------- tests/analyze_assoc.R | 5 +++-- tests/assoc_sim.R | 20 ++++++++++---------- tests/assoctest.html | 18 ++++++++---------- tests/make_summary_assoc.py | 1 - tests/regtest_spec.py | 23 ++++++++++++----------- 8 files changed, 59 insertions(+), 57 deletions(-) diff --git a/analysis/R/association.R b/analysis/R/association.R index c5b23e26..dd3080fc 100644 --- a/analysis/R/association.R +++ b/analysis/R/association.R @@ -283,7 +283,7 @@ UpdateJointConditional <- function(cond_report_dist, joint_conditional = NULL) { ComputeDistributionEM <- function(reports, report_cohorts, maps, ignore_other = FALSE, - params, + params, quick = FALSE, marginals = NULL, estimate_var = FALSE) { # Computes the distribution of num_variables variables, where @@ -322,7 +322,7 @@ ComputeDistributionEM <- function(reports, report_cohorts, variable_counts <- NULL if (is.null(marginals)) { variable_counts <- ComputeCounts(variable_report, variable_cohort, params) - marginal <- Decode(variable_counts, map$rmap, params)$fit + marginal <- Decode(variable_counts, map$rmap, params, quick)$fit if (nrow(marginal) == 0) { return (NULL) } diff --git a/analysis/R/decode.R b/analysis/R/decode.R index b965ebdd..4fae9d86 100644 --- a/analysis/R/decode.R +++ b/analysis/R/decode.R @@ -254,7 +254,7 @@ Resample <- function(e) { result } -Decode <- function(counts, map, params, alpha = 0.05, +Decode <- function(counts, map, params, quick = FALSE, alpha = 0.05, correction = c("Bonferroni"), ...) { k <- params$k p <- params$p @@ -280,27 +280,32 @@ Decode <- function(counts, map, params, alpha = 0.05, stds = es$stds[filter_cohorts, , drop = FALSE]) coefs_all <- vector() - - for(r in 1:5) + if(quick) {num_reps <- 2} else {num_reps <- 5} + for(r in 1:num_reps) { if(r > 1) e <- Resample(estimates_stds_filtered) else e <- estimates_stds_filtered - + coefs_all <- rbind(coefs_all, - FitDistribution(e, map[filter_bits, , drop = FALSE])) + FitDistribution(e, map[filter_bits, , drop = FALSE])) } - coefs_ssd <- N * apply(coefs_all, 2, sd) # compute sample standard deviations coefs_ave <- N * apply(coefs_all, 2, mean) - + # Only select coefficients more than two standard deviations from 0. May # inflate empirical SD of the estimates. reported <- which(coefs_ave > 1E-6 + 2 * coefs_ssd) - + mod <- list(coefs = coefs_ave[reported], stds = coefs_ssd[reported]) +# Old code ... +# coefs_all <- FitDistribution(estimates_stds_filtered, +# map[filter_bits, , drop = FALSE]) +# reported <- which(coefs_all > 1E-6) +# mod <- list(coefs = coefs_all[reported], stds = rep(0, length(reported))) + if (correction == "Bonferroni") { alpha <- alpha / S } diff --git a/assoctest.sh b/assoctest.sh index e37a4f8c..61eec301 100755 --- a/assoctest.sh +++ b/assoctest.sh @@ -49,19 +49,17 @@ _setup-one-case() { local test_case=$1 # Input parameters - local dist=$2 - local num_unique_values=$3 - local num_unique_values2=$4 - local num_clients=$5 - local values_per_client=$6 + local num_unique_values=$2 + local num_unique_values2=$3 + local num_clients=$4 # RAPPOR params - local num_bits=$6 - local num_hashes=$7 - local num_cohorts=$8 - local p=$9 - local q=${10} # need curly braces to get the 10th arg - local f=${11} + local num_bits=$5 + local num_hashes=$6 + local num_cohorts=$7 + local p=$8 + local q=$9 # need curly braces to get the 10th arg + local f=${10} banner 'Setting up parameters and candidate files for '$test_case @@ -85,7 +83,7 @@ _run-one-instance() { local case_dir=$ASSOCTEST_DIR/$test_case - read -r case_name case_descr num_unique_values num_unique_values2 \ + read -r case_name num_unique_values num_unique_values2 \ num_clients num_bits num_hashes num_cohorts p q f < $case_dir/spec.txt local instance_dir=$ASSOCTEST_DIR/$test_case/$test_instance @@ -99,7 +97,7 @@ _run-one-instance() { -t $instance_dir/truedist.csv \ -m $instance_dir/map \ -n $num_clients \ - --var1_num $num_unique_values \ + --extras $num_unique_values \ --var2_num $num_unique_values2 local out_dir=${instance_dir}_report diff --git a/tests/analyze_assoc.R b/tests/analyze_assoc.R index 8948e4ac..c58cc345 100755 --- a/tests/analyze_assoc.R +++ b/tests/analyze_assoc.R @@ -72,7 +72,7 @@ source("analysis/R/association.R") ProcessMap <- function(map, params) { map$rmap <- map$map map$map <- lapply(1:params$m, function(i) - map$rmap[seq(from = (i - 1) * params$k + 1), + map$rmap[seq(from = ((i - 1) * params$k + 1), length.out = params$k),]) map } @@ -110,6 +110,7 @@ main <- function(opts) { joint_dist <- ComputeDistributionEM(reports, cohorts, map, ignore_other = TRUE, + quick = TRUE, params, marginals = NULL, estimate_var = FALSE) @@ -142,7 +143,7 @@ main <- function(opts) { # Write metrics to metrics.csv metrics <- list(td_chisq = td_chisq[1][[1]][[1]], ed_chisq = ed_chisq[1][[1]][[1]], - tv = l1d/2, time = time_taken[2]) # report l1 distance / 2 + tv = l1d/2, time = time_taken[1]) # report l1 distance / 2 # to be consistent with # histogram analysis filename <- file.path(opts$outdir, 'metrics.csv') diff --git a/tests/assoc_sim.R b/tests/assoc_sim.R index 4ead3273..3b8e89c5 100755 --- a/tests/assoc_sim.R +++ b/tests/assoc_sim.R @@ -47,15 +47,15 @@ if(!interactive()) { help = "Filename *prefix* for map(s)"), make_option(c("--num", "-n"), default = 1e05, help = "Number of reports"), - make_option(c("--var1_num", "-z"), default = 25, + make_option(c("--var1_num", "-z"), default = 40, help = "Number of values for var1"), make_option(c("--var2_num", "-y"), default = 5, help = "Number of values for var2"), - make_option(c("--extras", "-e"), default = TRUE, - help = "Does 1st map have spurious candidates?"), - make_option(c("--distr", "-d"), default = "zipfg", + make_option(c("--extras", "-e"), default = 1000, + help = "How many spurious candidates does the 1st map have?"), + make_option(c("--distr", "-d"), default = "zipf2", help = "Type of distribution. Choose between - {unif, poisson, poisson2}") + {unif, poisson, poisson2, zipf2}") ) opts <- parse_args(OptionParser(option_list = option_list)) } @@ -99,7 +99,7 @@ GetUniqueValsFromFile <- function(filename) { # truefile = name of the file with true distribution # var1_num = number of var1 candidates # var2_num = number of var2 candidates -# *** CURRENTLY ONLY USEFUL IF DISTR = ZIPFG *** +# *** FOR ASSOCTEST TEST SUITE, USE ONLY ZIPF2 *** # mapfile = file to write maps into (with .csv suffixes) # reportsfile = file to write reports into (with .csv suffix) SimulateReports <- function(N, uvals, params, distr, extras, truefile, @@ -134,7 +134,7 @@ SimulateReports <- function(N, uvals, params, distr, extras, truefile, v2_samples <- rep(1, N) v2_samples[v1_samples %% 2 == 0] <- pr25[v1_samples %% 2 == 0] v2_samples[v1_samples %% 2 == 1] <- pr75[v1_samples %% 2 == 1] - } else if (distr == "zipfg") { + } else if (distr == "zipf2") { # Zipfian over var1_num strings partition <- RandomPartition(N, ComputePdf("zipf1.5", var1_num)) @@ -193,9 +193,9 @@ SimulateReports <- function(N, uvals, params, distr, extras, truefile, cohorts <- sample(1:m, N, replace = TRUE) # Create and write map into mapfile_1.csv and mapfile_2.csv - if (extras == TRUE) { - # 1000 spurious candidates for mapfile_1.csv - len <- length(uvals[[1]]) + 1000 + if (extras > 0) { + # spurious candidates for mapfile_1.csv + len <- length(uvals[[1]]) + as.numeric(extras) uvals[[1]] <- PadStrings(len, uvals[[1]]) } map <- lapply(uvals, function(u) CreateMap(u, params)) diff --git a/tests/assoctest.html b/tests/assoctest.html index c5004882..7fc6aff0 100644 --- a/tests/assoctest.html +++ b/tests/assoctest.html @@ -25,7 +25,7 @@

RAPPOR assoctest.sh

- + @@ -35,7 +35,7 @@

RAPPOR assoctest.sh

- - - - + - + diff --git a/tests/make_summary_assoc.py b/tests/make_summary_assoc.py index dc16d3f1..2c959971 100755 --- a/tests/make_summary_assoc.py +++ b/tests/make_summary_assoc.py @@ -20,7 +20,6 @@ - diff --git a/tests/regtest_spec.py b/tests/regtest_spec.py index 98ca1fa1..47feb470 100755 --- a/tests/regtest_spec.py +++ b/tests/regtest_spec.py @@ -43,14 +43,14 @@ DISTRIBUTION_PARAMS_ASSOC = { # name, num unique values 1, - # num unique values 2, num clients, values per client - 'tiny': (100, 2, int(1e03), 1), # test for insufficient data - 'small': (100, 10, int(1e04), 1), - 'medium': (1000, 10, int(1e05), 1), - 'medium2': (1000, 2, int(1e05), 1), - 'large': (10000, 10, int(1e06), 1), - 'large2': (10000, 2, int(1e06), 1), - 'largesquared': (int(1e04), 100, int(1e06), 1), + # num unique values 2, num clients + 'tiny': (100, 2, int(1e03)), # test for insufficient data + 'small': (100, 10, int(1e04)), + 'medium': (1000, 10, int(1e05)), + 'medium2': (1000, 2, int(1e05)), + 'large': (10000, 10, int(1e06)), + 'large2': (10000, 2, int(1e06)), + 'largesquared': (int(1e04), 100, int(1e06)), } # 'k, h, m' as in params file. @@ -93,9 +93,9 @@ # The test config runs a test suite that is the cross product of all the above # sets ASSOC_TEST_CONFIG = { - 'distr': ('small', 'medium'), - 'blooms': ('8x16', '8x32', '16x32'), - 'privacy': ('eps_verysmall', 'eps_small'), + 'distr': ('small',),# 'medium'), + 'blooms': ('8x16',), # '8x32', '16x32'), + 'privacy': ('eps_verysmall',), # 'eps_small'), } # @@ -132,6 +132,7 @@ def main(argv): for distr in ASSOC_TEST_CONFIG['distr']: for blooms in ASSOC_TEST_CONFIG['blooms']: for privacy in ASSOC_TEST_CONFIG['privacy']: + print distr, blooms, privacy test_name = 'a-{}-{}-{}'.format(distr, blooms, privacy) params = (BLOOMFILTER_PARAMS[blooms] + PRIVACY_PARAMS[privacy]) From a7e69eba11287c662ab88f76d0d5d15e084ff6f3 Mon Sep 17 00:00:00 2001 From: Ananth Raghunathan Date: Thu, 4 Jun 2015 15:16:11 -0700 Subject: [PATCH 13/67] Updates on association test suite. - Contains code fragments to experiment with EM - More metrics reported (support recovered in Decode for each var) - Timing information (experimental) - Default params in assoc_sim are modified to match use cases closer - Different metrics presented in association test suite results page --- analysis/R/association.R | 157 +++++++++++++++++++++++++++++++++--- tests/analyze_assoc.R | 43 +++++----- tests/assoc_sim.R | 27 ++++--- tests/assoctest.html | 26 +++--- tests/make_summary_assoc.py | 28 ++++--- tests/regtest_spec.py | 6 +- 6 files changed, 221 insertions(+), 66 deletions(-) diff --git a/analysis/R/association.R b/analysis/R/association.R index dd3080fc..d50bd490 100644 --- a/analysis/R/association.R +++ b/analysis/R/association.R @@ -137,6 +137,33 @@ GetJointConditionalProb <- function(cond_x, cond_y) { mapply("outer", cond_x, cond_y, SIMPLIFY = FALSE) } +UpdatePij2 <- function(pij, reports, cohorts, cand_strs, + params, map) { + + accum <- array(0, dim(pij)) + # For each report + for (i in seq(length(reports[[1]]))) { + # For each var + for (var in seq(length(reports))) { + idx <- cohorts[[var]][i] + rep <- GetCondProb(reports[[var]][[i]], + candidate_strings = cand_strs[[var]], + params = params, + map[[var]]$map[[idx]], NULL) + if(var == 1) { + cond_joint_distr <- rep + } else { + cond_joint_distr <- outer(cond_joint_distr, rep) + } + } + z <- cond_joint_distr * pij + z <- z / sum(z) + z[is.nan(z)] <- 0 + accum <- accum + z + } + accum / length(reports[[1]]) +} + UpdatePij <- function(pij, cond_prob) { # Update the probability matrix based on the EM algorithm. # @@ -155,6 +182,23 @@ UpdatePij <- function(pij, cond_prob) { Reduce("+", wcp) / length(wcp) } +UpdatePij3 <- function(pij, cond_prob) { + wcp <- lapply(cond_prob, function(x) { + for (i in seq(length(x))) { + if (i == 1) { + op <- x[[i]] + } else { + op <- outer(op, x[[i]]) + } + } + z <- op * pij + z <- z / sum(z) + z[is.nan(z)] <- 0 + z + }) + Reduce("+", wcp) / length(wcp) +} + NLL <- function(pij, cond_prob) { # Update the probability matrix based on the EM algorithm. # @@ -186,6 +230,62 @@ ComputeVar <- function(cond_prob, est) { list(var_cov = var_cov, sd = sd, inform = inform) } +EM2 <- function(reports, cohorts, cand_strs, starting_pij = NULL, + params, map, + max_iter = 1e03, epsilon = 1e-06) { + + # State space is the product of lengths. + state_space <- sapply(cand_strs, "length") + pij <- array() + if(is.null(starting_pij)) { + pij <- array(1 / prod(state_space), state_space) + } else { + pij <- starting_pij + } + + if (nrow(pij) > 0) { + # Run EM + for (i in 1:max_iter) { + pij_new <- UpdatePij2(pij, reports, cohorts, cand_strs, + params, map) + diff <- max(abs(pij_new - pij)) + pij <- pij_new + if (diff < epsilon) { + break + } + } + } + list(hist = pij) +} + +EM3 <- function(cond_prob, starting_pij = NULL, estimate_var = FALSE, + max_iter = 1e03, epsilon = 1e-06, verbose = FALSE) { + pij <- list() + + # Compute dimensions of conditional distributions. + state_space <- sapply(cond_prob[[1]], length) + if (is.null(starting_pij)) { + pij <- array(1 / prod(state_space), state_space) + } else { + pij <- starting_pij + } + if (nrow(pij) > 0) { + # Run EM + for (i in 1:max_iter) { + if (i == 1) { + ptm_iter <- proc.time() + } + pij_new <- UpdatePij3(pij, cond_prob) + diff <- max(abs(pij_new - pij)) + pij <- pij_new + if (diff < epsilon) { + break + } + } + } + list(est = pij, hist = pij, sd = 0) +} + EM <- function(cond_prob, starting_pij = NULL, estimate_var = FALSE, max_iter = 1000, epsilon = 10^-6, verbose = FALSE) { # Performs estimation. @@ -213,8 +313,15 @@ EM <- function(cond_prob, starting_pij = NULL, estimate_var = FALSE, if (nrow(pij[[1]]) > 0) { # Run EM for (i in 1:max_iter) { + if (i == 1) { + ptm_iter <- proc.time() + } pij[[i + 1]] <- UpdatePij(pij[[i]], cond_prob) dif <- max(abs(pij[[i + 1]] - pij[[i]])) + if (i == 1) { + print("ONE ITERATION") + print(proc.time() - ptm_iter) + } if (dif < epsilon) { break } @@ -285,7 +392,8 @@ ComputeDistributionEM <- function(reports, report_cohorts, maps, ignore_other = FALSE, params, quick = FALSE, marginals = NULL, - estimate_var = FALSE) { + estimate_var = FALSE, + new_alg = FALSE) { # Computes the distribution of num_variables variables, where # num_variables is chosen by the client, using the EM algorithm. # @@ -312,17 +420,22 @@ ComputeDistributionEM <- function(reports, report_cohorts, # Compute the counts for each variable and then do conditionals. joint_conditional = NULL found_strings <- list() - + cd_for_reports <- list() + for (j in (1:num_variables)) { + ptm <- proc.time() variable_report <- reports[[j]] variable_cohort <- report_cohorts[[j]] map <- maps[[j]] - + # Compute the probability of the "other" category variable_counts <- NULL if (is.null(marginals)) { + ptm2 <- proc.time() variable_counts <- ComputeCounts(variable_report, variable_cohort, params) marginal <- Decode(variable_counts, map$rmap, params, quick)$fit + print("TIME IN MARGINALS") + print(proc.time() - ptm2) if (nrow(marginal) == 0) { return (NULL) } @@ -353,17 +466,39 @@ ComputeDistributionEM <- function(reports, report_cohorts, prob_other[[idx]]) rep }) - - # Update the joint conditional distribution of all variables - joint_conditional <- UpdateJointConditional(cond_report_dist, + + if(new_alg) { + # Report conditional distributions as lists + if (j == 1) { + # Conditional distribution for reports + joint_conditional <- lapply(cond_report_dist, "list") + } else { + joint_conditional <- mapply(function (x, y) c(x, list(y)), + joint_conditional, cond_report_dist, + SIMPLIFY = FALSE) + } + } else { + # Update the joint conditional distribution of all variables + joint_conditional <- UpdateJointConditional(cond_report_dist, joint_conditional) + } + print("TIME IN COND_REPORT_DIST") + print(proc.time()-ptm) } - + + ptm <- proc.time() # Run expectation maximization to find joint distribution - em <- EM(joint_conditional, epsilon = 10 ^ -6, verbose = FALSE, + if (new_alg) { + funct <- EM3 + } else { + funct <- EM + } + em <- funct(joint_conditional, epsilon = 10 ^ -5, verbose = FALSE, estimate_var = estimate_var) + print("TIME IN EM") + print(proc.time() - ptm) dimnames(em$est) <- found_strings + # Return results in a usable format - list(fit = em$est, sd = em$sd, em = em) - -} + list(orig = list(fit = em$est, sd = em$sd, em = em)) +} \ No newline at end of file diff --git a/tests/analyze_assoc.R b/tests/analyze_assoc.R index c58cc345..ade385c0 100755 --- a/tests/analyze_assoc.R +++ b/tests/analyze_assoc.R @@ -48,7 +48,9 @@ if(!interactive()) { make_option(c("--outdir", "-o"), default = ".", help = "File where the metrics go"), make_option(c("--params", "-p"), default = "params.csv", - help = "Filename for RAPPOR parameters") + help = "Filename for RAPPOR parameters"), + make_option(c("--newalg", "-a"), default = FALSE, + help = "Flag to run new EM3 algorithm or not") ) opts <- parse_args(OptionParser(option_list = option_list)) } @@ -112,28 +114,19 @@ main <- function(opts) { ignore_other = TRUE, quick = TRUE, params, marginals = NULL, - estimate_var = FALSE) + estimate_var = FALSE, + new_alg = opts$newalg) td <- read.csv(file = opts$truefile) - ed <- joint_dist$fit + ed <- joint_dist$orig$fit print("CHI-SQUARED") td_chisq <- chisq.test(td) ed_chisq <- chisq.test(ed) print(td_chisq) print(ed_chisq) - # L1 distance = 1 - sum(min(td|x, ed|x)) where - # td|x / ed|x projects the distribution to the intersection x of the - # supports of td and ed - rowsi <- intersect(rownames(td), rownames(ed)) - colsi <- intersect(colnames(td), colnames(ed)) - print("L1 DISTANCE") - l1d <- 1 - sum(mapply(min, - unlist(td[rowsi, colsi], use.names = FALSE), - unlist(as.data.frame(ed)[rowsi, colsi], use.names = FALSE) - )) - print(l1d) - + print(l1d(td, ed, "L1 DISTANCE")) + print("JOINT_DIST$FIT") print(signif(ed[order(rowSums(ed)),], 4)) print("PROC.TIME") @@ -141,15 +134,29 @@ main <- function(opts) { print(time_taken) # Write metrics to metrics.csv + # Report l1 distance / 2 to be consistent with histogram analysis metrics <- list(td_chisq = td_chisq[1][[1]][[1]], ed_chisq = ed_chisq[1][[1]][[1]], - tv = l1d/2, time = time_taken[1]) # report l1 distance / 2 - # to be consistent with - # histogram analysis + tv = l1d(td, ed, "L1 DISTANCE")/2, + time = time_taken[1], + dim1 = dim(ed)[[1]], + dim2 = dim(ed)[[2]]) filename <- file.path(opts$outdir, 'metrics.csv') write.csv(metrics, file = filename, row.names = FALSE) } +# L1 distance = 1 - sum(min(df1|x, df2|x)) where +# df1|x / df2|x projects the distribution to the intersection x of the +# supports of df1 and df2 +l1d <- function(df1, df2, statement = "L1 DISTANCE") { + rowsi <- intersect(rownames(df1), rownames(df2)) + colsi <- intersect(colnames(df1), colnames(df2)) + print(statement) + 1 - sum(mapply(min, + unlist(as.data.frame(df1)[rowsi, colsi], use.names = FALSE), + unlist(as.data.frame(df2)[rowsi, colsi], use.names = FALSE))) +} + if(!interactive()) { main(opts) } diff --git a/tests/assoc_sim.R b/tests/assoc_sim.R index 3b8e89c5..e93918e4 100755 --- a/tests/assoc_sim.R +++ b/tests/assoc_sim.R @@ -47,19 +47,25 @@ if(!interactive()) { help = "Filename *prefix* for map(s)"), make_option(c("--num", "-n"), default = 1e05, help = "Number of reports"), - make_option(c("--var1_num", "-z"), default = 40, + make_option(c("--var1_num", "-z"), default = 100, help = "Number of values for var1"), - make_option(c("--var2_num", "-y"), default = 5, + make_option(c("--var2_num", "-y"), default = 20, help = "Number of values for var2"), - make_option(c("--extras", "-e"), default = 1000, + make_option(c("--extras", "-e"), default = 1e05, help = "How many spurious candidates does the 1st map have?"), make_option(c("--distr", "-d"), default = "zipf2", help = "Type of distribution. Choose between - {unif, poisson, poisson2, zipf2}") + {unif, poisson, poisson2, zipf2}"), + make_option(c("--prefix", "-x"), default = "./", + help = "Path to prefix all default files") ) opts <- parse_args(OptionParser(option_list = option_list)) } +apply_prefix <- function(path) { + paste(opts$prefix, path, sep = "") +} + source("analysis/R/encode.R") source("analysis/R/decode.R") source("analysis/R/simulation.R") @@ -225,13 +231,14 @@ main <- function(opts) { if(is.null(opts$uvals)) { uvals = list(var1 = c("str1"), var2 = c("option1")) } else { - uvals <- GetUniqueValsFromFile(opts$uvals) + uvals <- GetUniqueValsFromFile(apply_prefix(opts$uvals)) } - params <- ReadParameterFile(opts$params) - SimulateReports(opts$num, uvals, params, opts$distr, # inuts - opts$extras, opts$true, # inputs - opts$var1_num, opts$var2_num, # inputs - opts$map, opts$reports) # outputs + params <- ReadParameterFile(apply_prefix(opts$params)) + SimulateReports(opts$num, uvals, params, opts$distr, # inuts + opts$extras, apply_prefix(opts$true), # inputs + opts$var1_num, opts$var2_num, # inputs + apply_prefix(opts$map), + apply_prefix(opts$reports)) # outputs print("PROC.TIME") print(proc.time() - ptm) diff --git a/tests/assoctest.html b/tests/assoctest.html index 7fc6aff0..38e5abac 100644 --- a/tests/assoctest.html +++ b/tests/assoctest.html @@ -25,9 +25,9 @@

RAPPOR assoctest.sh

Test Case + Input Params @@ -48,11 +48,10 @@

RAPPOR assoctest.sh

- d: distribution type
- u: total unique values
- u2: total unique values 2
- c: number of reports/clients
+
+ e: number of extras
+ u2: number of unique vals in var2
+ n: number of reports/clients
k: report bits
@@ -71,10 +70,9 @@

RAPPOR assoctest.sh

due u2cn k h
- + - + @@ -35,23 +35,21 @@

RAPPOR assoctest.sh

- - - - @@ -70,8 +70,6 @@

RAPPOR assoctest.sh

- - @@ -81,6 +79,8 @@

RAPPOR assoctest.sh

+ + diff --git a/tests/make_summary_assoc.py b/tests/make_summary_assoc.py index 2c959971..59a4f247 100755 --- a/tests/make_summary_assoc.py +++ b/tests/make_summary_assoc.py @@ -1,7 +1,7 @@ #!/usr/bin/python """Given a regtest result tree, prints an HTML summary on stdout. -See HTML skeleton in tests/regtest.html. +See HTML skeleton in tests/assoctest.html. """ import os @@ -18,8 +18,6 @@ - - @@ -31,6 +29,8 @@ + + @@ -133,7 +133,7 @@ def ParseSpecFile(spec_filename): with open(spec_filename) as s: spec_row = s.readline().split() - spec_in_html = ' '.join('' % cell for cell in spec_row[1:]) + spec_in_html = ' '.join('' % cell for cell in spec_row[3:]) return spec_in_html @@ -169,7 +169,7 @@ def ParseMetrics(metrics_file, log_file): m.readline() metrics_row = m.readline().split(',') - (td_chisq, ed_chisq, l1d, rtime) = metrics_row + (td_chisq, ed_chisq, l1d, rtime, d1, d2) = metrics_row td_chisq = float(td_chisq) ed_chisq = float(ed_chisq) @@ -180,16 +180,20 @@ def ParseMetrics(metrics_file, log_file): elapsed_time = ExtractTime(log_file) metrics_row_str = [ - str(td_chisq), - str(ed_chisq), - str(l1d), - str(rtime), + '%s' % d1, + '%s' % d2, + '%.3f' % td_chisq, + '%.3f' % ed_chisq, + '%.3f' % l1d, + str(rtime), ] metrics_row_dict = { - 'l1d': [l1d], - 'rtime': [rtime], - 'chisqdiff': [abs(td_chisq - ed_chisq)], + 'd1': [d1], + 'd2': [d2], + 'l1d': [l1d], + 'rtime': [rtime], + 'chisqdiff': [abs(td_chisq - ed_chisq)], } # return metrics formatted as HTML table entries diff --git a/tests/regtest_spec.py b/tests/regtest_spec.py index 47feb470..e4458b8e 100755 --- a/tests/regtest_spec.py +++ b/tests/regtest_spec.py @@ -46,6 +46,8 @@ # num unique values 2, num clients 'tiny': (100, 2, int(1e03)), # test for insufficient data 'small': (100, 10, int(1e04)), + 'fizz': (100, 20, int(1e05)), + 'fizzbool': (100, 2, int(1e05)), 'medium': (1000, 10, int(1e05)), 'medium2': (1000, 2, int(1e05)), 'large': (10000, 10, int(1e06)), @@ -93,9 +95,9 @@ # The test config runs a test suite that is the cross product of all the above # sets ASSOC_TEST_CONFIG = { - 'distr': ('small',),# 'medium'), + 'distr': ('fizz', 'fizzbool'),# 'medium'), 'blooms': ('8x16',), # '8x32', '16x32'), - 'privacy': ('eps_verysmall',), # 'eps_small'), + 'privacy': ('eps_small',), # 'eps_small'), } # From 19d7f9318a3cb7c812744798d755e894c650df88 Mon Sep 17 00:00:00 2001 From: Ananth Raghunathan Date: Thu, 4 Jun 2015 15:22:09 -0700 Subject: [PATCH 14/67] Adding a couple more specs to test. --- tests/regtest_spec.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/regtest_spec.py b/tests/regtest_spec.py index e4458b8e..0e7de91e 100755 --- a/tests/regtest_spec.py +++ b/tests/regtest_spec.py @@ -46,8 +46,12 @@ # num unique values 2, num clients 'tiny': (100, 2, int(1e03)), # test for insufficient data 'small': (100, 10, int(1e04)), + 'fizz-tiny': (100, 20, int(1e03)), + 'fizz-tiny-bool': (100, 2, int(1e03)), + 'fizz-small': (100, 20, int(1e04)), + 'fizz-small-bool': (100, 2, int(1e04)), 'fizz': (100, 20, int(1e05)), - 'fizzbool': (100, 2, int(1e05)), + 'fizz-bool': (100, 2, int(1e05)), 'medium': (1000, 10, int(1e05)), 'medium2': (1000, 2, int(1e05)), 'large': (10000, 10, int(1e06)), @@ -95,7 +99,8 @@ # The test config runs a test suite that is the cross product of all the above # sets ASSOC_TEST_CONFIG = { - 'distr': ('fizz', 'fizzbool'),# 'medium'), + 'distr': ('fizz-tiny', 'fizz-tiny-bool', + 'fizz-small', 'fizz-small-bool',),# 'medium'), 'blooms': ('8x16',), # '8x32', '16x32'), 'privacy': ('eps_small',), # 'eps_small'), } From d15179156b47bcfb53cba51147e433a86968f6e5 Mon Sep 17 00:00:00 2001 From: Ananth Raghunathan Date: Mon, 8 Jun 2015 17:02:20 -0700 Subject: [PATCH 15/67] Updating association to work with 3 variables. --- assoctest.sh | 3 ++- tests/analyze_assoc.R | 49 +++++++++++++++++++++++++++---------------- tests/assoc_sim.R | 26 ++++++++++++++++------- tests/regtest_spec.py | 12 +++++++---- 4 files changed, 60 insertions(+), 30 deletions(-) diff --git a/assoctest.sh b/assoctest.sh index 61eec301..947c33bd 100755 --- a/assoctest.sh +++ b/assoctest.sh @@ -97,7 +97,7 @@ _run-one-instance() { -t $instance_dir/truedist.csv \ -m $instance_dir/map \ -n $num_clients \ - --extras $num_unique_values \ + --var1_num $num_unique_values \ --var2_num $num_unique_values2 local out_dir=${instance_dir}_report @@ -111,6 +111,7 @@ _run-one-instance() { tests/analyze_assoc.R \ --map1 $instance_dir/map_1.csv \ --map2 $instance_dir/map_2.csv \ + --map3 $instance_dir/map_3.csv \ --reports $instance_dir/reports.csv \ --truefile $instance_dir/truedist.csv \ --outdir $out_dir \ diff --git a/tests/analyze_assoc.R b/tests/analyze_assoc.R index ade385c0..4e6af972 100755 --- a/tests/analyze_assoc.R +++ b/tests/analyze_assoc.R @@ -37,12 +37,14 @@ options(stringsAsFactors = FALSE) if(!interactive()) { option_list <- list( # Flags - make_option(c("--map1", "-m1"), default = "map_1.csv", + make_option(c("--map1"), default = "map_1.csv", help = "Hashed candidates for 1st variable"), - make_option(c("--map2", "-m2"), default = "map_2.csv", + make_option(c("--map2"), default = "map_2.csv", help = "Hashed candidates for 2nd variable"), + make_option(c("--map3"), default = "map_3.csv", + help = "Hashed candidates for 3rd variable"), make_option(c("--reports", "-r"), default = "reports.csv", - help = "File with raw reports as "), + help = "File with raw reports as "), make_option(c("--truefile", "-t"), default = "truedist.csv", help = "File with true distribution generated by assoc_sim.R"), make_option(c("--outdir", "-o"), default = ".", @@ -83,26 +85,27 @@ main <- function(opts) { ptm <- proc.time() params <- ReadParameterFile(opts$params) - opts_map <- list(opts$map1, opts$map2) + opts_map <- list(opts$map1, opts$map2, opts$map3) map <- lapply(opts_map, function(o) ProcessMap(ReadMapFile(o, params = params), params = params)) # Reports must be of the format # cohort no, rappor bitstring 1, rappor bitstring 2 reportsObj <- read.csv(opts$reports, - colClasses = c("integer", "character", "character"), + colClasses = c("integer", "character", + "character", "character"), header = FALSE) # Parsing reportsObj # ComputeDistributionEM allows for different sets of cohorts # for each variable. Here, both sets of cohorts are identical co <- as.list(reportsObj[1])[[1]] - cohorts <- list(co, co) - # Parse reports from reportObj cols 2 and 3 - reports <- lapply(1:2, function(x) as.list(reportsObj[x + 1])) + cohorts <- list(co, co, co) + # Parse reports from reportObj cols 2, 3, and 4 + reports <- lapply(1:3, function(x) as.list(reportsObj[x + 1])) # Split strings into bit arrays (as required by assoc analysis) - reports <- lapply(1:2, function(i) { + reports <- lapply(1:3, function(i) { # apply the following function to each of reports[[1]] and reports[[2]] lapply(reports[[i]][[1]], function(x) { # function splits strings and converts them to numeric values @@ -117,30 +120,40 @@ main <- function(opts) { estimate_var = FALSE, new_alg = opts$newalg) + td <- read.csv(file = opts$truefile) ed <- joint_dist$orig$fit + if(length(reports) == 3) { + ed <- as.data.frame(ed) + } + + # We can see if chi-squared tests show different results + # for estimated vs real distribution print("CHI-SQUARED") td_chisq <- chisq.test(td) ed_chisq <- chisq.test(ed) print(td_chisq) print(ed_chisq) - print(l1d(td, ed, "L1 DISTANCE")) - + l1d_metric <- l1d(td, ed, "") print("JOINT_DIST$FIT") print(signif(ed[order(rowSums(ed)),], 4)) + td_metric <- td_chisq[1][[1]][[1]] + ed_metric <- ed_chisq[1][[1]][[1]] + print("PROC.TIME") time_taken <- proc.time() - ptm print(time_taken) - + + metrics <- list(td_chisq = td_metric, + ed_chisq = ed_metric, + tv = l1d_metric/2, + time = time_taken[1], + dim1 = dim(ed)[[2]], + dim2 = dim(ed)[[1]]) + # Write metrics to metrics.csv # Report l1 distance / 2 to be consistent with histogram analysis - metrics <- list(td_chisq = td_chisq[1][[1]][[1]], - ed_chisq = ed_chisq[1][[1]][[1]], - tv = l1d(td, ed, "L1 DISTANCE")/2, - time = time_taken[1], - dim1 = dim(ed)[[1]], - dim2 = dim(ed)[[2]]) filename <- file.path(opts$outdir, 'metrics.csv') write.csv(metrics, file = filename, row.names = FALSE) } diff --git a/tests/assoc_sim.R b/tests/assoc_sim.R index e93918e4..a4e82c6d 100755 --- a/tests/assoc_sim.R +++ b/tests/assoc_sim.R @@ -53,7 +53,7 @@ if(!interactive()) { help = "Number of values for var2"), make_option(c("--extras", "-e"), default = 1e05, help = "How many spurious candidates does the 1st map have?"), - make_option(c("--distr", "-d"), default = "zipf2", + make_option(c("--distr", "-d"), default = "zipf3", help = "Type of distribution. Choose between {unif, poisson, poisson2, zipf2}"), make_option(c("--prefix", "-x"), default = "./", @@ -105,7 +105,7 @@ GetUniqueValsFromFile <- function(filename) { # truefile = name of the file with true distribution # var1_num = number of var1 candidates # var2_num = number of var2 candidates -# *** FOR ASSOCTEST TEST SUITE, USE ONLY ZIPF2 *** +# *** FOR ASSOCTEST TEST SUITE, USE ONLY ZIPF2 / ZIPF3 *** # mapfile = file to write maps into (with .csv suffixes) # reportsfile = file to write reports into (with .csv suffix) SimulateReports <- function(N, uvals, params, distr, extras, truefile, @@ -140,7 +140,7 @@ SimulateReports <- function(N, uvals, params, distr, extras, truefile, v2_samples <- rep(1, N) v2_samples[v1_samples %% 2 == 0] <- pr25[v1_samples %% 2 == 0] v2_samples[v1_samples %% 2 == 1] <- pr75[v1_samples %% 2 == 1] - } else if (distr == "zipf2") { + } else if (distr == "zipf2" || distr == "zipf3") { # Zipfian over var1_num strings partition <- RandomPartition(N, ComputePdf("zipf1.5", var1_num)) @@ -159,11 +159,18 @@ SimulateReports <- function(N, uvals, params, distr, extras, truefile, RandomPartition(N, ComputePdf("zipf1.5", var2_num)))) d2 <- (var2_num:1)[d1] v2_samples <- rep(1, N) + v3_samples <- rep(1, N) v2_samples[v1_samples %% 2 == 0] <- d1[v1_samples %% 2 == 0] v2_samples[v1_samples %% 2 == 1] <- d2[v1_samples %% 2 == 1] + if(distr == "zipf3") { + bool1 <- rbinom(N, 1, 0.25) + rep(1, N) + bool2 <- rbinom(N, 1, 0.75) + rep(1, N) + v3_samples[v1_samples %% 2 == 0] <- bool1[v1_samples %% 2 == 0] + v3_samples[v1_samples %% 2 == 1] <- bool2[v1_samples %% 2 == 1] + } } - tmp_samples <- list(v1_samples, v2_samples) + tmp_samples <- list(v1_samples, v2_samples, v3_samples) # Function to pad strings to uval_vec if sample_vec has # larger support than the number of strings in uval_vec @@ -186,12 +193,13 @@ SimulateReports <- function(N, uvals, params, distr, extras, truefile, # Pad and update uvals uvals <- lapply(1:2, function(i) PadStrings(tmp_samples[[i]], uvals[[i]])) + uvals[[3]] <- c("true", "false") # Replace integers in tmp_samples with actual sample strings - samples <- lapply(1:2, function(i) uvals[[i]][tmp_samples[[i]]]) + samples <- lapply(1:3, function(i) uvals[[i]][tmp_samples[[i]]]) print("TRUE DISTR") td <- table(samples)/sum(table(samples)) - td <- td[order(rowSums(td), decreasing = TRUE),] + td <- td[order(rowSums(td), decreasing = TRUE),,] print(td) write.table(td, file = truefile, sep = ",", col.names = TRUE, row.names = TRUE, quote = FALSE) @@ -209,17 +217,21 @@ SimulateReports <- function(N, uvals, params, distr, extras, truefile, sep = ",", col.names = FALSE, na = "", quote = FALSE) write.table(map[[2]]$map_pos, file = paste(mapfile, "_2.csv", sep = ""), sep = ",", col.names = FALSE, na = "", quote = FALSE) + write.table(map[[3]]$map_pos, file = paste(mapfile, "_3.csv", sep = ""), + sep = ",", col.names = FALSE, na = "", quote = FALSE) # Write reports into a csv file # Format: # cohort, bloom filter var1, bloom filter var2 - reports <- lapply(1:2, function(i) + reports <- lapply(1:3, function(i) EncodeAll(samples[[i]], cohorts, map[[i]]$map, params)) # Organize cohorts and reports into format write_matrix <- cbind(as.matrix(cohorts), as.matrix(lapply(reports[[1]], function(x) paste(x, collapse = ""))), as.matrix(lapply(reports[[2]], + function(x) paste(x, collapse = ""))), + as.matrix(lapply(reports[[3]], function(x) paste(x, collapse = "")))) write.table(write_matrix, file = reportsfile, quote = FALSE, row.names = FALSE, col.names = FALSE, sep = ",") diff --git a/tests/regtest_spec.py b/tests/regtest_spec.py index 0e7de91e..973913ce 100755 --- a/tests/regtest_spec.py +++ b/tests/regtest_spec.py @@ -51,6 +51,8 @@ 'fizz-small': (100, 20, int(1e04)), 'fizz-small-bool': (100, 2, int(1e04)), 'fizz': (100, 20, int(1e05)), + 'fizz-large': (100, 50, int(1e05)), + 'fizz-2large': (100, 50, int(5e05)), 'fizz-bool': (100, 2, int(1e05)), 'medium': (1000, 10, int(1e05)), 'medium2': (1000, 2, int(1e05)), @@ -74,6 +76,7 @@ 'eps_1_5': (0.225, 0.775, 0.0), # eps_1 = 5, no eps_inf 'eps_verysmall': (0.125, 0.875, 0.125), 'eps_small': (0.125, 0.875, 0.5), + 'uma_rappor_type': (0.50, 0.75, 0.5), } # For deriving candidates from true inputs. @@ -99,10 +102,11 @@ # The test config runs a test suite that is the cross product of all the above # sets ASSOC_TEST_CONFIG = { - 'distr': ('fizz-tiny', 'fizz-tiny-bool', - 'fizz-small', 'fizz-small-bool',),# 'medium'), - 'blooms': ('8x16',), # '8x32', '16x32'), - 'privacy': ('eps_small',), # 'eps_small'), + 'distr': ('fizz-tiny', + 'fizz-small', + 'fizz','fizz-large','fizz-2large'),# 'medium'), + 'blooms': ('8x32',), # '8x32', '16x32'), + 'privacy': ('eps_small','uma_rappor_type'), # 'eps_small'), } # From 0ed6ab69e4c5aae18270d172ae8145d25ecffcef Mon Sep 17 00:00:00 2001 From: Ananth Raghunathan Date: Thu, 11 Jun 2015 14:06:47 -0700 Subject: [PATCH 16/67] Can we replace EM with 2-way marginal computations? --- tests/analyze_assoc_expt.R | 211 +++++++++++++++++++++++++++++++ tests/assoc_sim.R | 8 +- tests/assoc_sim_expt.R | 250 +++++++++++++++++++++++++++++++++++++ tests/regtest_spec.py | 8 +- 4 files changed, 472 insertions(+), 5 deletions(-) create mode 100755 tests/analyze_assoc_expt.R create mode 100755 tests/assoc_sim_expt.R diff --git a/tests/analyze_assoc_expt.R b/tests/analyze_assoc_expt.R new file mode 100755 index 00000000..10c35341 --- /dev/null +++ b/tests/analyze_assoc_expt.R @@ -0,0 +1,211 @@ +#!/usr/bin/env Rscript +# +# Copyright 2015 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Reads map files, report files, and RAPPOR parameters to run +# an EM algorithm to estimate joint distribution over two or more variables +# +# Usage: +# $ ./analyze_assoc_expt.R --inp +# +# Input file: +# Outputs: + +library("jsonlite") +library("optparse") + +options(stringsAsFactors = FALSE) + +if(!interactive()) { + option_list <- list( + make_option(c("--inp"), default = "analyze_inp.json", + help = "JSON file with inputs for analyze_assoc_expt")) + opts <- parse_args(OptionParser(option_list = option_list)) +} + +source("analysis/R/encode.R") +source("analysis/R/decode.R") +source("analysis/R/simulation.R") +source("analysis/R/read_input.R") +source("analysis/R/association.R") + +# This function processes the maps loaded using ReadMapFile +# Association analysis requires a map object with a map +# field that has the map split into cohorts and an rmap field +# that has all the cohorts combined +# Arguments: +# map = map object with cohorts as sparse matrix in +# object map$map +# This is the expected object from ReadMapFile +# params = data field with parameters +# TODO(pseudorandom): move this functionality to ReadMapFile +ProcessMap <- function(map, params) { + map$rmap <- map$map + map$map <- lapply(1:params$m, function(i) + map$rmap[seq(from = ((i - 1) * params$k + 1), + length.out = params$k),]) + map +} + +# Function to combine reports +# Currently assume 2-way marginals +CombineReports <- function(reports1, reports2) { + two_bits <- list(c(0, 0, 0, 1), c(0, 0, 1, 0), c(0, 1, 0, 0), c(1, 0, 0, 0)) + OuterProd <- function(x, y) { + as.vector(outer(x, y, + function(z, t) z + 2 * t)) + } + creports <- mapply(OuterProd, reports1, reports2, + SIMPLIFY = FALSE) + # Collapse counts to bit vector according to two_bits + lapply(creports, + function(x) as.vector(sapply(x, function(z) two_bits[[z+1]]))) +} + +# Function to combine maps +# Using map1-major order for both candidates and bits of the report +# to be consistent with how CombineReports works +# Currently assume 2-way marginals +CombineMaps <- function(map1, map2) { + # Retrieve set indices and dimensions + rows1 <- which(map1, arr.ind = TRUE)[,1] + cols1 <- which(map1, arr.ind = TRUE)[,2] + length1 <- dim(map1)[[1]] + width1 <- dim(map1)[[2]] + rows2 <- which(map2, arr.ind = TRUE)[,1] + cols2 <- which(map2, arr.ind = TRUE)[,2] + length2 <- dim(map2)[[1]] + width2 <- dim(map2)[[2]] + + map1fn <- function(i, j) { + i1 <- seq(1, length2) + (i-1) * length2 + j1 <- seq(1, width2) + (j-1) * width2 + indices1 <- expand.grid(i1, j1) + } + map1indices <- do.call(rbind, + mapply(map1fn, rows1, cols1, SIMPLIFY = FALSE)) + map1_big <- sparseMatrix(map1indices[,"Var1"], + map1indices[,"var2"], + dims = c(length1 * length2, + width1 * width2)) + colnames(map1_big) <- outer(function(x, y) paste(x, y, sep = "x"), + colnames(map1), + colnames(map2)) +} + + +main <- function(opts) { + ptm <- proc.time() + inp <- fromJSON(opts$inp) + params <- ReadParameterFile(inp$params) + # ensure sufficient maps as required by number of vars + stopifnot(inp$numvars == length(inp$maps)) + opts_map <- inp$maps + map <- lapply(opts_map, function(o) + ProcessMap(ReadMapFile(o, params = params), + params = params)) + # Reports must be of the format + # cohort no, rappor bitstring 1, rappor bitstring 2, ... + reportsObj <- read.csv(inp$reports, + colClasses = c("integer", + rep("character", inp$numvars)), + header = FALSE) + + # Parsing reportsObj + # ComputeDistributionEM allows for different sets of cohorts + # for each variable. Here, both sets of cohorts are identical + co <- as.list(reportsObj[1])[[1]] + cohorts <- rep(list(co), inp$numvars) + # Parse reports from reportObj cols 2, 3, ... + reports <- lapply(1:inp$numvars, function(x) as.list(reportsObj[x + 1])) + + # Split strings into bit arrays (as required by assoc analysis) + reports <- lapply(1:inp$numvars, function(i) { + # apply the following function to each of reports[[1]] and reports[[2]] + lapply(reports[[i]][[1]], function(x) { + # function splits strings and converts them to numeric values + as.numeric(strsplit(x, split = "")[[1]]) + }) + }) + + creports <- CombineReports(reports[[1]], reports[[2]]) + params2 <- params + params2$k <- (params$k ** 2) * 4 + CombineMaps(map[[1]]$map[[1]], map[[2]]$map[[1]]) + cmap <- mapply(CombineMaps, map[[1]]$map, map[[2]]$map) + counts <- ComputeCounts(creports, cohorts[[1]], params2) + + + return + joint_dist <- ComputeDistributionEM(reports, cohorts, map, + ignore_other = TRUE, + quick = TRUE, + params, marginals = NULL, + estimate_var = FALSE, + new_alg = inp$newalg) + + + td <- read.csv(file = inp$truefile) + ed <- joint_dist$orig$fit + if(length(reports) == 3) { + ed <- as.data.frame(ed) + } + + # We can see if chi-squared tests show different results + # for estimated vs real distribution + print("CHI-SQUARED") + td_chisq <- chisq.test(td) + ed_chisq <- chisq.test(ed) + print(td_chisq) + print(ed_chisq) + print(l1d(td, ed, "L1 DISTANCE")) + l1d_metric <- l1d(td, ed, "") + print("JOINT_DIST$FIT") + print(signif(ed[order(rowSums(ed)),], 4)) + td_metric <- td_chisq[1][[1]][[1]] + ed_metric <- ed_chisq[1][[1]][[1]] + + print("PROC.TIME") + time_taken <- proc.time() - ptm + print(time_taken) + + metrics <- list(td_chisq = td_metric, + ed_chisq = ed_metric, + tv = l1d_metric/2, + time = time_taken[1], + dim1 = dim(ed)[[2]], + dim2 = dim(ed)[[1]]) + + # Write metrics to metrics.csv + # Report l1 distance / 2 to be consistent with histogram analysis + filename <- file.path(inp$outdir, 'metrics.csv') + write.csv(metrics, file = filename, row.names = FALSE) +} + +# L1 distance = 1 - sum(min(df1|x, df2|x)) where +# df1|x / df2|x projects the distribution to the intersection x of the +# supports of df1 and df2 +l1d <- function(df1, df2, statement = "L1 DISTANCE") { + rowsi <- intersect(rownames(df1), rownames(df2)) + colsi <- intersect(colnames(df1), colnames(df2)) + print(statement) + 1 - sum(mapply(min, + unlist(as.data.frame(df1)[rowsi, colsi], use.names = FALSE), + unlist(as.data.frame(df2)[rowsi, colsi], use.names = FALSE))) +} + +if(!interactive()) { + main(opts) +} diff --git a/tests/assoc_sim.R b/tests/assoc_sim.R index a4e82c6d..c1166bc1 100755 --- a/tests/assoc_sim.R +++ b/tests/assoc_sim.R @@ -170,7 +170,12 @@ SimulateReports <- function(N, uvals, params, distr, extras, truefile, } } - tmp_samples <- list(v1_samples, v2_samples, v3_samples) + if(distr == "zipf2") { + tmp_samples <- list(v1_samples, v2_samples) + } else if(distr == "zipf3") { + tmp_samples <- list(v1_samples, v2_samples, v3_samples) + } + # Function to pad strings to uval_vec if sample_vec has # larger support than the number of strings in uval_vec @@ -193,6 +198,7 @@ SimulateReports <- function(N, uvals, params, distr, extras, truefile, # Pad and update uvals uvals <- lapply(1:2, function(i) PadStrings(tmp_samples[[i]], uvals[[i]])) + uvals[[3]] <- c("true", "false") # Replace integers in tmp_samples with actual sample strings samples <- lapply(1:3, function(i) uvals[[i]][tmp_samples[[i]]]) diff --git a/tests/assoc_sim_expt.R b/tests/assoc_sim_expt.R new file mode 100755 index 00000000..59ce1356 --- /dev/null +++ b/tests/assoc_sim_expt.R @@ -0,0 +1,250 @@ +#!/usr/bin/env Rscript +# +# Copyright 2015 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Simulates inputs on which association analysis can be run. +# Currently assoc_sim.R only supports 2 variables but can +# be easily extended to support more. +# +# Usage: +# $ ./assoc_sim_expt.R --inp sim_inp.json +# Inputs: uvals, params, reports, map, num, unif +# see how options are parsed below for more information +# Outputs: +# reports.csv file containing reports +# map_{1, 2, ...}.csv file(s) containing maps of variables + +library("jsonlite") +library("optparse") + +options(stringsAsFactors = FALSE) + +if(!interactive()) { + option_list <- list( + make_option(c("--inp"), default = "assoc_inp.json", + help = "JSON file with inputs for assoc_sim_expt")) + opts <- parse_args(OptionParser(option_list = option_list)) + inp <- fromJSON(opts$inp) +} + +apply_prefix <- function(path) { + paste(inp$prefix, path, sep = "") +} + +source("analysis/R/encode.R") +source("analysis/R/decode.R") +source("analysis/R/simulation.R") +source("analysis/R/read_input.R") +source("analysis/R/association.R") +source("tests/gen_counts.R") + +# Read unique values of reports from a csv file +# Inputs: filename. The file is expected to contain two rows of strings +# (one for each variable): +# "google.com", "apple.com", ... +# "ssl", "nossl", ... +# Returns: a list containing strings +GetUniqueValsFromFile <- function(filename) { + contents <- read.csv(filename, header = FALSE) + # Expect 2 rows of unique vals + if(nrow(contents) != 2) { + stop(paste("Unique vals file", filename, "expected to have + two rows of strings.")) + } + # Removes superfluous "" entries if the lists of unique values + # differ in length + strip_empty <- function(vec) { + vec[!vec %in% c("")] + } + list(var1 = strip_empty(as.vector(t(contents[1,]))), + var2 = strip_empty(as.vector(t(contents[2,])))) +} + +# Simulate correlated reports and write into reportsfile +# Inputs: N = number of reports +# uvals = list containing a list of unique values +# params = list with RAPPOR parameters +# distr = the type of distribution to use +# {unif, poisson, poisson2, zipfg} +# extras = whether map_1.csv has spurious candidates or not +# truefile = name of the file with true distribution +# varcandidates = list of number of candidates for each var +# *** FOR ASSOCTEST TEST SUITE, USE ONLY ZIPF2 / ZIPF3 *** +# mapfile = file to write maps into (with .csv suffixes) +# reportsfile = file to write reports into (with .csv suffix) +SimulateReports <- function(N, uvals, params, distr, extras, truefile, + varcandidates, + mapfile, reportsfile) { + # Compute true distribution + m <- params$m + + if (distr == "unif") { + # Draw uniformly from 1 to 10 + v1_samples <- as.integer(runif(N, 1, 10)) + + # Pr[var2 = N + 1 | var1 = N] = 0.5 + # Pr[var2 = N | var1 = N] = 0.5 + v2_samples <- v1_samples + sample(c(0, 1), N, replace = TRUE) + + } else if(distr == "poisson") { + # Draw from a Poisson random variable + v1_samples <- rpois(N, 1) + rep(1, N) + + # Pr[var2 = N + 1 | var1 = N] = 0.5 + # Pr[var2 = N | var1 = N] = 0.5 + v2_samples <- v1_samples + sample(c(0, 1), N, replace = TRUE) + } else if (distr == "poisson2") { + + v1_samples <- rpois(N, 1) + rep(1, N) + # supp(var2) = {1, 2} + # Pr[var2 = 1 | var1 = even] = 0.75 + # Pr[var2 = 1 | var1 = odd] = 0.25 + pr25 <- rbinom(N, 1, 0.25) + 1 + pr75 <- rbinom(N, 1, 0.75) + 1 + v2_samples <- rep(1, N) + v2_samples[v1_samples %% 2 == 0] <- pr25[v1_samples %% 2 == 0] + v2_samples[v1_samples %% 2 == 1] <- pr75[v1_samples %% 2 == 1] + } else if (distr == "zipf2" || distr == "zipf3") { + + var1_num <- varcandidates[[1]] + var2_num <- varcandidates[[2]] + + # Zipfian over var1_num strings + partition <- RandomPartition(N, ComputePdf("zipf1.5", var1_num)) + v1_samples <- rep(1:var1_num, partition) # expand partition + # Shuffle values randomly (may take a few sec for > 10^8 inputs) + v1_samples <- sample(v1_samples) + + # supp(var2) = {1, 2, 3, ..., var2_num} + # We look at two zipfian distributions over supp(var2) + # D1 = zipfian distribution + # D2 = zipfian distr over {var2_num, ..., 4, 3, 2, 1} + # (i.e., D1 in reverse) + # var2 ~ D1 if var1 = even + # var2 ~ D2 if var1 = odd + d1 <- sample(rep(1:var2_num, + RandomPartition(N, ComputePdf("zipf1.5", var2_num)))) + d2 <- (var2_num:1)[d1] + v2_samples <- rep(1, N) + v3_samples <- rep(1, N) + v2_samples[v1_samples %% 2 == 0] <- d1[v1_samples %% 2 == 0] + v2_samples[v1_samples %% 2 == 1] <- d2[v1_samples %% 2 == 1] + if(distr == "zipf3") { + bool1 <- rbinom(N, 1, 0.25) + rep(1, N) + bool2 <- rbinom(N, 1, 0.75) + rep(1, N) + v3_samples[v1_samples %% 2 == 0] <- bool1[v1_samples %% 2 == 0] + v3_samples[v1_samples %% 2 == 1] <- bool2[v1_samples %% 2 == 1] + } + } + + if(length(varcandidates) == 3) { + tmp_samples <- list(v1_samples, v2_samples, v3_samples) + } else if (length(varcandidates) == 2) { + tmp_samples <- list(v1_samples, v2_samples) + } + + # Function to pad strings to uval_vec if sample_vec has + # larger support than the number of strings in uval_vec + # For e.g., if samples have support {1, 2, 3, 4, ...} and uvals + # only have "value1", "value2", and "value3", samples now + # over support {"value1", "value2", "value3", "str4", ...} + PadStrings <- function(sample_vec, uval_vec) { + if (max(sample_vec) > length(uval_vec)) { + # Padding uvals to required length + len <- length(uval_vec) + max_of_samples <- max(sample_vec) + uval_vec[(len + 1):max_of_samples] <- apply( + as.matrix((len + 1):max_of_samples), + 1, + function(i) sprintf("str%d", i)) + } + uval_vec + } + + # Pad and update uvals + uvals <- lapply(1:length(varcandidates), + function(i) PadStrings(tmp_samples[[i]], + uvals[[i]])) + # Replace integers in tmp_samples with actual sample strings + samples <- lapply(1:length(varcandidates), + function(i) uvals[[i]][tmp_samples[[i]]]) + + print("TRUE DISTR") + td <- table(samples)/sum(table(samples)) + if (length(varcandidates) == 2) { + td <- td[order(rowSums(td), decreasing = TRUE),] + } else { + td <- td[order(rowSums(td), decreasing = TRUE),,] + } + print(td) + write.table(td, file = truefile, sep = ",", col.names = TRUE, + row.names = TRUE, quote = FALSE) + # Randomly assign cohorts in each dimension + cohorts <- sample(1:m, N, replace = TRUE) + + # Create and write map into mapfile_1.csv and mapfile_2.csv + if (extras > 0) { + # spurious candidates for mapfile_1.csv + len <- length(uvals[[1]]) + as.numeric(extras) + uvals[[1]] <- PadStrings(len, uvals[[1]]) + } + map <- lapply(uvals, function(u) CreateMap(u, params)) + write.table(map[[1]]$map_pos, file = paste(mapfile, "_1.csv", sep = ""), + sep = ",", col.names = FALSE, na = "", quote = FALSE) + write.table(map[[2]]$map_pos, file = paste(mapfile, "_2.csv", sep = ""), + sep = ",", col.names = FALSE, na = "", quote = FALSE) + if(length(varcandidates) == 3) { + write.table(map[[3]]$map_pos, file = paste(mapfile, "_3.csv", sep = ""), + sep = ",", col.names = FALSE, na = "", quote = FALSE) + } + + # Write reports into a csv file + # Format: + # cohort, bloom filter var1, bloom filter var2 + reports <- lapply(1:length(varcandidates), function(i) + EncodeAll(samples[[i]], cohorts, map[[i]]$map, params)) + # Organize cohorts and reports into format + write_matrix <- cbind(as.matrix(cohorts), + sapply(reports, + function(x) as.matrix(lapply(x, + function(z) paste(z, collapse = ""))))) + write.table(write_matrix, file = reportsfile, quote = FALSE, + row.names = FALSE, col.names = FALSE, sep = ",") +} + +main <- function(inp) { + ptm <- proc.time() + + if(is.null(inp$uvals)) { + # One off case. + # TODO(pseudorandom): More sensible defaults. + uvals = list(var1 = c("str1", "str2"), var2 = c("option1", "option2", "option3")) + } else { + uvals <- GetUniqueValsFromFile(apply_prefix(inp$uvals)) + } + params <- ReadParameterFile(apply_prefix(inp$params)) + SimulateReports(inp$num, uvals, params, inp$distr, # inuts + inp$extras, apply_prefix(inp$true), # inputs + inp$varcandidates, # inputs + apply_prefix(inp$map), + apply_prefix(inp$reports)) # outputs + + print("PROC.TIME") + print(proc.time() - ptm) +} + +if(!interactive()) { + main(inp) +} diff --git a/tests/regtest_spec.py b/tests/regtest_spec.py index 973913ce..f21ba367 100755 --- a/tests/regtest_spec.py +++ b/tests/regtest_spec.py @@ -102,11 +102,11 @@ # The test config runs a test suite that is the cross product of all the above # sets ASSOC_TEST_CONFIG = { - 'distr': ('fizz-tiny', - 'fizz-small', - 'fizz','fizz-large','fizz-2large'),# 'medium'), + 'distr': (#'fizz-tiny', + #'fizz-small', + 'fizz',),#'fizz-large','fizz-2large'),# 'medium'), 'blooms': ('8x32',), # '8x32', '16x32'), - 'privacy': ('eps_small','uma_rappor_type'), # 'eps_small'), + 'privacy': ('eps_small',),#'uma_rappor_type'), # 'eps_small'), } # From d37dcf0eda4bdc4487577daef09b34b27f5cd18b Mon Sep 17 00:00:00 2001 From: Ananth Raghunathan Date: Thu, 11 Jun 2015 23:21:08 -0700 Subject: [PATCH 17/67] Combining maps. --- tests/analyze_assoc_expt.R | 54 +++++++++++++++++++++++++++++++++----- 1 file changed, 47 insertions(+), 7 deletions(-) diff --git a/tests/analyze_assoc_expt.R b/tests/analyze_assoc_expt.R index 10c35341..e06f2fa5 100755 --- a/tests/analyze_assoc_expt.R +++ b/tests/analyze_assoc_expt.R @@ -89,20 +89,60 @@ CombineMaps <- function(map1, map2) { length2 <- dim(map2)[[1]] width2 <- dim(map2)[[2]] + # Now process map1 map1fn <- function(i, j) { - i1 <- seq(1, length2) + (i-1) * length2 - j1 <- seq(1, width2) + (j-1) * width2 - indices1 <- expand.grid(i1, j1) + i1 <- seq(1, length2) + ((i-1) * length2) + j1 <- seq(1, width2) + ((j-1) * width2) + expand.grid(i1, j1) } map1indices <- do.call(rbind, mapply(map1fn, rows1, cols1, SIMPLIFY = FALSE)) map1_big <- sparseMatrix(map1indices[,"Var1"], - map1indices[,"var2"], + map1indices[,"Var2"], dims = c(length1 * length2, width1 * width2)) - colnames(map1_big) <- outer(function(x, y) paste(x, y, sep = "x"), - colnames(map1), - colnames(map2)) + colnames(map1_big) <- t(outer(colnames(map1), + colnames(map2), + function(x, y) paste(x, y, sep = "x"))) + + # Now process map2 + map2fn <- function(i, j) { + i2 <- i + (seq(0, length1 - 1) * length2) + j2 <- j + (seq(0, width1 - 1) * width2) + expand.grid(i2, j2) + } + map2indices <- do.call(rbind, + mapply(map2fn, rows2, cols2, SIMPLIFY = FALSE)) + map2_big <- sparseMatrix(map2indices[,"Var1"], + map2indices[,"Var2"], + dims = c(length1 * length2, + width1 * width2)) + colnames(map2_big) <- t(outer(colnames(map1), + colnames(map2), + function(x, y) paste(x, y, sep = "x"))) + + # Now collate two maps with entries in (1000, 0100, 0010, 0001) + # (m1&m2, !m1 & m2, m1 & !m2, !(m1 & m2)) respectively + findices <- which(map1_big & map2_big, arr.ind = TRUE) + # 1000 + findices[, 1] <- findices[, 1] * 4 - 3 + # 0100 + indices_0100 <- which((!map1_big) & map2_big, arr.ind = TRUE) + indices_0100[, 1] <- indices_0100[, 1] * 4 - 2 + findices <- rbind(findices, indices_0100) + # 0010 + indices_0010 <- which(map1_big & (!map2_big), arr.ind = TRUE) + indices_0010[, 1] <- indices_0010[, 1] * 4 - 1 + findices <- rbind(findices, indices_0010) + # 0001 + indices_0001 <- which(!(map1_big & map2_big), arr.ind = TRUE) + indices_0001[, 1] <- indices_0001[, 1] * 4 + findices <- rbind(findices, indices_0001) + sm <- sparseMatrix(findices[, 1], findices[, 2], + dims = c(4 * length1 * length2, + width1 * width2)) + colnames(sm) <- colnames(map1_big) + sm } From 1a983db150f591825a117457db422eadc00aae9a Mon Sep 17 00:00:00 2001 From: Ananth Raghunathan Date: Fri, 12 Jun 2015 18:16:16 -0700 Subject: [PATCH 18/67] More Decode code to support 2-way marginals. --- analysis/R/decode.R | 61 ++++++++++++++++++++++++++++++++++++++ tests/analyze_assoc_expt.R | 4 +-- 2 files changed, 63 insertions(+), 2 deletions(-) diff --git a/analysis/R/decode.R b/analysis/R/decode.R index 4fae9d86..e75385be 100644 --- a/analysis/R/decode.R +++ b/analysis/R/decode.R @@ -19,6 +19,67 @@ library(glmnet) source('analysis/R/alternative.R') +Estimate2WayBloomCounts <- function(params, obs_counts) { + p <- params$p + q <- params$q + f <- params$f + m <- params$m + + stopifnot(m == nrow(obs_counts), params$k + 1 == ncol(obs_counts)) + + p11 <- q * (1 - f/2) + p * f / 2 # probability of a true 1 reported as 1 + p01 <- p * (1 - f/2) + q * f / 2 # probability of a true 0 reported as 1 + p10 <- 1 - p11 # probability of a true 1 reported as 0 + p00 <- 1 - p01 # probability of a true 0 reported as 0 + p2 <- p11 - p01 # == (1 - f) * (q - p) + + ests <- apply(obs_counts, 1, function(x) { + N <- x[1] # sample size of cohort + inds <- seq(0, m/4 - 1) + v <- x[-1] # counts for individual bits + # 11 or (1000) estimates + v[inds*4 + 2] <- + (v[inds*4 + 2] - (p11**2)*N) / (2*p01*p11 + p01**2 - p11**2) + + # 10 or (0100) estimates + v[inds*4 + 3] <- + (v[inds*4 + 3] - (p11*p00)*N) / (p10*p11 + p01*p10 + p01*p00 - p11*p00) + + # 01 or (0010) estimates + v[inds*4 + 4] <- + (v[inds*4 + 4] - (p11*p00)*N) / (p10*p11 + p01*p10 + p01*p00 - p11*p00) + + # 00 or (0001) estimates + v[inds*4 + 5] <- + (v[inds*4 + 5] - (p11**2)*N) / (2*p10*p00 + p10**2 - p00**2) + v + }) + + if(FALSE) { + # TODO(pseudorandom): Compute variances + variances <- apply(obs_counts, 1, function(x) { + N <- x[1] + v <- x[-1] + p_hats <- (v - p01 * N) / (N * p2) # expectation of a true 1 + p_hats <- pmax(0, pmin(1, p_hats)) # clamp to [0,1] + r <- p_hats * p11 + (1 - p_hats) * p01 # expectation of a reported 1 + N * r * (1 - r) / p2^2 # variance of the binomial + }) + } + + # Transform counts from absolute values to fractional, removing bias due to + # variability of reporting between cohorts. + ests <- apply(ests, 1, function(x) x / obs_counts[,1]) + # stds <- apply(variances^.5, 1, function(x) x / obs_counts[,1]) + + # Some estimates may be set to infinity, e.g. if f=1. We want to + # account for this possibility, and set the corresponding counts + # to 0. + ests[abs(ests) == Inf] <- 0 + + list(estimates = ests, stds = ests) +} + EstimateBloomCounts <- function(params, obs_counts) { # Estimates the number of times each bit in each cohort was set in original # Bloom filters. diff --git a/tests/analyze_assoc_expt.R b/tests/analyze_assoc_expt.R index e06f2fa5..430e0b24 100755 --- a/tests/analyze_assoc_expt.R +++ b/tests/analyze_assoc_expt.R @@ -135,7 +135,7 @@ CombineMaps <- function(map1, map2) { indices_0010[, 1] <- indices_0010[, 1] * 4 - 1 findices <- rbind(findices, indices_0010) # 0001 - indices_0001 <- which(!(map1_big & map2_big), arr.ind = TRUE) + indices_0001 <- which((!map1_big) & (!map2_big), arr.ind = TRUE) indices_0001[, 1] <- indices_0001[, 1] * 4 findices <- rbind(findices, indices_0001) sm <- sparseMatrix(findices[, 1], findices[, 2], @@ -186,7 +186,7 @@ main <- function(opts) { CombineMaps(map[[1]]$map[[1]], map[[2]]$map[[1]]) cmap <- mapply(CombineMaps, map[[1]]$map, map[[2]]$map) counts <- ComputeCounts(creports, cohorts[[1]], params2) - + ests <- Estimate2WayBloomCounts(params2, counts) return joint_dist <- ComputeDistributionEM(reports, cohorts, map, From 669c500391869b72ecd083408f1ade18e5d7de48 Mon Sep 17 00:00:00 2001 From: Ananth Raghunathan Date: Mon, 15 Jun 2015 18:29:03 -0700 Subject: [PATCH 19/67] Replacing EM with two-way marginals. --- analysis/R/decode.R | 69 ++++++++++++++++-------- tests/analyze_assoc_expt.R | 105 +++++++++++++++++++++---------------- tests/assoc_sim_expt.R | 2 +- 3 files changed, 108 insertions(+), 68 deletions(-) diff --git a/analysis/R/decode.R b/analysis/R/decode.R index e75385be..07d3c815 100644 --- a/analysis/R/decode.R +++ b/analysis/R/decode.R @@ -24,6 +24,7 @@ Estimate2WayBloomCounts <- function(params, obs_counts) { q <- params$q f <- params$f m <- params$m + k <- params$k stopifnot(m == nrow(obs_counts), params$k + 1 == ncol(obs_counts)) @@ -31,28 +32,20 @@ Estimate2WayBloomCounts <- function(params, obs_counts) { p01 <- p * (1 - f/2) + q * f / 2 # probability of a true 0 reported as 1 p10 <- 1 - p11 # probability of a true 1 reported as 0 p00 <- 1 - p01 # probability of a true 0 reported as 0 - p2 <- p11 - p01 # == (1 - f) * (q - p) + + NoiseMatrix <- matrix(rep(0, 16), 4) + NoiseMatrix[1,] <- c(p11**2, p11*p10, p10*p11, p10**2) + NoiseMatrix[2,] <- c(p11*p01, p11*p00, p10*p01, p10*p00) + NoiseMatrix[3,] <- c(p01*p11, p01*p10, p00*p11, p00*p01) + NoiseMatrix[4,] <- c(p01**2, p00*p01, p01*p00, p00**2) ests <- apply(obs_counts, 1, function(x) { - N <- x[1] # sample size of cohort - inds <- seq(0, m/4 - 1) - v <- x[-1] # counts for individual bits - # 11 or (1000) estimates - v[inds*4 + 2] <- - (v[inds*4 + 2] - (p11**2)*N) / (2*p01*p11 + p01**2 - p11**2) - - # 10 or (0100) estimates - v[inds*4 + 3] <- - (v[inds*4 + 3] - (p11*p00)*N) / (p10*p11 + p01*p10 + p01*p00 - p11*p00) - - # 01 or (0010) estimates - v[inds*4 + 4] <- - (v[inds*4 + 4] - (p11*p00)*N) / (p10*p11 + p01*p10 + p01*p00 - p11*p00) - - # 00 or (0001) estimates - v[inds*4 + 5] <- - (v[inds*4 + 5] - (p11**2)*N) / (2*p10*p00 + p10**2 - p00**2) - v + N <- x[1] + inds <- seq(0, (k/4)-1) + v <- x[-1] + sapply(inds, function(i){ + as.vector(t(Solve(NoiseMatrix)) %*% v[(i*4 + 1):((i+1)*4)]) + }) }) if(FALSE) { @@ -76,8 +69,9 @@ Estimate2WayBloomCounts <- function(params, obs_counts) { # account for this possibility, and set the corresponding counts # to 0. ests[abs(ests) == Inf] <- 0 - - list(estimates = ests, stds = ests) + + list(estimates = ests, + stds = matrix(rep(1, 2 * length(ests[1,])), 2)) } EstimateBloomCounts <- function(params, obs_counts) { @@ -315,6 +309,37 @@ Resample <- function(e) { result } +Decode2Way <- function(counts, map, params) { + k <- params$k + p <- params$p + q <- params$q + f <- params$f + h <- params$h + m <- params$m + + S <- ncol(map) # total number of candidates + + N <- sum(counts[, 1]) + + filter_cohorts <- which(counts[, 1] != 0) # exclude cohorts with zero reports + + # stretch cohorts to bits + filter_bits <- as.vector( + t(matrix(1:nrow(map), nrow = m, byrow = TRUE)[filter_cohorts,])) + + es <- Estimate2WayBloomCounts(params, counts) + e <- list(estimates = es$estimates[filter_cohorts, , drop = FALSE], + stds = es$stds[filter_cohorts, , drop = FALSE]) + coefs <- FitDistribution(e, map[filter_bits, , drop = FALSE]) + mod <- list(coefs = coefs, stds = coefs) + inf <- PerformInference(map[filter_bits, , drop = FALSE], + as.vector(t(es$estimates)), + N, mod, params, alpha = (0.05/S), + correction = "Bonferroni") + fit <- inf$fit + list(fit = fit) +} + Decode <- function(counts, map, params, quick = FALSE, alpha = 0.05, correction = c("Bonferroni"), ...) { k <- params$k diff --git a/tests/analyze_assoc_expt.R b/tests/analyze_assoc_expt.R index 430e0b24..f50afb2a 100755 --- a/tests/analyze_assoc_expt.R +++ b/tests/analyze_assoc_expt.R @@ -62,12 +62,14 @@ ProcessMap <- function(map, params) { # Function to combine reports # Currently assume 2-way marginals CombineReports <- function(reports1, reports2) { - two_bits <- list(c(0, 0, 0, 1), c(0, 0, 1, 0), c(0, 1, 0, 0), c(1, 0, 0, 0)) + # Encoding (var1, var2) \in {(0, 0), (0, 1), (1, 0), (1, 1)} + two_bits <- list(c(0, 0, 0, 1), c(0, 1, 0, 0), c(0, 0, 1, 0), c(1, 0, 0, 0)) OuterProd <- function(x, y) { as.vector(outer(x, y, function(z, t) z + 2 * t)) } - creports <- mapply(OuterProd, reports1, reports2, + # "report1-major" order + creports <- mapply(OuterProd, reports2, reports1, SIMPLIFY = FALSE) # Collapse counts to bit vector according to two_bits lapply(creports, @@ -183,55 +185,68 @@ main <- function(opts) { creports <- CombineReports(reports[[1]], reports[[2]]) params2 <- params params2$k <- (params$k ** 2) * 4 - CombineMaps(map[[1]]$map[[1]], map[[2]]$map[[1]]) + # CombineMaps(map[[1]]$map[[1]], map[[2]]$map[[1]]) cmap <- mapply(CombineMaps, map[[1]]$map, map[[2]]$map) + # Combine cohorts into one map. Needed for Decode2Way + inds <- lapply(cmap, function(x) which(x, arr.ind = TRUE)) + inds[[2]][, 1] <- inds[[2]][, 1] + length(inds[[1]][, 1]) + inds <- rbind(inds[[1]], inds[[2]]) + crmap <- sparseMatrix(inds[, 1], inds[, 2], dims = c( + nrow(cmap[[1]]) + nrow(cmap[[2]]), + ncol(cmap[[1]]))) + colnames(crmap) <- colnames(cmap[[1]]) counts <- ComputeCounts(creports, cohorts[[1]], params2) - ests <- Estimate2WayBloomCounts(params2, counts) + marginal <- Decode2Way(counts, crmap, params2)$fit + print(marginal) - return - joint_dist <- ComputeDistributionEM(reports, cohorts, map, - ignore_other = TRUE, - quick = TRUE, - params, marginals = NULL, - estimate_var = FALSE, - new_alg = inp$newalg) - - - td <- read.csv(file = inp$truefile) - ed <- joint_dist$orig$fit - if(length(reports) == 3) { - ed <- as.data.frame(ed) + if (FALSE) { + joint_dist <- ComputeDistributionEM(reports, cohorts, map, + ignore_other = TRUE, + quick = TRUE, + params, marginals = NULL, + estimate_var = FALSE, + new_alg = inp$newalg) } - # We can see if chi-squared tests show different results - # for estimated vs real distribution - print("CHI-SQUARED") - td_chisq <- chisq.test(td) - ed_chisq <- chisq.test(ed) - print(td_chisq) - print(ed_chisq) - print(l1d(td, ed, "L1 DISTANCE")) - l1d_metric <- l1d(td, ed, "") - print("JOINT_DIST$FIT") - print(signif(ed[order(rowSums(ed)),], 4)) - td_metric <- td_chisq[1][[1]][[1]] - ed_metric <- ed_chisq[1][[1]][[1]] - - print("PROC.TIME") - time_taken <- proc.time() - ptm - print(time_taken) - - metrics <- list(td_chisq = td_metric, - ed_chisq = ed_metric, - tv = l1d_metric/2, - time = time_taken[1], - dim1 = dim(ed)[[2]], - dim2 = dim(ed)[[1]]) + td <- read.csv(file = inp$truefile) + print(td) - # Write metrics to metrics.csv - # Report l1 distance / 2 to be consistent with histogram analysis - filename <- file.path(inp$outdir, 'metrics.csv') - write.csv(metrics, file = filename, row.names = FALSE) + if(FALSE) { + ed <- joint_dist$orig$fit + if(length(reports) == 3) { + ed <- as.data.frame(ed) + } + + # We can see if chi-squared tests show different results + # for estimated vs real distribution + print("CHI-SQUARED") + td_chisq <- chisq.test(td) + ed_chisq <- chisq.test(ed) + print(td_chisq) + print(ed_chisq) + print(l1d(td, ed, "L1 DISTANCE")) + l1d_metric <- l1d(td, ed, "") + print("JOINT_DIST$FIT") + print(signif(ed[order(rowSums(ed)),], 4)) + td_metric <- td_chisq[1][[1]][[1]] + ed_metric <- ed_chisq[1][[1]][[1]] + + print("PROC.TIME") + time_taken <- proc.time() - ptm + print(time_taken) + + metrics <- list(td_chisq = td_metric, + ed_chisq = ed_metric, + tv = l1d_metric/2, + time = time_taken[1], + dim1 = dim(ed)[[2]], + dim2 = dim(ed)[[1]]) + + # Write metrics to metrics.csv + # Report l1 distance / 2 to be consistent with histogram analysis + filename <- file.path(inp$outdir, 'metrics.csv') + write.csv(metrics, file = filename, row.names = FALSE) + } } # L1 distance = 1 - sum(min(df1|x, df2|x)) where diff --git a/tests/assoc_sim_expt.R b/tests/assoc_sim_expt.R index 59ce1356..5d3438ef 100755 --- a/tests/assoc_sim_expt.R +++ b/tests/assoc_sim_expt.R @@ -230,7 +230,7 @@ main <- function(inp) { if(is.null(inp$uvals)) { # One off case. # TODO(pseudorandom): More sensible defaults. - uvals = list(var1 = c("str1", "str2"), var2 = c("option1", "option2", "option3")) + uvals = list(var1 = c("str1", "str2"), var2 = c("option1", "option2")) } else { uvals <- GetUniqueValsFromFile(apply_prefix(inp$uvals)) } From 598abc2d20c352c79cb54c51627c883b9e275618 Mon Sep 17 00:00:00 2001 From: Ananth Raghunathan Date: Tue, 16 Jun 2015 10:42:01 -0700 Subject: [PATCH 20/67] Working on 2-way marginal code. --- analysis/R/decode.R | 2 +- tests/analyze_assoc_expt.R | 80 +++++++++++++++++++------------------- 2 files changed, 42 insertions(+), 40 deletions(-) diff --git a/analysis/R/decode.R b/analysis/R/decode.R index 07d3c815..b68db0e4 100644 --- a/analysis/R/decode.R +++ b/analysis/R/decode.R @@ -71,7 +71,7 @@ Estimate2WayBloomCounts <- function(params, obs_counts) { ests[abs(ests) == Inf] <- 0 list(estimates = ests, - stds = matrix(rep(1, 2 * length(ests[1,])), 2)) + stds = matrix(rep(5, 2 * length(ests[1,])), 2)) } EstimateBloomCounts <- function(params, obs_counts) { diff --git a/tests/analyze_assoc_expt.R b/tests/analyze_assoc_expt.R index f50afb2a..1aa01f03 100755 --- a/tests/analyze_assoc_expt.R +++ b/tests/analyze_assoc_expt.R @@ -189,7 +189,7 @@ main <- function(opts) { cmap <- mapply(CombineMaps, map[[1]]$map, map[[2]]$map) # Combine cohorts into one map. Needed for Decode2Way inds <- lapply(cmap, function(x) which(x, arr.ind = TRUE)) - inds[[2]][, 1] <- inds[[2]][, 1] + length(inds[[1]][, 1]) + inds[[2]][, 1] <- inds[[2]][, 1] + dim(cmap[[1]])[1] inds <- rbind(inds[[1]], inds[[2]]) crmap <- sparseMatrix(inds[, 1], inds[, 2], dims = c( nrow(cmap[[1]]) + nrow(cmap[[2]]), @@ -197,56 +197,58 @@ main <- function(opts) { colnames(crmap) <- colnames(cmap[[1]]) counts <- ComputeCounts(creports, cohorts[[1]], params2) marginal <- Decode2Way(counts, crmap, params2)$fit - print(marginal) - if (FALSE) { + also_em = FALSE + ed_em <- list() + if(also_em == TRUE) { joint_dist <- ComputeDistributionEM(reports, cohorts, map, ignore_other = TRUE, quick = TRUE, params, marginals = NULL, estimate_var = FALSE, new_alg = inp$newalg) + ed_em <- joint_dist$orig$fit + if(length(reports) == 3) { + ed_em <- as.data.frame(ed_em) + } } td <- read.csv(file = inp$truefile) - print(td) - - if(FALSE) { - ed <- joint_dist$orig$fit - if(length(reports) == 3) { - ed <- as.data.frame(ed) + ed <- td + for (cols in colnames(td)) { + for (rows in rownames(td)) { + ed[rows, cols] <- marginal[paste(rows, cols, sep = "x"), "Estimate"] } - - # We can see if chi-squared tests show different results - # for estimated vs real distribution - print("CHI-SQUARED") - td_chisq <- chisq.test(td) - ed_chisq <- chisq.test(ed) - print(td_chisq) - print(ed_chisq) - print(l1d(td, ed, "L1 DISTANCE")) - l1d_metric <- l1d(td, ed, "") - print("JOINT_DIST$FIT") - print(signif(ed[order(rowSums(ed)),], 4)) - td_metric <- td_chisq[1][[1]][[1]] - ed_metric <- ed_chisq[1][[1]][[1]] - - print("PROC.TIME") - time_taken <- proc.time() - ptm - print(time_taken) - - metrics <- list(td_chisq = td_metric, - ed_chisq = ed_metric, - tv = l1d_metric/2, - time = time_taken[1], - dim1 = dim(ed)[[2]], - dim2 = dim(ed)[[1]]) - - # Write metrics to metrics.csv - # Report l1 distance / 2 to be consistent with histogram analysis - filename <- file.path(inp$outdir, 'metrics.csv') - write.csv(metrics, file = filename, row.names = FALSE) } + + print("PROC.TIME") + time_taken <- proc.time() - ptm + print(time_taken) + + print("2 WAY RESULTS") + print(signif(ed[order(rowSums(ed)), ], 4)) + print(l1d(td, ed, "L1 DISTANCE 2 WAY")) + metrics <- list( + td_chisq = chisq.test(td)[1][[1]][[1]], + ed_chisq = chisq.test(ed)[1][[1]][[1]], + tv = l1d(td, ed, "")/2, + time = time_taken[1], + dim1 = dim(ed)[[2]], + dim2 = dim(ed)[[1]] + ) + + if(also_em == TRUE) { + # Add EM metrics + metrics <- c(metrics, + list(ed_em_chisq = chisq.test(ed_em)[1][[1]][[1]], + tv_em = l1d(td, ed_em, "")/2)) + } + + # Write metrics to metrics.csv + # Report l1 distance / 2 to be consistent with histogram analysis + filename <- file.path(inp$outdir, 'metrics.csv') + write.csv(metrics, file = filename, row.names = FALSE) + } # L1 distance = 1 - sum(min(df1|x, df2|x)) where From e293e670acee1343c057189ee274a834f98ab714 Mon Sep 17 00:00:00 2001 From: Ananth Raghunathan Date: Tue, 16 Jun 2015 15:53:57 -0700 Subject: [PATCH 21/67] Fixing some bugs. --- analysis/R/decode.R | 6 +++-- assoctest.sh | 51 ++++++++++++++++++++++++++------------ tests/analyze_assoc_expt.R | 11 +++++--- tests/regtest_spec.py | 4 +-- 4 files changed, 49 insertions(+), 23 deletions(-) diff --git a/analysis/R/decode.R b/analysis/R/decode.R index b68db0e4..6e755423 100644 --- a/analysis/R/decode.R +++ b/analysis/R/decode.R @@ -71,7 +71,8 @@ Estimate2WayBloomCounts <- function(params, obs_counts) { ests[abs(ests) == Inf] <- 0 list(estimates = ests, - stds = matrix(rep(5, 2 * length(ests[1,])), 2)) + stds = matrix(rep(5, length(ests[,1]) * length(ests[1,])), + length(ests[,1]))) } EstimateBloomCounts <- function(params, obs_counts) { @@ -277,7 +278,8 @@ FitDistribution <- function(estimates_stds, map) { support_coefs <- 1:S - if (S > length(estimates_stds$estimates) * .8) { + if (TRUE) { + # if (S > length(estimates_stds$estimates) * .8) { # the system is close to being underdetermined lasso <- FitLasso(map, as.vector(t(estimates_stds$estimates))) diff --git a/assoctest.sh b/assoctest.sh index 947c33bd..01de8793 100755 --- a/assoctest.sh +++ b/assoctest.sh @@ -91,14 +91,24 @@ _run-one-instance() { banner "Running association input simulation" - tests/assoc_sim.R \ - -p $case_dir/case_params.csv \ - -r $instance_dir/reports.csv \ - -t $instance_dir/truedist.csv \ - -m $instance_dir/map \ - -n $num_clients \ - --var1_num $num_unique_values \ - --var2_num $num_unique_values2 + # Setting up JSON file containing assoc_sim inputs with python + python -c "import json; \ + f = file('$instance_dir/assoc_inp.json', 'w'); \ + inp = dict(); \ + inp['params'] = '$case_dir/case_params.csv'; \ + inp['reports'] = '$instance_dir/reports.csv'; \ + inp['true'] = '$instance_dir/truedist.csv'; \ + inp['map'] = '$instance_dir/map'; \ + inp['num'] = $num_clients; \ + inp['extras'] = 0; \ + inp['distr'] = 'zipf2'; \ + inp['prefix'] = './'; \ + inp['vars'] = 2; \ + inp['varcandidates'] = [$num_unique_values, $num_unique_values2]; \ + json.dump(inp, f); \ + f.close();" + + tests/assoc_sim_expt.R --inp $instance_dir/assoc_inp.json local out_dir=${instance_dir}_report mkdir --verbose -p $out_dir @@ -107,15 +117,24 @@ _run-one-instance() { # engine, which excludes R's loading time and reading of the (possibly # substantial) map file. Timing below is more inclusive. TIMEFORMAT='Running analyze.R took %R seconds' + + # Setting up JSON file with python + python -c "import json; \ + f = file('$instance_dir/analyze_inp.json', 'w'); \ + inp = dict(); \ + inp['maps'] = ['$instance_dir/map_1.csv',\ + '$instance_dir/map_2.csv']; \ + inp['reports'] = '$instance_dir/reports.csv'; \ + inp['truefile'] = '$instance_dir/truedist.csv'; \ + inp['outdir'] = '.'; \ + inp['params'] = '$case_dir/case_params.csv'; \ + inp['newalg'] = 'false'; \ + inp['numvars'] = 2; \ + json.dump(inp, f); \ + f.close();" + time { - tests/analyze_assoc.R \ - --map1 $instance_dir/map_1.csv \ - --map2 $instance_dir/map_2.csv \ - --map3 $instance_dir/map_3.csv \ - --reports $instance_dir/reports.csv \ - --truefile $instance_dir/truedist.csv \ - --outdir $out_dir \ - --params $case_dir/case_params.csv + tests/analyze_assoc_expt.R --inp $instance_dir/analyze_inp.json } } diff --git a/tests/analyze_assoc_expt.R b/tests/analyze_assoc_expt.R index 1aa01f03..936ca1da 100755 --- a/tests/analyze_assoc_expt.R +++ b/tests/analyze_assoc_expt.R @@ -189,10 +189,15 @@ main <- function(opts) { cmap <- mapply(CombineMaps, map[[1]]$map, map[[2]]$map) # Combine cohorts into one map. Needed for Decode2Way inds <- lapply(cmap, function(x) which(x, arr.ind = TRUE)) - inds[[2]][, 1] <- inds[[2]][, 1] + dim(cmap[[1]])[1] - inds <- rbind(inds[[1]], inds[[2]]) + for (i in seq(1, length(inds))) { + inds[[i]][, 1] <- inds[[i]][, 1] + (i-1) * dim(cmap[[1]])[1] + } + inds <- do.call("rbind", inds) + + # inds[[2]][, 1] <- inds[[2]][, 1] + dim(cmap[[1]])[1] + # inds <- rbind(inds[[1]], inds[[2]]) crmap <- sparseMatrix(inds[, 1], inds[, 2], dims = c( - nrow(cmap[[1]]) + nrow(cmap[[2]]), + nrow(cmap[[1]]) * length(cmap), ncol(cmap[[1]]))) colnames(crmap) <- colnames(cmap[[1]]) counts <- ComputeCounts(creports, cohorts[[1]], params2) diff --git a/tests/regtest_spec.py b/tests/regtest_spec.py index f21ba367..93101384 100755 --- a/tests/regtest_spec.py +++ b/tests/regtest_spec.py @@ -105,8 +105,8 @@ 'distr': (#'fizz-tiny', #'fizz-small', 'fizz',),#'fizz-large','fizz-2large'),# 'medium'), - 'blooms': ('8x32',), # '8x32', '16x32'), - 'privacy': ('eps_small',),#'uma_rappor_type'), # 'eps_small'), + 'blooms': ('8x16', '8x32'), # '8x32', '16x32'), + 'privacy': ('eps_small','uma_rappor_type'),#'uma_rappor_type'), # 'eps_small'), } # From 85495da94b7d035c45c865d72b19d5ead4890057 Mon Sep 17 00:00:00 2001 From: Ananth Raghunathan Date: Tue, 16 Jun 2015 16:38:26 -0700 Subject: [PATCH 22/67] Fixing a bug in assoctest.sh --- assoctest.sh | 2 +- tests/analyze_assoc_expt.R | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/assoctest.sh b/assoctest.sh index 01de8793..74fd7149 100755 --- a/assoctest.sh +++ b/assoctest.sh @@ -126,7 +126,7 @@ _run-one-instance() { '$instance_dir/map_2.csv']; \ inp['reports'] = '$instance_dir/reports.csv'; \ inp['truefile'] = '$instance_dir/truedist.csv'; \ - inp['outdir'] = '.'; \ + inp['outdir'] = '$out_dir'; \ inp['params'] = '$case_dir/case_params.csv'; \ inp['newalg'] = 'false'; \ inp['numvars'] = 2; \ diff --git a/tests/analyze_assoc_expt.R b/tests/analyze_assoc_expt.R index 936ca1da..7eede713 100755 --- a/tests/analyze_assoc_expt.R +++ b/tests/analyze_assoc_expt.R @@ -236,7 +236,7 @@ main <- function(opts) { metrics <- list( td_chisq = chisq.test(td)[1][[1]][[1]], ed_chisq = chisq.test(ed)[1][[1]][[1]], - tv = l1d(td, ed, "")/2, + tv = l1d(td, ed, ""), time = time_taken[1], dim1 = dim(ed)[[2]], dim2 = dim(ed)[[1]] From f0e82721ef8b03b28a7a75c57ecdb72308a14927 Mon Sep 17 00:00:00 2001 From: Ananth Raghunathan Date: Wed, 17 Jun 2015 20:54:38 -0700 Subject: [PATCH 23/67] More testing with 2-way marginals. --- analysis/R/decode.R | 11 +- tests/analyze_assoc_expt.R | 252 ++++++++++++++++++++++--------------- 2 files changed, 159 insertions(+), 104 deletions(-) diff --git a/analysis/R/decode.R b/analysis/R/decode.R index 6e755423..fc3c29f5 100644 --- a/analysis/R/decode.R +++ b/analysis/R/decode.R @@ -333,12 +333,11 @@ Decode2Way <- function(counts, map, params) { e <- list(estimates = es$estimates[filter_cohorts, , drop = FALSE], stds = es$stds[filter_cohorts, , drop = FALSE]) coefs <- FitDistribution(e, map[filter_bits, , drop = FALSE]) - mod <- list(coefs = coefs, stds = coefs) - inf <- PerformInference(map[filter_bits, , drop = FALSE], - as.vector(t(es$estimates)), - N, mod, params, alpha = (0.05/S), - correction = "Bonferroni") - fit <- inf$fit + fit <- data.frame(String = colnames(map[filter_bits, , drop = FALSE]), + Estimate = matrix(coefs, ncol = 1), + SD = matrix(coefs, ncol = 1), + stringsAsFactors = FALSE) + rownames(fit) <- fit[,"String"] list(fit = fit) } diff --git a/tests/analyze_assoc_expt.R b/tests/analyze_assoc_expt.R index 7eede713..4036411c 100755 --- a/tests/analyze_assoc_expt.R +++ b/tests/analyze_assoc_expt.R @@ -76,11 +76,32 @@ CombineReports <- function(reports1, reports2) { function(x) as.vector(sapply(x, function(z) two_bits[[z+1]]))) } + +# Given 2 lists of maps, maps1 and maps2, the function +# combines the maps by cohort and outputs both +# cohort-organized maps and flattened versions +CombineMaps <- function(maps1, maps2) { + # Combine maps + cmap <- mapply(CombineMapsInternal, maps1, maps2) + + # Flatten map + inds <- lapply(cmap, function(x) which(x, arr.ind = TRUE)) + for (i in seq(1, length(inds))) { + inds[[i]][, 1] <- inds[[i]][, 1] + (i-1) * dim(cmap[[1]])[1] + } + inds <- do.call("rbind", inds) + crmap <- sparseMatrix(inds[, 1], inds[, 2], dims = c( + nrow(cmap[[1]]) * length(cmap), + ncol(cmap[[1]]))) + colnames(crmap) <- colnames(cmap[[1]]) + list(cmap = cmap, crmap = crmap) +} + # Function to combine maps # Using map1-major order for both candidates and bits of the report # to be consistent with how CombineReports works # Currently assume 2-way marginals -CombineMaps <- function(map1, map2) { +CombineMapsInternal <- function(map1, map2) { # Retrieve set indices and dimensions rows1 <- which(map1, arr.ind = TRUE)[,1] cols1 <- which(map1, arr.ind = TRUE)[,2] @@ -150,110 +171,145 @@ CombineMaps <- function(map1, map2) { main <- function(opts) { ptm <- proc.time() + direct_simulation = TRUE inp <- fromJSON(opts$inp) params <- ReadParameterFile(inp$params) - # ensure sufficient maps as required by number of vars - stopifnot(inp$numvars == length(inp$maps)) - opts_map <- inp$maps - map <- lapply(opts_map, function(o) - ProcessMap(ReadMapFile(o, params = params), - params = params)) - # Reports must be of the format - # cohort no, rappor bitstring 1, rappor bitstring 2, ... - reportsObj <- read.csv(inp$reports, - colClasses = c("integer", - rep("character", inp$numvars)), - header = FALSE) - - # Parsing reportsObj - # ComputeDistributionEM allows for different sets of cohorts - # for each variable. Here, both sets of cohorts are identical - co <- as.list(reportsObj[1])[[1]] - cohorts <- rep(list(co), inp$numvars) - # Parse reports from reportObj cols 2, 3, ... - reports <- lapply(1:inp$numvars, function(x) as.list(reportsObj[x + 1])) - - # Split strings into bit arrays (as required by assoc analysis) - reports <- lapply(1:inp$numvars, function(i) { - # apply the following function to each of reports[[1]] and reports[[2]] - lapply(reports[[i]][[1]], function(x) { - # function splits strings and converts them to numeric values - as.numeric(strsplit(x, split = "")[[1]]) - }) - }) - - creports <- CombineReports(reports[[1]], reports[[2]]) - params2 <- params - params2$k <- (params$k ** 2) * 4 - # CombineMaps(map[[1]]$map[[1]], map[[2]]$map[[1]]) - cmap <- mapply(CombineMaps, map[[1]]$map, map[[2]]$map) - # Combine cohorts into one map. Needed for Decode2Way - inds <- lapply(cmap, function(x) which(x, arr.ind = TRUE)) - for (i in seq(1, length(inds))) { - inds[[i]][, 1] <- inds[[i]][, 1] + (i-1) * dim(cmap[[1]])[1] - } - inds <- do.call("rbind", inds) - - # inds[[2]][, 1] <- inds[[2]][, 1] + dim(cmap[[1]])[1] - # inds <- rbind(inds[[1]], inds[[2]]) - crmap <- sparseMatrix(inds[, 1], inds[, 2], dims = c( - nrow(cmap[[1]]) * length(cmap), - ncol(cmap[[1]]))) - colnames(crmap) <- colnames(cmap[[1]]) - counts <- ComputeCounts(creports, cohorts[[1]], params2) - marginal <- Decode2Way(counts, crmap, params2)$fit - - also_em = FALSE - ed_em <- list() - if(also_em == TRUE) { - joint_dist <- ComputeDistributionEM(reports, cohorts, map, - ignore_other = TRUE, - quick = TRUE, - params, marginals = NULL, - estimate_var = FALSE, - new_alg = inp$newalg) - ed_em <- joint_dist$orig$fit - if(length(reports) == 3) { - ed_em <- as.data.frame(ed_em) + if(direct_simulation == TRUE) { + # TWO WAY ASSOCIATIONS; INPUTS SIMULATED DIRECTLY + strconstant <- c("string", "option") + + # Construct unique vals for each variable using strconstant + stopifnot(length(strconstant) == inp$numvars) + uvals <- lapply(1:inp$numvars, + function(i) { + apply(as.matrix(1:inp$varcandidates[[i]]), + 1, + function(z) sprintf("%s%d", strconstant[[i]], z)) + }) + + # Add extras if any + if(inp$extras > 0) { + uvals[[1]] <- c(uvals[[1]], apply(as.matrix(1:inp$extras), 1, + function(z) sprintf("%s%d", strconstant[[1]], z + inp$varcandidates[[1]]))) } - } - - td <- read.csv(file = inp$truefile) - ed <- td - for (cols in colnames(td)) { - for (rows in rownames(td)) { - ed[rows, cols] <- marginal[paste(rows, cols, sep = "x"), "Estimate"] + + map <- lapply(uvals, function(u) CreateMap(u, params)) + trim <- function(map) { + lapply(map, function(z) z[,1:inp$varcandidates[[1]]]) } - } + # Trim maps to real # of candidates + # Use extras only for decoding + tmap <- trim(map[[1]]$map) + crmap_trimmed <- CombineMaps(tmap, map[[2]]$map)$crmap + + cohorts <- as.matrix( + apply(as.data.frame(partition), 1, + function(count) RandomPartition(count, rep(1, params$m)))) + + } else { + # ensure sufficient maps as required by number of vars + stopifnot(inp$numvars == length(inp$maps)) + opts_map <- inp$maps + map <- lapply(opts_map, function(o) + ProcessMap(ReadMapFile(o, params = params), + params = params)) + # Reports must be of the format + # cohort no, rappor bitstring 1, rappor bitstring 2, ... + reportsObj <- read.csv(inp$reports, + colClasses = c("integer", + rep("character", inp$numvars)), + header = FALSE) - print("PROC.TIME") - time_taken <- proc.time() - ptm - print(time_taken) + # Parsing reportsObj + # ComputeDistributionEM allows for different sets of cohorts + # for each variable. Here, both sets of cohorts are identical + co <- as.list(reportsObj[1])[[1]] + cohorts <- rep(list(co), inp$numvars) + # Parse reports from reportObj cols 2, 3, ... + reports <- lapply(1:inp$numvars, function(x) as.list(reportsObj[x + 1])) - print("2 WAY RESULTS") - print(signif(ed[order(rowSums(ed)), ], 4)) - print(l1d(td, ed, "L1 DISTANCE 2 WAY")) - metrics <- list( - td_chisq = chisq.test(td)[1][[1]][[1]], - ed_chisq = chisq.test(ed)[1][[1]][[1]], - tv = l1d(td, ed, ""), - time = time_taken[1], - dim1 = dim(ed)[[2]], - dim2 = dim(ed)[[1]] - ) - - if(also_em == TRUE) { - # Add EM metrics - metrics <- c(metrics, - list(ed_em_chisq = chisq.test(ed_em)[1][[1]][[1]], - tv_em = l1d(td, ed_em, "")/2)) - } + # Split strings into bit arrays (as required by assoc analysis) + reports <- lapply(1:inp$numvars, function(i) { + # apply the following function to each of reports[[1]] and reports[[2]] + lapply(reports[[i]][[1]], function(x) { + # function splits strings and converts them to numeric values + as.numeric(strsplit(x, split = "")[[1]]) + }) + }) - # Write metrics to metrics.csv - # Report l1 distance / 2 to be consistent with histogram analysis - filename <- file.path(inp$outdir, 'metrics.csv') - write.csv(metrics, file = filename, row.names = FALSE) - + creports <- CombineReports(reports[[1]], reports[[2]]) + params2 <- params + params2$k <- (params$k ** 2) * 4 + # CombineMaps(map[[1]]$map[[1]], map[[2]]$map[[1]]) + cmap <- mapply(CombineMaps, map[[1]]$map, map[[2]]$map) + # Combine cohorts into one map. Needed for Decode2Way + inds <- lapply(cmap, function(x) which(x, arr.ind = TRUE)) + for (i in seq(1, length(inds))) { + inds[[i]][, 1] <- inds[[i]][, 1] + (i-1) * dim(cmap[[1]])[1] + } + inds <- do.call("rbind", inds) + + # inds[[2]][, 1] <- inds[[2]][, 1] + dim(cmap[[1]])[1] + # inds <- rbind(inds[[1]], inds[[2]]) + crmap <- sparseMatrix(inds[, 1], inds[, 2], dims = c( + nrow(cmap[[1]]) * length(cmap), + ncol(cmap[[1]]))) + td <- read.csv(file = inp$truefile) + colnames(crmap) <- colnames(cmap[[1]]) + counts <- ComputeCounts(creports, cohorts[[1]], params2) + marginal <- Decode2Way(counts, crmap, params2)$fit + + also_em = FALSE + ed_em <- list() + if(also_em == TRUE) { + joint_dist <- ComputeDistributionEM(reports, cohorts, map, + ignore_other = TRUE, + quick = TRUE, + params, marginals = NULL, + estimate_var = FALSE, + new_alg = inp$newalg) + ed_em <- joint_dist$orig$fit + if(length(reports) == 3) { + ed_em <- as.data.frame(ed_em) + } + } + + ed <- td + for (cols in colnames(td)) { + for (rows in rownames(td)) { + ed[rows, cols] <- marginal[paste(rows, cols, sep = "x"), "Estimate"] + } + } + + time_taken <- proc.time() - ptm + + print("2 WAY RESULTS") + print(signif(ed[order(rowSums(ed)), ], 4)) + print(l1d(td, ed, "L1 DISTANCE 2 WAY")) + print("PROC.TIME") + print(time_taken) + + metrics <- list( + td_chisq = chisq.test(td)[1][[1]][[1]], + ed_chisq = chisq.test(ed)[1][[1]][[1]], + tv = l1d(td, ed, ""), + time = time_taken[1], + dim1 = dim(ed)[[2]], + dim2 = dim(ed)[[1]] + ) + + if(also_em == TRUE) { + # Add EM metrics + metrics <- c(metrics, + list(ed_em_chisq = chisq.test(ed_em)[1][[1]][[1]], + tv_em = l1d(td, ed_em, "")/2)) + } + + # Write metrics to metrics.csv + # Report l1 distance / 2 to be consistent with histogram analysis + filename <- file.path(inp$outdir, 'metrics.csv') + write.csv(metrics, file = filename, row.names = FALSE) + } } # L1 distance = 1 - sum(min(df1|x, df2|x)) where From 26722821e1bee660abd8ed6a85e4237f1cc4ea17 Mon Sep 17 00:00:00 2001 From: Ananth Raghunathan Date: Thu, 18 Jun 2015 12:59:21 -0700 Subject: [PATCH 24/67] Edits. --- analysis/R/decode.R | 2 +- tests/analyze_assoc_expt.R | 71 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 69 insertions(+), 4 deletions(-) diff --git a/analysis/R/decode.R b/analysis/R/decode.R index fc3c29f5..c84a23dd 100644 --- a/analysis/R/decode.R +++ b/analysis/R/decode.R @@ -71,7 +71,7 @@ Estimate2WayBloomCounts <- function(params, obs_counts) { ests[abs(ests) == Inf] <- 0 list(estimates = ests, - stds = matrix(rep(5, length(ests[,1]) * length(ests[1,])), + stds = matrix(rep(100, length(ests[,1]) * length(ests[1,])), length(ests[,1]))) } diff --git a/tests/analyze_assoc_expt.R b/tests/analyze_assoc_expt.R index 4036411c..c82257f7 100755 --- a/tests/analyze_assoc_expt.R +++ b/tests/analyze_assoc_expt.R @@ -40,6 +40,7 @@ source("analysis/R/decode.R") source("analysis/R/simulation.R") source("analysis/R/read_input.R") source("analysis/R/association.R") +source("tests/gen_counts.R") # This function processes the maps loaded using ReadMapFile # Association analysis requires a map object with a map @@ -177,6 +178,9 @@ main <- function(opts) { if(direct_simulation == TRUE) { # TWO WAY ASSOCIATIONS; INPUTS SIMULATED DIRECTLY strconstant <- c("string", "option") + N <- inp$num + n1 <- inp$varcandidates[[1]] + n2 <- inp$varcandidates[[2]] # Construct unique vals for each variable using strconstant stopifnot(length(strconstant) == inp$numvars) @@ -190,22 +194,83 @@ main <- function(opts) { # Add extras if any if(inp$extras > 0) { uvals[[1]] <- c(uvals[[1]], apply(as.matrix(1:inp$extras), 1, - function(z) sprintf("%s%d", strconstant[[1]], z + inp$varcandidates[[1]]))) + function(z) sprintf("%s%d", strconstant[[1]], z + n1))) } map <- lapply(uvals, function(u) CreateMap(u, params)) trim <- function(map) { - lapply(map, function(z) z[,1:inp$varcandidates[[1]]]) + lapply(map, function(z) z[,1:n1]) } # Trim maps to real # of candidates # Use extras only for decoding tmap <- trim(map[[1]]$map) crmap_trimmed <- CombineMaps(tmap, map[[2]]$map)$crmap + # Sample values to compute partition + # Zipfian over n1 strings + v1_part <- RandomPartition(N, ComputePdf("zipf1.5", n1)) + # Zipfian over n2 strings for each of variable 1 + # Distr. are correlated as in assoc_sim.R + final_part <- as.vector(sapply(1:n1, + function(i) { + v2_part <- RandomPartition(v1_part[[i]], + ComputePdf("zipf1.5", n2)) + if (i %% 2 == 0) {v2_part} else {rev(v2_part)} + })) + + td <- matrix(final_part/sum(final_part), nrow = n1, ncol = n2, byrow = TRUE) + rownames(td) <- uvals[[1]][1:n1] # Don't take into account extras + colnames(td) <- uvals[[2]] + print(signif(td, 4)) cohorts <- as.matrix( - apply(as.data.frame(partition), 1, + apply(as.data.frame(final_part), 1, function(count) RandomPartition(count, rep(1, params$m)))) + expanded <- apply(cohorts, 2, function(vec) rep(vec, each = ((params$k)**2)*4)) + true_ones <- apply(expanded * crmap_trimmed, 1, sum) + + p <- params$p + q <- params$q + f <- params$f + m <- params$m + k <- params$k + p11 <- q * (1 - f/2) + p * f / 2 # probability of a true 1 reported as 1 + p01 <- p * (1 - f/2) + q * f / 2 # probability of a true 0 reported as 1 + p10 <- 1 - p11 # probability of a true 1 reported as 0 + p00 <- 1 - p01 # probability of a true 0 reported as 0 + + NoiseMatrix <- matrix(rep(0, 16), 4) + NoiseMatrix[1,] <- c(p11**2, p11*p10, p10*p11, p10**2) + NoiseMatrix[2,] <- c(p11*p01, p11*p00, p10*p01, p10*p00) + NoiseMatrix[3,] <- c(p01*p11, p01*p10, p00*p11, p00*p01) + NoiseMatrix[4,] <- c(p01**2, p00*p01, p01*p00, p00**2) + + after_noise <- as.vector(sapply(1:(length(true_ones)/4), + function(x) + t(NoiseMatrix) %*% true_ones[((x-1)*4+1):(x*4)])) + counts <- cbind(apply(cohorts, 1, sum), + matrix(after_noise, + nrow = m, + ncol = 4 * (k**2), + byrow = TRUE)) + params2 <- params + params2$k <- (params$k ** 2) * 4 + crmap <- CombineMaps(map[[1]]$map, map[[2]]$map)$crmap + marginal <- Decode2Way(counts, crmap, params2)$fit + ed <- td + for (cols in colnames(td)) { + for (rows in rownames(td)) { + ed[rows, cols] <- marginal[paste(rows, cols, sep = "x"), "Estimate"] + } + } + + time_taken <- proc.time() - ptm + + print("2 WAY RESULTS") + print(signif(ed, 4)) + print(l1d(td, ed, "L1 DISTANCE 2 WAY")) + print("PROC.TIME") + print(time_taken) } else { # ensure sufficient maps as required by number of vars stopifnot(inp$numvars == length(inp$maps)) From b43aa8724026a5e493a817ae3c6e7729f035ca10 Mon Sep 17 00:00:00 2001 From: Ananth Raghunathan Date: Fri, 19 Jun 2015 13:54:19 -0700 Subject: [PATCH 25/67] Simulating noise directly. --- tests/analyze_assoc_expt.R | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/tests/analyze_assoc_expt.R b/tests/analyze_assoc_expt.R index c82257f7..b0bfc82f 100755 --- a/tests/analyze_assoc_expt.R +++ b/tests/analyze_assoc_expt.R @@ -244,7 +244,13 @@ main <- function(opts) { NoiseMatrix[2,] <- c(p11*p01, p11*p00, p10*p01, p10*p00) NoiseMatrix[3,] <- c(p01*p11, p01*p10, p00*p11, p00*p01) NoiseMatrix[4,] <- c(p01**2, p00*p01, p01*p00, p00**2) - + + NoiseMatrix2 <- matrix(rep(0, 16), 4) + NoiseMatrix2[1,] <- c(1, 0, 0, 0) + NoiseMatrix2[2,] <- c(0, 1, 0, 0) + NoiseMatrix2[3,] <- c(0, 0, 1, 0) + NoiseMatrix2[4,] <- c(0, 0, 0, 1) + after_noise <- as.vector(sapply(1:(length(true_ones)/4), function(x) t(NoiseMatrix) %*% true_ones[((x-1)*4+1):(x*4)])) @@ -253,6 +259,7 @@ main <- function(opts) { nrow = m, ncol = 4 * (k**2), byrow = TRUE)) + params2 <- params params2$k <- (params$k ** 2) * 4 crmap <- CombineMaps(map[[1]]$map, map[[2]]$map)$crmap @@ -271,6 +278,17 @@ main <- function(opts) { print(l1d(td, ed, "L1 DISTANCE 2 WAY")) print("PROC.TIME") print(time_taken) + + metrics <- list( + td_chisq = chisq.test(td)[1][[1]][[1]], + ed_chisq = chisq.test(ed)[1][[1]][[1]], + tv = l1d(td, ed, ""), + time = time_taken[1], + dim1 = dim(ed)[[2]], + dim2 = dim(ed)[[1]] + ) + filename <- file.path(inp$outdir, 'metrics.csv') + write.csv(metrics, file = filename, row.names = FALSE) } else { # ensure sufficient maps as required by number of vars stopifnot(inp$numvars == length(inp$maps)) From ca9953eaf164b6928f4ec222ca967cc43bbfa6f7 Mon Sep 17 00:00:00 2001 From: Ananth Raghunathan Date: Fri, 19 Jun 2015 16:59:59 -0700 Subject: [PATCH 26/67] Test suite updated to consider 2-way marginals. --- assoctest.sh | 21 ++++--- tests/analyze_assoc_expt.R | 111 ++++++++++++++++++++++++------------ tests/assoctest.html | 8 ++- tests/make_summary_assoc.py | 1 + tests/regtest_spec.py | 50 +++++++++++----- 5 files changed, 131 insertions(+), 60 deletions(-) diff --git a/assoctest.sh b/assoctest.sh index 74fd7149..6516653e 100755 --- a/assoctest.sh +++ b/assoctest.sh @@ -52,14 +52,15 @@ _setup-one-case() { local num_unique_values=$2 local num_unique_values2=$3 local num_clients=$4 + local num_extras=$5 # RAPPOR params - local num_bits=$5 - local num_hashes=$6 - local num_cohorts=$7 - local p=$8 - local q=$9 # need curly braces to get the 10th arg - local f=${10} + local num_bits=$6 + local num_hashes=$7 + local num_cohorts=$8 + local p=$9 + local q=${10} # need curly braces to get the 10th arg + local f=${11} banner 'Setting up parameters and candidate files for '$test_case @@ -84,7 +85,8 @@ _run-one-instance() { local case_dir=$ASSOCTEST_DIR/$test_case read -r case_name num_unique_values num_unique_values2 \ - num_clients num_bits num_hashes num_cohorts p q f < $case_dir/spec.txt + num_clients num_extras \ + num_bits num_hashes num_cohorts p q f < $case_dir/spec.txt local instance_dir=$ASSOCTEST_DIR/$test_case/$test_instance mkdir --verbose -p $instance_dir @@ -108,7 +110,7 @@ _run-one-instance() { json.dump(inp, f); \ f.close();" - tests/assoc_sim_expt.R --inp $instance_dir/assoc_inp.json + # tests/assoc_sim_expt.R --inp $instance_dir/assoc_inp.json local out_dir=${instance_dir}_report mkdir --verbose -p $out_dir @@ -130,6 +132,9 @@ _run-one-instance() { inp['params'] = '$case_dir/case_params.csv'; \ inp['newalg'] = 'false'; \ inp['numvars'] = 2; \ + inp['num'] = $num_clients; \ + inp['extras'] = $num_extras; \ + inp['varcandidates'] = [$num_unique_values, $num_unique_values2]; \ json.dump(inp, f); \ f.close();" diff --git a/tests/analyze_assoc_expt.R b/tests/analyze_assoc_expt.R index b0bfc82f..cbd2e7ff 100755 --- a/tests/analyze_assoc_expt.R +++ b/tests/analyze_assoc_expt.R @@ -169,14 +169,37 @@ CombineMapsInternal <- function(map1, map2) { sm } +GenerateNoiseMatrix <- function(params) { + p <- params$p + q <- params$q + f <- params$f + m <- params$m + k <- params$k + + p11 <- q * (1 - f/2) + p * f / 2 # probability of a true 1 reported as 1 + p01 <- p * (1 - f/2) + q * f / 2 # probability of a true 0 reported as 1 + p10 <- 1 - p11 # probability of a true 1 reported as 0 + p00 <- 1 - p01 # probability of a true 0 reported as 0 + + NoiseMatrix <- matrix(rep(0, 16), 4) + NoiseMatrix[1,] <- c(p11**2, p11*p10, p10*p11, p10**2) + NoiseMatrix[2,] <- c(p11*p01, p11*p00, p10*p01, p10*p00) + NoiseMatrix[3,] <- c(p01*p11, p01*p10, p00*p11, p00*p01) + NoiseMatrix[4,] <- c(p01**2, p00*p01, p01*p00, p00**2) + + NoiseMatrix +} + main <- function(opts) { ptm <- proc.time() direct_simulation = TRUE inp <- fromJSON(opts$inp) params <- ReadParameterFile(inp$params) + if(direct_simulation == TRUE) { # TWO WAY ASSOCIATIONS; INPUTS SIMULATED DIRECTLY + strconstant <- c("string", "option") N <- inp$num n1 <- inp$varcandidates[[1]] @@ -197,13 +220,12 @@ main <- function(opts) { function(z) sprintf("%s%d", strconstant[[1]], z + n1))) } + # Compute map map <- lapply(uvals, function(u) CreateMap(u, params)) - trim <- function(map) { - lapply(map, function(z) z[,1:n1]) - } + # Trim maps to real # of candidates # Use extras only for decoding - tmap <- trim(map[[1]]$map) + tmap <- lapply(map[[1]]$map, function(i) i[, 1:n1]) crmap_trimmed <- CombineMaps(tmap, map[[2]]$map)$crmap # Sample values to compute partition @@ -219,8 +241,23 @@ main <- function(opts) { })) td <- matrix(final_part/sum(final_part), nrow = n1, ncol = n2, byrow = TRUE) + v2_part <- RandomPartition(N, apply(td, 2, sum)) + ow_parts <- list(v1_part, v2_part) + ow_parts[[1]] <- c(ow_parts[[1]], rep(0, inp$extra)) + + # -------------- + # Generate 1-way counts + ow_counts <- lapply(1:2, function(i) + GenerateCounts(params, map[[i]]$rmap, ow_parts[[i]], 1)) + found_strings <- lapply(1:2, function(i) + Decode(ow_counts[[i]], + map[[i]]$rmap, + params, quick = TRUE)$fit$strings) + # -------------- + rownames(td) <- uvals[[1]][1:n1] # Don't take into account extras colnames(td) <- uvals[[2]] + print("TRUE DISTRIBUTION") print(signif(td, 4)) cohorts <- as.matrix( apply(as.data.frame(final_part), 1, @@ -228,49 +265,39 @@ main <- function(opts) { expanded <- apply(cohorts, 2, function(vec) rep(vec, each = ((params$k)**2)*4)) true_ones <- apply(expanded * crmap_trimmed, 1, sum) - p <- params$p - q <- params$q - f <- params$f - m <- params$m - k <- params$k - - p11 <- q * (1 - f/2) + p * f / 2 # probability of a true 1 reported as 1 - p01 <- p * (1 - f/2) + q * f / 2 # probability of a true 0 reported as 1 - p10 <- 1 - p11 # probability of a true 1 reported as 0 - p00 <- 1 - p01 # probability of a true 0 reported as 0 - NoiseMatrix <- matrix(rep(0, 16), 4) - NoiseMatrix[1,] <- c(p11**2, p11*p10, p10*p11, p10**2) - NoiseMatrix[2,] <- c(p11*p01, p11*p00, p10*p01, p10*p00) - NoiseMatrix[3,] <- c(p01*p11, p01*p10, p00*p11, p00*p01) - NoiseMatrix[4,] <- c(p01**2, p00*p01, p01*p00, p00**2) - NoiseMatrix2 <- matrix(rep(0, 16), 4) - NoiseMatrix2[1,] <- c(1, 0, 0, 0) - NoiseMatrix2[2,] <- c(0, 1, 0, 0) - NoiseMatrix2[3,] <- c(0, 0, 1, 0) - NoiseMatrix2[4,] <- c(0, 0, 0, 1) + NoiseMatrix <- GenerateNoiseMatrix(params) after_noise <- as.vector(sapply(1:(length(true_ones)/4), function(x) t(NoiseMatrix) %*% true_ones[((x-1)*4+1):(x*4)])) counts <- cbind(apply(cohorts, 1, sum), matrix(after_noise, - nrow = m, - ncol = 4 * (k**2), + nrow = params$m, + ncol = 4 * (params$k**2), byrow = TRUE)) params2 <- params params2$k <- (params$k ** 2) * 4 - crmap <- CombineMaps(map[[1]]$map, map[[2]]$map)$crmap + + # Combine maps to feed into Decode2Way + # Prune first to found_strings + pruned <- lapply(1:2, function(i) + lapply(map[[i]]$map, function(z) z[,found_strings[[i]]])) + crmap <- CombineMaps(pruned[[1]], pruned[[2]])$crmap marginal <- Decode2Way(counts, crmap, params2)$fit - ed <- td + + # Fill in estimated results with rows and cols from td + ed <- matrix(0, nrow = (n1+inp$extra), ncol = n2) + rownames(ed) <- uvals[[1]] + colnames(ed) <- uvals[[2]] for (cols in colnames(td)) { for (rows in rownames(td)) { ed[rows, cols] <- marginal[paste(rows, cols, sep = "x"), "Estimate"] } } - + ed[is.na(ed)] <- 0 time_taken <- proc.time() - ptm print("2 WAY RESULTS") @@ -278,14 +305,22 @@ main <- function(opts) { print(l1d(td, ed, "L1 DISTANCE 2 WAY")) print("PROC.TIME") print(time_taken) + chisq_td <- chisq.test(td)[1][[1]][[1]] + chisq_ed <- chisq.test(ed)[1][[1]][[1]] + if(is.nan(chisq_ed)) { + chisq_ed <- 0 + } + if(is.nan(chisq_td)) { + chisq_td <- 0 + } metrics <- list( - td_chisq = chisq.test(td)[1][[1]][[1]], - ed_chisq = chisq.test(ed)[1][[1]][[1]], + td_chisq = chisq_td, + ed_chisq = chisq_ed, tv = l1d(td, ed, ""), time = time_taken[1], - dim1 = dim(ed)[[2]], - dim2 = dim(ed)[[1]] + dim1 = length(found_strings[[1]]), + dim2 = length(found_strings[[2]]) ) filename <- file.path(inp$outdir, 'metrics.csv') write.csv(metrics, file = filename, row.names = FALSE) @@ -371,14 +406,18 @@ main <- function(opts) { print(l1d(td, ed, "L1 DISTANCE 2 WAY")) print("PROC.TIME") print(time_taken) + chisq_ed <- chisq.test(ed)[1][[1]][[1]] + if(is.nan(chisq_ed)) { + chisq_ed <- 0 + } metrics <- list( td_chisq = chisq.test(td)[1][[1]][[1]], - ed_chisq = chisq.test(ed)[1][[1]][[1]], + ed_chisq = chisq_ed, tv = l1d(td, ed, ""), time = time_taken[1], - dim1 = dim(ed)[[2]], - dim2 = dim(ed)[[1]] + dim1 = length(found_strings[[1]]), + dim2 = length(found_strings[[2]]) ) if(also_em == TRUE) { diff --git a/tests/assoctest.html b/tests/assoctest.html index 38e5abac..0c839c86 100644 --- a/tests/assoctest.html +++ b/tests/assoctest.html @@ -25,7 +25,7 @@

RAPPOR assoctest.sh

Test Case + Input Params RAPPOR Params + Result Metrics
- e: number of extras
- u2: number of unique vals in var2
- n: number of reports/clients
+
+ n: num reports
k: report bits
@@ -59,9 +57,11 @@

RAPPOR assoctest.sh

m: cohorts
p, q, f: probabilities
- td_chisq: chisq test on true distr.
- ed_chisq: chisq test on est. distr.
+
+ d1: dimension of var1 solutions.
+ d2: dimension of var2 solutions.
+ td_chisq: chisq test on true distr.
+ ed_chisq: chisq test on est. distr.
tv: tot. var. distance
rtime: R runtime
eu2 n k q fd1d2 td_chisq ed_chisq tv %(mean_chisqdiff)s %(mean_l1d)s %(mean_rtime)s%s%s
- + @@ -35,7 +35,7 @@

RAPPOR assoctest.sh

- - + diff --git a/tests/make_summary_assoc.py b/tests/make_summary_assoc.py index 59a4f247..67843b4d 100755 --- a/tests/make_summary_assoc.py +++ b/tests/make_summary_assoc.py @@ -18,6 +18,7 @@ + diff --git a/tests/regtest_spec.py b/tests/regtest_spec.py index 93101384..cf8c6059 100755 --- a/tests/regtest_spec.py +++ b/tests/regtest_spec.py @@ -46,19 +46,41 @@ # num unique values 2, num clients 'tiny': (100, 2, int(1e03)), # test for insufficient data 'small': (100, 10, int(1e04)), - 'fizz-tiny': (100, 20, int(1e03)), - 'fizz-tiny-bool': (100, 2, int(1e03)), - 'fizz-small': (100, 20, int(1e04)), - 'fizz-small-bool': (100, 2, int(1e04)), - 'fizz': (100, 20, int(1e05)), - 'fizz-large': (100, 50, int(1e05)), - 'fizz-2large': (100, 50, int(5e05)), - 'fizz-bool': (100, 2, int(1e05)), +# 'fizz-tiny': (100, 20, int(1e03)), +# 'fizz-tiny-bool': (100, 2, int(1e03)), +# 'fizz-small': (100, 20, int(1e04)), +# 'fizz-small-bool': (100, 2, int(1e04)), +# 'fizz': (100, 20, int(1e05)), +# 'fizz-large': (100, 50, int(1e05)), +# 'fizz-2large': (100, 50, int(5e05)), +# 'fizz-bool': (100, 2, int(1e05)), 'medium': (1000, 10, int(1e05)), 'medium2': (1000, 2, int(1e05)), 'large': (10000, 10, int(1e06)), 'large2': (10000, 2, int(1e06)), 'largesquared': (int(1e04), 100, int(1e06)), + + # new test names for 2-way marginals + # includes testing for extras + 'fizz-tiny': (100, 20, int(1e03), int(1e04)), + 'fizz-tiny-bool': (100, 2, int(1e03), int(1e04)), + 'fizz-small': (100, 20, int(1e04), int(1e04)), + 'fizz-small-bool': (100, 2, int(1e04), int(1e04)), + 'fizz': (100, 20, int(1e05), int(1e04)), + 'fizz-bool': (100, 2, int(1e05), int(1e04)), + + 'compact-noextra-small': (40, 5, 1e04, 0), + 'loose-noextra-small': (100, 20, 1e04, 0), + 'compact-noextra-large': (40, 5, 1e06, 0), + 'loose-noextra-large': (100, 20, 1e06, 0), + 'compact-extra-small': (40, 5, int(1e04), int(1e04)), + 'loose-extra-small': (100, 20, int(1e04), int(1e04)), + 'compact-extra-large': (40, 5, int(1e06), int(1e04)), + 'loose-extra-large': (100, 20, int(1e06), int(1e04)), + 'compact-excess-small': (40, 5, int(1e04), int(1e05)), + 'loose-excess-small': (100, 20, int(1e04), int(1e05)), + 'compact-excess-large': (40, 5, int(1e06), int(1e05)), + 'loose-excess-large': (100, 20, int(1e06), int(1e05)), } # 'k, h, m' as in params file. @@ -76,6 +98,7 @@ 'eps_1_5': (0.225, 0.775, 0.0), # eps_1 = 5, no eps_inf 'eps_verysmall': (0.125, 0.875, 0.125), 'eps_small': (0.125, 0.875, 0.5), + 'eps_chrome': (0.25, 0.75, 0.5), 'uma_rappor_type': (0.50, 0.75, 0.5), } @@ -102,11 +125,12 @@ # The test config runs a test suite that is the cross product of all the above # sets ASSOC_TEST_CONFIG = { - 'distr': (#'fizz-tiny', - #'fizz-small', - 'fizz',),#'fizz-large','fizz-2large'),# 'medium'), - 'blooms': ('8x16', '8x32'), # '8x32', '16x32'), - 'privacy': ('eps_small','uma_rappor_type'),#'uma_rappor_type'), # 'eps_small'), +# 'distr': ('fizz-tiny', 'fizz-tiny-bool', +# 'fizz-small', 'fizz-small-bool', +# 'fizz', 'fizz-bool'), + 'distr': ('fizz-small',), + 'blooms': ('8x16',), # '8x32', '16x32'), + 'privacy': ('eps_small',) } # From f33b285de1671295fb8016acbd0370179705b544 Mon Sep 17 00:00:00 2001 From: Ananth Raghunathan Date: Fri, 19 Jun 2015 17:01:28 -0700 Subject: [PATCH 27/67] Small updates to test cases. --- tests/regtest_spec.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/tests/regtest_spec.py b/tests/regtest_spec.py index cf8c6059..ef3c3ea5 100755 --- a/tests/regtest_spec.py +++ b/tests/regtest_spec.py @@ -125,12 +125,11 @@ # The test config runs a test suite that is the cross product of all the above # sets ASSOC_TEST_CONFIG = { -# 'distr': ('fizz-tiny', 'fizz-tiny-bool', -# 'fizz-small', 'fizz-small-bool', -# 'fizz', 'fizz-bool'), - 'distr': ('fizz-small',), - 'blooms': ('8x16',), # '8x32', '16x32'), - 'privacy': ('eps_small',) + 'distr': ('fizz-tiny', 'fizz-tiny-bool', + 'fizz-small', 'fizz-small-bool', + 'fizz', 'fizz-bool'), + 'blooms': ('8x16','8x32',)# '16x32'), + 'privacy': ('eps_small','eps_chrome') } # From b3cd75983b78e6414270388714c0fb874f0fbc71 Mon Sep 17 00:00:00 2001 From: Ananth Raghunathan Date: Fri, 19 Jun 2015 17:02:13 -0700 Subject: [PATCH 28/67] Making tests run sequentially. --- assoctest.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/assoctest.sh b/assoctest.sh index 6516653e..1221a59f 100755 --- a/assoctest.sh +++ b/assoctest.sh @@ -279,7 +279,8 @@ run-all() { log "Running all tests. Can take a while." # a- for assoc tests - _run-tests '^a-' $instances T T + # F for sequential + _run-tests '^a-' $instances F T } "$@" From c1c48ccc0bd9294a37a406b26438ce058c2fff85 Mon Sep 17 00:00:00 2001 From: Ananth Raghunathan Date: Wed, 24 Jun 2015 14:40:32 -0700 Subject: [PATCH 29/67] gen_assoc_reports.R to produce assoc. reports. --- assoctest.sh | 22 ++++++++++ tests/gen_assoc_reports.R | 84 +++++++++++++++++++++++++++++++++++++++ tests/regtest_spec.py | 16 +++++--- 3 files changed, 117 insertions(+), 5 deletions(-) create mode 100755 tests/gen_assoc_reports.R diff --git a/assoctest.sh b/assoctest.sh index 1221a59f..1144da1b 100755 --- a/assoctest.sh +++ b/assoctest.sh @@ -43,6 +43,13 @@ readonly ASSOCTEST_DIR=_tmp/assoctest # All the Python tools need this export PYTHONPATH=$CLIENT_DIR +# Print true inputs into a file with selected prefix +print-true-inputs() { + local num_unique_values=$1 + local prefix=$2 + seq 1 $num_unique_values | awk '{print "'$prefix'" $1}' +} + # Generate a single test case, specified by a line of the test spec. # This is a helper function for _run_tests(). _setup-one-case() { @@ -74,6 +81,21 @@ _setup-one-case() { echo 'k,h,m,p,q,f' > $params_path echo "$num_bits,$num_hashes,$num_cohorts,$p,$q,$f" >> $params_path + + print-true-inputs $[num_unique_values+num_extras] \ + "str" > $case_dir/case_true_inputs1.txt + print-true-inputs $num_unique_values2 "opt" > $case_dir/case_true_inputs2.txt + + # Hash candidates + analysis/tools/hash_candidates.py \ + $params_path \ + < $case_dir/case_true_inputs1.txt \ + > $case_dir/case_map1.csv + + analysis/tools/hash_candidates.py \ + $params_path \ + < $case_dir/case_true_inputs2.txt \ + > $case_dir/case_map2.csv } # Run a single test instance, specified by . diff --git a/tests/gen_assoc_reports.R b/tests/gen_assoc_reports.R new file mode 100755 index 00000000..41eb045a --- /dev/null +++ b/tests/gen_assoc_reports.R @@ -0,0 +1,84 @@ +#!/usr/bin/env Rscript +# +# Copyright 2015 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +source('tests/gen_counts.R') + +# Usage: +# +# $ ./gen_assoc_reports.R 100 20 10000 foo.csv +# +# Inputs: +# size of the distribution's support for var 1 +# size of the distribution's support for var 2 +# number of clients +# name of the output file +# Output: +# csv file with reports sampled according to the specified distribution. + +main <- function(argv) { + n <- list(as.integer(argv[[1]]), as.integer(argv[[2]])) + N <- as.integer(argv[[3]]) + out_file <- argv[[4]] + + # Sample values to compute partition + # Resulting distribution is a correlated zipf x zipf + # distribution over 2 variables + PartitionWithCorrelation <- function(size, support, index) { + part <- RandomPartition(size, ComputePdf("zipf1.5", support)) + if (index %% 2 == 0) {part} else {rev(part)} + } + + # Zipfian over n[[1]] strings + part <- RandomPartition(N, ComputePdf("zipf1.5", n[[1]])) + # Zipfian over n[[2]] strings for each of variable 1 + final_part <- as.vector(sapply(1:n[[1]], + function(i) PartitionWithCorrelation(part[[i]], n[[2]], i))) + + final_part <- matrix(final_part, nrow = n[[1]], byrow = TRUE) + rownames(final_part) <- sapply(1:n[[1]], function(x) paste("str", x, sep = "")) + colnames(final_part) <- sapply(1:n[[2]], function(x) paste("opt", x, sep = "")) + distr <- final_part/sum(final_part) + print("DISTRIBUTION") + print(distr) + + print('PARTITION') + print(final_part) + + # Expand partition + values <- list( + rep(1:n[[1]], rowSums(final_part)), + unlist(sapply(1:n[[1]], function(x) rep(1:n[[2]], final_part[x, ])))) + + stopifnot((length(values[[1]]) == N) & + (length(values[[2]]) == N)) + + # Shuffle values randomly (may take a few sec for > 10^8 inputs) + perm <- sample(N) + values <- list(values[[1]][perm], values[[2]][perm]) + + # Obtain reports by prefixing values with "v"s. Even slower than shuffling. + reports <- list(sprintf("str%d", values[[1]]), + sprintf("opt%d", values[[2]])) + + reports <- cbind(1:N, reports[[1]], reports[[2]]) # paste together "1 v342" + + write.table(reports, file = out_file, row.names = FALSE, col.names = FALSE, + sep = ",", quote = FALSE) +} + +if (length(sys.frames()) == 0) { + main(commandArgs(TRUE)) +} diff --git a/tests/regtest_spec.py b/tests/regtest_spec.py index ef3c3ea5..31f1d1ac 100755 --- a/tests/regtest_spec.py +++ b/tests/regtest_spec.py @@ -124,12 +124,18 @@ # privacy params set) # The test config runs a test suite that is the cross product of all the above # sets +#ASSOC_TEST_CONFIG = { +# 'distr': ('fizz-tiny', 'fizz-tiny-bool', +# 'fizz-small', 'fizz-small-bool', +# 'fizz', 'fizz-bool'), +# 'blooms': ('8x16','8x32',),# '16x32'), +# 'privacy': ('eps_small','eps_chrome') +#} + ASSOC_TEST_CONFIG = { - 'distr': ('fizz-tiny', 'fizz-tiny-bool', - 'fizz-small', 'fizz-small-bool', - 'fizz', 'fizz-bool'), - 'blooms': ('8x16','8x32',)# '16x32'), - 'privacy': ('eps_small','eps_chrome') + 'distr': ('fizz-small',), + 'blooms': ('8x32',), + 'privacy': ('eps_small',) } # From dd3dd839c640dd8738f8bc57e46ff3f5a45b0e1c Mon Sep 17 00:00:00 2001 From: Ananth Raghunathan Date: Wed, 24 Jun 2015 22:30:36 -0700 Subject: [PATCH 30/67] Changes for running experiments with 2 way marginals. --- assoctest.sh | 27 +- client/python/rappor.py | 6 +- tests/analyze_assoc_expt.R | 592 +++++++++++++++++++++++-------------- tests/regtest_spec.py | 8 +- 4 files changed, 402 insertions(+), 231 deletions(-) diff --git a/assoctest.sh b/assoctest.sh index 1144da1b..047d09fc 100755 --- a/assoctest.sh +++ b/assoctest.sh @@ -113,7 +113,28 @@ _run-one-instance() { local instance_dir=$ASSOCTEST_DIR/$test_case/$test_instance mkdir --verbose -p $instance_dir - banner "Running association input simulation" + banner "Generating input" + + tests/gen_assoc_reports.R $num_unique_values $num_unique_values2 \ + $num_clients $instance_dir/case.csv + + banner "Running RAPPOR client" + tests/rappor_assoc_sim.py \ + --num-bits $num_bits \ + --num-hashes $num_hashes \ + --num-cohorts $num_cohorts \ + -p $p \ + -q $q \ + -f $f \ + -i $instance_dir/case.csv \ + --out-prefix "$instance_dir/case" + + analysis/tools/sum_bits_assoc.py \ + $case_dir/case_params.csv \ + "$instance_dir/case" \ + < $instance_dir/case_out.csv + + return # Setting up JSON file containing assoc_sim inputs with python python -c "import json; \ @@ -146,8 +167,8 @@ _run-one-instance() { python -c "import json; \ f = file('$instance_dir/analyze_inp.json', 'w'); \ inp = dict(); \ - inp['maps'] = ['$instance_dir/map_1.csv',\ - '$instance_dir/map_2.csv']; \ + inp['maps'] = ['$case_dir/case_map1.csv',\ + '$case_dir/case_map2.csv']; \ inp['reports'] = '$instance_dir/reports.csv'; \ inp['truefile'] = '$instance_dir/truedist.csv'; \ inp['outdir'] = '$out_dir'; \ diff --git a/client/python/rappor.py b/client/python/rappor.py index 4423f8af..5481963b 100644 --- a/client/python/rappor.py +++ b/client/python/rappor.py @@ -216,14 +216,16 @@ def __init__(self, params, user_id, rand_funcs=None): self.p_gen = self.rand_funcs.p_gen self.q_gen = self.rand_funcs.q_gen - def encode(self, word): + def encode(self, word, assigned_cohort = -1): """Compute rappor (Instantaneous Randomized Response).""" params = self.params cohort, uniform, f_mask = get_rappor_masks(self.user_id, word, params, self.rand_funcs) - + if (assigned_cohort != -1) and (assigned_cohort in + range(0, params.num_cohorts)): + cohort = assigned_cohort bloom_bits_array = 0 # Compute Bloom Filter for hash_no in xrange(params.num_hashes): diff --git a/tests/analyze_assoc_expt.R b/tests/analyze_assoc_expt.R index cbd2e7ff..27208017 100755 --- a/tests/analyze_assoc_expt.R +++ b/tests/analyze_assoc_expt.R @@ -60,6 +60,18 @@ ProcessMap <- function(map, params) { map } +# TV distance = L1 distance / 2 = 1 - sum(min(df1|x, df2|x)) where +# df1|x / df2|x projects the distribution to the intersection x of the +# supports of df1 and df2 +TVDistance <- function(df1, df2, statement = "TV DISTANCE") { + rowsi <- intersect(rownames(df1), rownames(df2)) + colsi <- intersect(colnames(df1), colnames(df2)) + print(statement) + 1 - sum(mapply(min, + unlist(as.data.frame(df1)[rowsi, colsi], use.names = FALSE), + unlist(as.data.frame(df2)[rowsi, colsi], use.names = FALSE))) +} + # Function to combine reports # Currently assume 2-way marginals CombineReports <- function(reports1, reports2) { @@ -190,154 +202,273 @@ GenerateNoiseMatrix <- function(params) { NoiseMatrix } - -main <- function(opts) { - ptm <- proc.time() - direct_simulation = TRUE - inp <- fromJSON(opts$inp) +# ------------------------------------------------------------------------ +## +## Direct simulation of reports without simulated variance +## +## Inputs: +## +## Outputs: +# +# ------------------------------------------------------------------------ +DirectSimulationOfReports <- function(inp) { params <- ReadParameterFile(inp$params) + # TWO WAY ASSOCIATIONS; INPUTS SIMULATED DIRECTLY - if(direct_simulation == TRUE) { - # TWO WAY ASSOCIATIONS; INPUTS SIMULATED DIRECTLY - - strconstant <- c("string", "option") - N <- inp$num - n1 <- inp$varcandidates[[1]] - n2 <- inp$varcandidates[[2]] - - # Construct unique vals for each variable using strconstant - stopifnot(length(strconstant) == inp$numvars) - uvals <- lapply(1:inp$numvars, - function(i) { - apply(as.matrix(1:inp$varcandidates[[i]]), - 1, - function(z) sprintf("%s%d", strconstant[[i]], z)) - }) - - # Add extras if any - if(inp$extras > 0) { - uvals[[1]] <- c(uvals[[1]], apply(as.matrix(1:inp$extras), 1, - function(z) sprintf("%s%d", strconstant[[1]], z + n1))) - } - - # Compute map - map <- lapply(uvals, function(u) CreateMap(u, params)) - - # Trim maps to real # of candidates - # Use extras only for decoding - tmap <- lapply(map[[1]]$map, function(i) i[, 1:n1]) - crmap_trimmed <- CombineMaps(tmap, map[[2]]$map)$crmap - - # Sample values to compute partition - # Zipfian over n1 strings - v1_part <- RandomPartition(N, ComputePdf("zipf1.5", n1)) - # Zipfian over n2 strings for each of variable 1 - # Distr. are correlated as in assoc_sim.R - final_part <- as.vector(sapply(1:n1, - function(i) { - v2_part <- RandomPartition(v1_part[[i]], - ComputePdf("zipf1.5", n2)) - if (i %% 2 == 0) {v2_part} else {rev(v2_part)} - })) - - td <- matrix(final_part/sum(final_part), nrow = n1, ncol = n2, byrow = TRUE) - v2_part <- RandomPartition(N, apply(td, 2, sum)) - ow_parts <- list(v1_part, v2_part) - ow_parts[[1]] <- c(ow_parts[[1]], rep(0, inp$extra)) - - # -------------- - # Generate 1-way counts - ow_counts <- lapply(1:2, function(i) - GenerateCounts(params, map[[i]]$rmap, ow_parts[[i]], 1)) - found_strings <- lapply(1:2, function(i) - Decode(ow_counts[[i]], - map[[i]]$rmap, - params, quick = TRUE)$fit$strings) - # -------------- - - rownames(td) <- uvals[[1]][1:n1] # Don't take into account extras - colnames(td) <- uvals[[2]] - print("TRUE DISTRIBUTION") - print(signif(td, 4)) - cohorts <- as.matrix( - apply(as.data.frame(final_part), 1, - function(count) RandomPartition(count, rep(1, params$m)))) - expanded <- apply(cohorts, 2, function(vec) rep(vec, each = ((params$k)**2)*4)) - true_ones <- apply(expanded * crmap_trimmed, 1, sum) - - - - - NoiseMatrix <- GenerateNoiseMatrix(params) - after_noise <- as.vector(sapply(1:(length(true_ones)/4), - function(x) - t(NoiseMatrix) %*% true_ones[((x-1)*4+1):(x*4)])) - counts <- cbind(apply(cohorts, 1, sum), - matrix(after_noise, - nrow = params$m, - ncol = 4 * (params$k**2), - byrow = TRUE)) - - params2 <- params - params2$k <- (params$k ** 2) * 4 - - # Combine maps to feed into Decode2Way - # Prune first to found_strings - pruned <- lapply(1:2, function(i) - lapply(map[[i]]$map, function(z) z[,found_strings[[i]]])) - crmap <- CombineMaps(pruned[[1]], pruned[[2]])$crmap - marginal <- Decode2Way(counts, crmap, params2)$fit - - # Fill in estimated results with rows and cols from td - ed <- matrix(0, nrow = (n1+inp$extra), ncol = n2) - rownames(ed) <- uvals[[1]] - colnames(ed) <- uvals[[2]] - for (cols in colnames(td)) { - for (rows in rownames(td)) { - ed[rows, cols] <- marginal[paste(rows, cols, sep = "x"), "Estimate"] - } + strconstant <- c("string", "option") + N <- inp$num + n1 <- inp$varcandidates[[1]] + n2 <- inp$varcandidates[[2]] + + # Construct unique vals for each variable using strconstant + stopifnot(length(strconstant) == inp$numvars) + uvals <- lapply(1:inp$numvars, + function(i) { + apply(as.matrix(1:inp$varcandidates[[i]]), + 1, + function(z) sprintf("%s%d", strconstant[[i]], z)) + }) + + # Add extras if any + if(inp$extras > 0) { + uvals[[1]] <- c(uvals[[1]], apply(as.matrix(1:inp$extras), 1, + function(z) sprintf("%s%d", strconstant[[1]], z + n1))) + } + + # Compute map + map <- lapply(uvals, function(u) CreateMap(u, params)) + + # Trim maps to real # of candidates + # Use extras only for decoding + tmap <- lapply(map[[1]]$map, function(i) i[, 1:n1]) + crmap_trimmed <- CombineMaps(tmap, map[[2]]$map)$crmap + + # Sample values to compute partition + # Zipfian over n1 strings + v1_part <- RandomPartition(N, ComputePdf("zipf1.5", n1)) + # Zipfian over n2 strings for each of variable 1 + # Distr. are correlated as in assoc_sim.R + final_part <- as.vector(sapply(1:n1, + function(i) { + v2_part <- RandomPartition(v1_part[[i]], + ComputePdf("zipf1.5", n2)) + if (i %% 2 == 0) {v2_part} else {rev(v2_part)} + })) + + td <- matrix(final_part/sum(final_part), nrow = n1, ncol = n2, byrow = TRUE) + v2_part <- RandomPartition(N, apply(td, 2, sum)) + ow_parts <- list(v1_part, v2_part) + ow_parts[[1]] <- c(ow_parts[[1]], rep(0, inp$extra)) + + # -------------- + # Generate 1-way counts + ow_counts <- lapply(1:2, function(i) + GenerateCounts(params, map[[i]]$rmap, ow_parts[[i]], 1)) + found_strings <- lapply(1:2, function(i) + Decode(ow_counts[[i]], + map[[i]]$rmap, + params, quick = TRUE)$fit$strings) + # -------------- + + rownames(td) <- uvals[[1]][1:n1] # Don't take into account extras + colnames(td) <- uvals[[2]] + print("TRUE DISTRIBUTION") + print(signif(td, 4)) + cohorts <- as.matrix( + apply(as.data.frame(final_part), 1, + function(count) RandomPartition(count, rep(1, params$m)))) + expanded <- apply(cohorts, 2, function(vec) rep(vec, each = ((params$k)**2)*4)) + true_ones <- apply(expanded * crmap_trimmed, 1, sum) + + NoiseMatrix <- GenerateNoiseMatrix(params) + after_noise <- as.vector(sapply(1:(length(true_ones)/4), + function(x) + t(NoiseMatrix) %*% true_ones[((x-1)*4+1):(x*4)])) + counts <- cbind(apply(cohorts, 1, sum), + matrix(after_noise, + nrow = params$m, + ncol = 4 * (params$k**2), + byrow = TRUE)) + + params2 <- params + params2$k <- (params$k ** 2) * 4 + + # Combine maps to feed into Decode2Way + # Prune first to found_strings from Decode on 1-way counts + pruned <- lapply(1:2, function(i) + lapply(map[[i]]$map, function(z) z[,found_strings[[i]]])) + crmap <- CombineMaps(pruned[[1]], pruned[[2]])$crmap + marginal <- Decode2Way(counts, crmap, params2)$fit + + # Fill in estimated results with rows and cols from td + ed <- matrix(0, nrow = (n1+inp$extra), ncol = n2) + rownames(ed) <- uvals[[1]] + colnames(ed) <- uvals[[2]] + for (cols in colnames(td)) { + for (rows in rownames(td)) { + ed[rows, cols] <- marginal[paste(rows, cols, sep = "x"), "Estimate"] } - ed[is.na(ed)] <- 0 - time_taken <- proc.time() - ptm - - print("2 WAY RESULTS") - print(signif(ed, 4)) - print(l1d(td, ed, "L1 DISTANCE 2 WAY")) - print("PROC.TIME") - print(time_taken) - chisq_td <- chisq.test(td)[1][[1]][[1]] - chisq_ed <- chisq.test(ed)[1][[1]][[1]] - if(is.nan(chisq_ed)) { - chisq_ed <- 0 + } + ed[is.na(ed)] <- 0 + time_taken <- proc.time() - ptm + + print("2 WAY RESULTS") + print(signif(ed, 4)) + print(TVDistance(td, ed, "TV DISTANCE 2 WAY ALGORITHM")) + print("PROC.TIME") + print(time_taken) + chisq_td <- chisq.test(td)[1][[1]][[1]] + chisq_ed <- chisq.test(ed)[1][[1]][[1]] + if(is.nan(chisq_ed)) { + chisq_ed <- 0 + } + if(is.nan(chisq_td)) { + chisq_td <- 0 + } + + metrics <- list( + td_chisq = chisq_td, + ed_chisq = chisq_ed, + tv = TVDistance(td, ed, ""), + time = time_taken[1], + dim1 = length(found_strings[[1]]), + dim2 = length(found_strings[[2]]) + ) + filename <- file.path(inp$outdir, 'metrics.csv') + write.csv(metrics, file = filename, row.names = FALSE) +} + +# ------------------------------------------------------------------------ +## +## Externally provided counts (gen_assoc_counts.R and sum_assoc_reports.py) +## 2 WAY ASSOCIATION ONLY +## +## Inputs: +## count files (2 way counts, individual marginal counts) +## map files (2 variables) +## +## Outputs: +# +# ------------------------------------------------------------------------ +ExternalCounts <- function(inp) { + params <- ReadParameterFile(inp$params) + # Ensure sufficient maps as required by number of vars + stopifnot(inp$numvars == length(inp$maps)) + map <- lapply(inp$maps, function(o) + ProcessMap(ReadMapFile(o, params = params), + params = params)) + + # (2 way counts, marginal 1 counts, marginal 2 counts) + counts <- lapply(1:3, function(i) ReadCountsFile(inp$counts[[i]])) + + params2 <- params + params2$k <- (params$k ** 2) * 4 + + # Prune candidates + found_strings <- lapply(1:2, function(i) + Decode(counts[[i + 1]], + map[[i]]$rmap, + params, quick = FALSE)$fit$strings) + + cmap <- mapply(CombineMaps, map[[1]]$map, map[[2]]$map) + # Combine cohorts into one map. Needed for Decode2Way + inds <- lapply(cmap, function(x) which(x, arr.ind = TRUE)) + for (i in seq(1, length(inds))) { + inds[[i]][, 1] <- inds[[i]][, 1] + (i-1) * dim(cmap[[1]])[1] + } + inds <- do.call("rbind", inds) + + # inds[[2]][, 1] <- inds[[2]][, 1] + dim(cmap[[1]])[1] + # inds <- rbind(inds[[1]], inds[[2]]) + crmap <- sparseMatrix(inds[, 1], inds[, 2], dims = c( + nrow(cmap[[1]]) * length(cmap), + ncol(cmap[[1]]))) + td <- read.csv(file = inp$truefile) + colnames(crmap) <- colnames(cmap[[1]]) + counts <- ComputeCounts(creports, cohorts[[1]], params2) + marginal <- Decode2Way(counts, crmap, params2)$fit + + also_em = FALSE + ed_em <- list() + if(also_em == TRUE) { + joint_dist <- ComputeDistributionEM(reports, cohorts, map, + ignore_other = TRUE, + quick = TRUE, + params, marginals = NULL, + estimate_var = FALSE, + new_alg = inp$newalg) + ed_em <- joint_dist$orig$fit + if(length(reports) == 3) { + ed_em <- as.data.frame(ed_em) } - if(is.nan(chisq_td)) { - chisq_td <- 0 + } + + ed <- td + for (cols in colnames(td)) { + for (rows in rownames(td)) { + ed[rows, cols] <- marginal[paste(rows, cols, sep = "x"), "Estimate"] } - - metrics <- list( - td_chisq = chisq_td, - ed_chisq = chisq_ed, - tv = l1d(td, ed, ""), - time = time_taken[1], - dim1 = length(found_strings[[1]]), - dim2 = length(found_strings[[2]]) - ) - filename <- file.path(inp$outdir, 'metrics.csv') - write.csv(metrics, file = filename, row.names = FALSE) - } else { - # ensure sufficient maps as required by number of vars - stopifnot(inp$numvars == length(inp$maps)) - opts_map <- inp$maps - map <- lapply(opts_map, function(o) - ProcessMap(ReadMapFile(o, params = params), - params = params)) + } + + time_taken <- proc.time() - ptm + + print("2 WAY RESULTS") + print(signif(ed[order(rowSums(ed)), ], 4)) + print(TVDistance(td, ed, "TV DISTANCE 2 WAY")) + print("PROC.TIME") + print(time_taken) + chisq_ed <- chisq.test(ed)[1][[1]][[1]] + if(is.nan(chisq_ed)) { + chisq_ed <- 0 + } + + metrics <- list( + td_chisq = chisq.test(td)[1][[1]][[1]], + ed_chisq = chisq_ed, + tv = TVDistance(td, ed, ""), + time = time_taken[1], + dim1 = length(found_strings[[1]]), + dim2 = length(found_strings[[2]]) + ) + + if(also_em == TRUE) { + # Add EM metrics + metrics <- c(metrics, + list(ed_em_chisq = chisq.test(ed_em)[1][[1]][[1]], + tv_em = TVDistance(td, ed_em, "")/2)) + } + + # Write metrics to metrics.csv + filename <- file.path(inp$outdir, 'metrics.csv') + write.csv(metrics, file = filename, row.names = FALSE) +} + +# ------------------------------------------------------------------------ +## +## Externally provided reports +## 2 OR 3 WAY ASSOCIATION +## +## Inputs: +## +## Outputs: +# +# ------------------------------------------------------------------------ +ExternalReports <- function(inp) { + params <- ReadParameterFile(inp$params) + # Ensure sufficient maps as required by number of vars + stopifnot(inp$numvars == length(inp$maps)) + map <- lapply(inp$maps, function(o) + ProcessMap(ReadMapFile(o, params = params), + params = params)) + + if (read_reports_flag == TRUE) { # Reports must be of the format # cohort no, rappor bitstring 1, rappor bitstring 2, ... reportsObj <- read.csv(inp$reports, colClasses = c("integer", rep("character", inp$numvars)), header = FALSE) - + # Parsing reportsObj # ComputeDistributionEM allows for different sets of cohorts # for each variable. Here, both sets of cohorts are identical @@ -345,7 +476,7 @@ main <- function(opts) { cohorts <- rep(list(co), inp$numvars) # Parse reports from reportObj cols 2, 3, ... reports <- lapply(1:inp$numvars, function(x) as.list(reportsObj[x + 1])) - + # Split strings into bit arrays (as required by assoc analysis) reports <- lapply(1:inp$numvars, function(i) { # apply the following function to each of reports[[1]] and reports[[2]] @@ -354,96 +485,111 @@ main <- function(opts) { as.numeric(strsplit(x, split = "")[[1]]) }) }) - - creports <- CombineReports(reports[[1]], reports[[2]]) - params2 <- params - params2$k <- (params$k ** 2) * 4 - # CombineMaps(map[[1]]$map[[1]], map[[2]]$map[[1]]) - cmap <- mapply(CombineMaps, map[[1]]$map, map[[2]]$map) - # Combine cohorts into one map. Needed for Decode2Way - inds <- lapply(cmap, function(x) which(x, arr.ind = TRUE)) - for (i in seq(1, length(inds))) { - inds[[i]][, 1] <- inds[[i]][, 1] + (i-1) * dim(cmap[[1]])[1] - } - inds <- do.call("rbind", inds) - - # inds[[2]][, 1] <- inds[[2]][, 1] + dim(cmap[[1]])[1] - # inds <- rbind(inds[[1]], inds[[2]]) - crmap <- sparseMatrix(inds[, 1], inds[, 2], dims = c( - nrow(cmap[[1]]) * length(cmap), - ncol(cmap[[1]]))) - td <- read.csv(file = inp$truefile) - colnames(crmap) <- colnames(cmap[[1]]) - counts <- ComputeCounts(creports, cohorts[[1]], params2) - marginal <- Decode2Way(counts, crmap, params2)$fit - - also_em = FALSE - ed_em <- list() - if(also_em == TRUE) { - joint_dist <- ComputeDistributionEM(reports, cohorts, map, - ignore_other = TRUE, - quick = TRUE, - params, marginals = NULL, - estimate_var = FALSE, - new_alg = inp$newalg) - ed_em <- joint_dist$orig$fit - if(length(reports) == 3) { - ed_em <- as.data.frame(ed_em) - } - } - - ed <- td - for (cols in colnames(td)) { - for (rows in rownames(td)) { - ed[rows, cols] <- marginal[paste(rows, cols, sep = "x"), "Estimate"] - } - } - time_taken <- proc.time() - ptm - - print("2 WAY RESULTS") - print(signif(ed[order(rowSums(ed)), ], 4)) - print(l1d(td, ed, "L1 DISTANCE 2 WAY")) - print("PROC.TIME") - print(time_taken) - chisq_ed <- chisq.test(ed)[1][[1]][[1]] - if(is.nan(chisq_ed)) { - chisq_ed <- 0 + creports <- CombineReports(reports[[1]], reports[[2]]) + } + + params2 <- params + params2$k <- (params$k ** 2) * 4 + # CombineMaps(map[[1]]$map[[1]], map[[2]]$map[[1]]) + cmap <- mapply(CombineMaps, map[[1]]$map, map[[2]]$map) + # Combine cohorts into one map. Needed for Decode2Way + inds <- lapply(cmap, function(x) which(x, arr.ind = TRUE)) + for (i in seq(1, length(inds))) { + inds[[i]][, 1] <- inds[[i]][, 1] + (i-1) * dim(cmap[[1]])[1] + } + inds <- do.call("rbind", inds) + + # inds[[2]][, 1] <- inds[[2]][, 1] + dim(cmap[[1]])[1] + # inds <- rbind(inds[[1]], inds[[2]]) + crmap <- sparseMatrix(inds[, 1], inds[, 2], dims = c( + nrow(cmap[[1]]) * length(cmap), + ncol(cmap[[1]]))) + td <- read.csv(file = inp$truefile) + colnames(crmap) <- colnames(cmap[[1]]) + counts <- ComputeCounts(creports, cohorts[[1]], params2) + marginal <- Decode2Way(counts, crmap, params2)$fit + + also_em = FALSE + ed_em <- list() + if(also_em == TRUE) { + joint_dist <- ComputeDistributionEM(reports, cohorts, map, + ignore_other = TRUE, + quick = TRUE, + params, marginals = NULL, + estimate_var = FALSE, + new_alg = inp$newalg) + ed_em <- joint_dist$orig$fit + if(length(reports) == 3) { + ed_em <- as.data.frame(ed_em) } - - metrics <- list( - td_chisq = chisq.test(td)[1][[1]][[1]], - ed_chisq = chisq_ed, - tv = l1d(td, ed, ""), - time = time_taken[1], - dim1 = length(found_strings[[1]]), - dim2 = length(found_strings[[2]]) - ) - - if(also_em == TRUE) { - # Add EM metrics - metrics <- c(metrics, - list(ed_em_chisq = chisq.test(ed_em)[1][[1]][[1]], - tv_em = l1d(td, ed_em, "")/2)) + } + + ed <- td + for (cols in colnames(td)) { + for (rows in rownames(td)) { + ed[rows, cols] <- marginal[paste(rows, cols, sep = "x"), "Estimate"] } - - # Write metrics to metrics.csv - # Report l1 distance / 2 to be consistent with histogram analysis - filename <- file.path(inp$outdir, 'metrics.csv') - write.csv(metrics, file = filename, row.names = FALSE) - } + } + + time_taken <- proc.time() - ptm + + print("2 WAY RESULTS") + print(signif(ed[order(rowSums(ed)), ], 4)) + print(TVDistance(td, ed, "TV DISTANCE 2 WAY")) + print("PROC.TIME") + print(time_taken) + chisq_ed <- chisq.test(ed)[1][[1]][[1]] + if(is.nan(chisq_ed)) { + chisq_ed <- 0 + } + + metrics <- list( + td_chisq = chisq.test(td)[1][[1]][[1]], + ed_chisq = chisq_ed, + tv = TVDistance(td, ed, ""), + time = time_taken[1], + dim1 = length(found_strings[[1]]), + dim2 = length(found_strings[[2]]) + ) + + if(also_em == TRUE) { + # Add EM metrics + metrics <- c(metrics, + list(ed_em_chisq = chisq.test(ed_em)[1][[1]][[1]], + tv_em = TVDistance(td, ed_em, "")/2)) + } + + # Write metrics to metrics.csv + filename <- file.path(inp$outdir, 'metrics.csv') + write.csv(metrics, file = filename, row.names = FALSE) } -# L1 distance = 1 - sum(min(df1|x, df2|x)) where -# df1|x / df2|x projects the distribution to the intersection x of the -# supports of df1 and df2 -l1d <- function(df1, df2, statement = "L1 DISTANCE") { - rowsi <- intersect(rownames(df1), rownames(df2)) - colsi <- intersect(colnames(df1), colnames(df2)) - print(statement) - 1 - sum(mapply(min, - unlist(as.data.frame(df1)[rowsi, colsi], use.names = FALSE), - unlist(as.data.frame(df2)[rowsi, colsi], use.names = FALSE))) +main <- function(opts) { + ptm <- proc.time() + direct_simulation = FALSE + inp <- fromJSON(opts$inp) + + # Choose from a set of experiments to run + # direct -> direct simulation of reports (without variances) + # external-counts -> externally supplied counts for 2 way and marginals + # external-reports -> externally supplied reports + if (!(inp$expt %in% c("direct", "external-counts", "external-reports"))) { + stop("Incorrect experiment in JSON file.") + } + + if(inp$expt == "direct") { + print("---------- RUNNING EXPERIMENT \"DIRECT\" ----------") + DirectSimulationOfReports(inp) + } + if (inp$expt == "external-counts") { + print("---------- RUNNING EXPERIMENT \"EXT COUNTS\" ----------") + ExternalCounts(inp) + } + if (inp$expt == "external-reports") { + print("---------- RUNNING EXPERIMENT \"EXT REPORTS\" ----------") + ExternalReports(inp) + } } if(!interactive()) { diff --git a/tests/regtest_spec.py b/tests/regtest_spec.py index 31f1d1ac..845dc93b 100755 --- a/tests/regtest_spec.py +++ b/tests/regtest_spec.py @@ -69,6 +69,7 @@ 'fizz': (100, 20, int(1e05), int(1e04)), 'fizz-bool': (100, 2, int(1e05), int(1e04)), + 'toy': (5, 2, 1e04, 20), # for testing purposes only 'compact-noextra-small': (40, 5, 1e04, 0), 'loose-noextra-small': (100, 20, 1e04, 0), 'compact-noextra-large': (40, 5, 1e06, 0), @@ -94,6 +95,7 @@ # 'p, q, f' as in params file. PRIVACY_PARAMS = { + 'eps_zero': (0, 0.99, 0), # testing purposes only! 'eps_1_1': (0.39, 0.61, 0.45), # eps_1 = 1, eps_inf = 5: 'eps_1_5': (0.225, 0.775, 0.0), # eps_1 = 5, no eps_inf 'eps_verysmall': (0.125, 0.875, 0.125), @@ -133,9 +135,9 @@ #} ASSOC_TEST_CONFIG = { - 'distr': ('fizz-small',), - 'blooms': ('8x32',), - 'privacy': ('eps_small',) + 'distr': ('toy',), + 'blooms': ('16x32',), + 'privacy': ('eps_zero',) } # From e296f6b5100f92225aa1ef2caadfcb0261118f48 Mon Sep 17 00:00:00 2001 From: Ananth Raghunathan Date: Wed, 24 Jun 2015 23:55:25 -0700 Subject: [PATCH 31/67] Assoctest.sh test suite for experiments up. --- assoctest.sh | 9 ++++-- tests/analyze_assoc_expt.R | 64 +++++++++----------------------------- tests/regtest_spec.py | 2 +- 3 files changed, 22 insertions(+), 53 deletions(-) diff --git a/assoctest.sh b/assoctest.sh index 047d09fc..f433862f 100755 --- a/assoctest.sh +++ b/assoctest.sh @@ -134,7 +134,6 @@ _run-one-instance() { "$instance_dir/case" \ < $instance_dir/case_out.csv - return # Setting up JSON file containing assoc_sim inputs with python python -c "import json; \ @@ -170,7 +169,7 @@ _run-one-instance() { inp['maps'] = ['$case_dir/case_map1.csv',\ '$case_dir/case_map2.csv']; \ inp['reports'] = '$instance_dir/reports.csv'; \ - inp['truefile'] = '$instance_dir/truedist.csv'; \ + inp['truefile'] = '$instance_dir/case.csv'; \ inp['outdir'] = '$out_dir'; \ inp['params'] = '$case_dir/case_params.csv'; \ inp['newalg'] = 'false'; \ @@ -178,6 +177,10 @@ _run-one-instance() { inp['num'] = $num_clients; \ inp['extras'] = $num_extras; \ inp['varcandidates'] = [$num_unique_values, $num_unique_values2]; \ + inp['counts'] = ['$instance_dir/case_2way.csv',\ + '$instance_dir/case_marg1.csv',\ + '$instance_dir/case_marg2.csv']; \ + inp['expt'] = 'external-counts'; \ json.dump(inp, f); \ f.close();" @@ -267,7 +270,7 @@ _run-tests() { local processors=1 if test $parallel = F; then - func=_run-one-instance # output to the console + func=_run-one-instance-logged # output to the console else func=_run-one-instance-logged processors=$(grep -c ^processor /proc/cpuinfo || echo 4) # POSIX-specific diff --git a/tests/analyze_assoc_expt.R b/tests/analyze_assoc_expt.R index 27208017..be630207 100755 --- a/tests/analyze_assoc_expt.R +++ b/tests/analyze_assoc_expt.R @@ -68,8 +68,8 @@ TVDistance <- function(df1, df2, statement = "TV DISTANCE") { colsi <- intersect(colnames(df1), colnames(df2)) print(statement) 1 - sum(mapply(min, - unlist(as.data.frame(df1)[rowsi, colsi], use.names = FALSE), - unlist(as.data.frame(df2)[rowsi, colsi], use.names = FALSE))) + unlist(as.data.frame(df1[rowsi, colsi]), use.names = FALSE), + unlist(as.data.frame(df2[rowsi, colsi]), use.names = FALSE))) } # Function to combine reports @@ -351,6 +351,7 @@ DirectSimulationOfReports <- function(inp) { # # ------------------------------------------------------------------------ ExternalCounts <- function(inp) { + ptm <- proc.time() params <- ReadParameterFile(inp$params) # Ensure sufficient maps as required by number of vars stopifnot(inp$numvars == length(inp$maps)) @@ -370,39 +371,15 @@ ExternalCounts <- function(inp) { map[[i]]$rmap, params, quick = FALSE)$fit$strings) - cmap <- mapply(CombineMaps, map[[1]]$map, map[[2]]$map) - # Combine cohorts into one map. Needed for Decode2Way - inds <- lapply(cmap, function(x) which(x, arr.ind = TRUE)) - for (i in seq(1, length(inds))) { - inds[[i]][, 1] <- inds[[i]][, 1] + (i-1) * dim(cmap[[1]])[1] - } - inds <- do.call("rbind", inds) - - # inds[[2]][, 1] <- inds[[2]][, 1] + dim(cmap[[1]])[1] - # inds <- rbind(inds[[1]], inds[[2]]) - crmap <- sparseMatrix(inds[, 1], inds[, 2], dims = c( - nrow(cmap[[1]]) * length(cmap), - ncol(cmap[[1]]))) - td <- read.csv(file = inp$truefile) - colnames(crmap) <- colnames(cmap[[1]]) - counts <- ComputeCounts(creports, cohorts[[1]], params2) - marginal <- Decode2Way(counts, crmap, params2)$fit - - also_em = FALSE - ed_em <- list() - if(also_em == TRUE) { - joint_dist <- ComputeDistributionEM(reports, cohorts, map, - ignore_other = TRUE, - quick = TRUE, - params, marginals = NULL, - estimate_var = FALSE, - new_alg = inp$newalg) - ed_em <- joint_dist$orig$fit - if(length(reports) == 3) { - ed_em <- as.data.frame(ed_em) - } - } - + # Combine maps to feed into Decode2Way + # Prune first to found_strings from Decode on 1-way counts + pruned <- lapply(1:2, function(i) + lapply(map[[i]]$map, function(z) z[,found_strings[[i]]])) + crmap <- CombineMaps(pruned[[1]], pruned[[2]])$crmap + marginal <- Decode2Way(counts[[1]], crmap, params2)$fit + td <- read.csv(file = inp$truefile, header = FALSE) + td <- table(td[,2:3]) + td <- td / sum(td) ed <- td for (cols in colnames(td)) { for (rows in rownames(td)) { @@ -412,8 +389,6 @@ ExternalCounts <- function(inp) { time_taken <- proc.time() - ptm - print("2 WAY RESULTS") - print(signif(ed[order(rowSums(ed)), ], 4)) print(TVDistance(td, ed, "TV DISTANCE 2 WAY")) print("PROC.TIME") print(time_taken) @@ -431,13 +406,6 @@ ExternalCounts <- function(inp) { dim2 = length(found_strings[[2]]) ) - if(also_em == TRUE) { - # Add EM metrics - metrics <- c(metrics, - list(ed_em_chisq = chisq.test(ed_em)[1][[1]][[1]], - tv_em = TVDistance(td, ed_em, "")/2)) - } - # Write metrics to metrics.csv filename <- file.path(inp$outdir, 'metrics.csv') write.csv(metrics, file = filename, row.names = FALSE) @@ -566,8 +534,6 @@ ExternalReports <- function(inp) { } main <- function(opts) { - ptm <- proc.time() - direct_simulation = FALSE inp <- fromJSON(opts$inp) # Choose from a set of experiments to run @@ -579,15 +545,15 @@ main <- function(opts) { } if(inp$expt == "direct") { - print("---------- RUNNING EXPERIMENT \"DIRECT\" ----------") + print("---------- RUNNING EXPERIMENT DIRECT ----------") DirectSimulationOfReports(inp) } if (inp$expt == "external-counts") { - print("---------- RUNNING EXPERIMENT \"EXT COUNTS\" ----------") + print("---------- RUNNING EXPERIMENT EXT COUNTS ----------") ExternalCounts(inp) } if (inp$expt == "external-reports") { - print("---------- RUNNING EXPERIMENT \"EXT REPORTS\" ----------") + print("---------- RUNNING EXPERIMENT EXT REPORTS ----------") ExternalReports(inp) } } diff --git a/tests/regtest_spec.py b/tests/regtest_spec.py index 845dc93b..8a8ca60a 100755 --- a/tests/regtest_spec.py +++ b/tests/regtest_spec.py @@ -137,7 +137,7 @@ ASSOC_TEST_CONFIG = { 'distr': ('toy',), 'blooms': ('16x32',), - 'privacy': ('eps_zero',) + 'privacy': ('eps_small',) } # From 53233bab26a05d63ed47b4ea99178b36554593a8 Mon Sep 17 00:00:00 2001 From: Ananth Raghunathan Date: Thu, 25 Jun 2015 12:02:17 -0700 Subject: [PATCH 32/67] The test suite now can run two experiments simultaneously. --- analysis/tools/sum_bits_assoc.py | 121 ++++++++++++++++++ assoctest.sh | 8 +- tests/analyze_assoc_expt.R | 157 ++++++++++------------- tests/make_summary_assoc.py | 11 +- tests/rappor_assoc_sim.py | 211 +++++++++++++++++++++++++++++++ tests/regtest_spec.py | 34 +++-- 6 files changed, 435 insertions(+), 107 deletions(-) create mode 100755 analysis/tools/sum_bits_assoc.py create mode 100755 tests/rappor_assoc_sim.py diff --git a/analysis/tools/sum_bits_assoc.py b/analysis/tools/sum_bits_assoc.py new file mode 100755 index 00000000..acf5ea2c --- /dev/null +++ b/analysis/tools/sum_bits_assoc.py @@ -0,0 +1,121 @@ +#!/usr/bin/python +# +# Copyright 2014 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Read the RAPPOR'd values on stdin, and sum the bits to produce a Counting Bloom +filter by cohort. This can then be analyzed by R. +""" + +import csv +import sys + +import rappor + + +def SumBits(params, stdin, f_2way, f_1, f_2): + csv_in = csv.reader(stdin) + csv_out_two_way = csv.writer(open(f_2way, "w")) + csv_out_1 = csv.writer(open(f_1, "w")) + csv_out_2 = csv.writer(open(f_2, "w")) + + num_cohorts = params.num_cohorts + num_bloombits = params.num_bloombits + + sums = [[0] * (4 * (num_bloombits ** 2)) for _ in xrange(num_cohorts)] + sums_1 = [[0] * num_bloombits for _ in xrange(num_cohorts)] + sums_2 = [[0] * num_bloombits for _ in xrange(num_cohorts)] + num_reports = [0] * num_cohorts + + for i, row in enumerate(csv_in): + try: + (user_id, cohort, irr_1, irr_2) = row + except ValueError: + raise RuntimeError('Error parsing row %r' % row) + + if i == 0: + continue # skip header + + cohort = int(cohort) + num_reports[cohort] += 1 + + # TODO: Extend checking for both reports + if not len(irr_1) == params.num_bloombits: + raise RuntimeError( + "Expected %d bits, got %r" % (params.num_bloombits, len(irr_1))) + # "Unrolled" joint encoding of both reports + for i, c in enumerate(irr_1): + for j, d in enumerate(irr_2): + index = 4 * ((num_bloombits - i - 1) * params.num_bloombits + + num_bloombits - j - 1) + if (c == '1' and d == '1'): + sums[cohort][index] += 1 + elif (c == '0' and d == '1'): + sums[cohort][index + 1] += 1 + elif (c == '1' and d == '0'): + sums[cohort][index + 2] += 1 + elif (c == '0' and d == '0'): + sums[cohort][index + 3] += 1 + else: + raise RuntimeError('Invalid IRRs -- digits should be 0 or 1') + + for i, c in enumerate(irr_1): + bit_num = num_bloombits - i - 1 # e.g. char 0 = bit 15, char 15 = bit 0 + if c == '1': + sums_1[cohort][bit_num] += 1 + else: + if c != '0': + raise RuntimeError('Invalid IRR -- digits should be 0 or 1') + + for i, c in enumerate(irr_2): + bit_num = num_bloombits - i - 1 # e.g. char 0 = bit 15, char 15 = bit 0 + if c == '1': + sums_2[cohort][bit_num] += 1 + else: + if c != '0': + raise RuntimeError('Invalid IRR -- digits should be 0 or 1') + + for cohort in xrange(num_cohorts): + # First column is the total number of reports in the cohort. + row = [num_reports[cohort]] + sums[cohort] + csv_out_two_way.writerow(row) + row = [num_reports[cohort]] + sums_1[cohort] + csv_out_1.writerow(row) + row = [num_reports[cohort]] + sums_2[cohort] + csv_out_2.writerow(row) + + +def main(argv): + try: + filename = argv[1] + prefix = argv[2] + except IndexError: + raise RuntimeError('Usage: sum_bits.py ') + with open(filename) as f: + try: + params = rappor.Params.from_csv(f) + except rappor.Error as e: + raise RuntimeError(e) + + SumBits(params, sys.stdin, prefix + "_2way.csv", + prefix + "_marg1.csv", prefix + "_marg2.csv") + + +if __name__ == '__main__': + try: + main(sys.argv) + except RuntimeError, e: + print >>sys.stderr, e.args[0] + sys.exit(1) diff --git a/assoctest.sh b/assoctest.sh index f433862f..3cbd2f8b 100755 --- a/assoctest.sh +++ b/assoctest.sh @@ -274,8 +274,10 @@ _run-tests() { else func=_run-one-instance-logged processors=$(grep -c ^processor /proc/cpuinfo || echo 4) # POSIX-specific - if test $processors -gt 1; then # leave one CPU for the OS - processors=$(expr $processors - 1) + if test $processors -gt 3; then # leave few CPUs for the OS + processors=$(expr $processors - 3) + else + processors=1 fi log "Running $processors parallel processes" fi @@ -326,7 +328,7 @@ run-all() { log "Running all tests. Can take a while." # a- for assoc tests # F for sequential - _run-tests '^a-' $instances F T + _run-tests '^a-' $instances T T } "$@" diff --git a/tests/analyze_assoc_expt.R b/tests/analyze_assoc_expt.R index be630207..77fc7df3 100755 --- a/tests/analyze_assoc_expt.R +++ b/tests/analyze_assoc_expt.R @@ -370,11 +370,16 @@ ExternalCounts <- function(inp) { Decode(counts[[i + 1]], map[[i]]$rmap, params, quick = FALSE)$fit$strings) + if (length(found_strings[[1]]) == 0 || length(found_strings[[2]]) == 0) { + print("FOUND_STRINGS") + print(found_strings) + stop("No strings found in 1-way marginal.") + } # Combine maps to feed into Decode2Way # Prune first to found_strings from Decode on 1-way counts pruned <- lapply(1:2, function(i) - lapply(map[[i]]$map, function(z) z[,found_strings[[i]]])) + lapply(map[[i]]$map, function(z) z[,found_strings[[i]], drop = FALSE])) crmap <- CombineMaps(pruned[[1]], pruned[[2]])$crmap marginal <- Decode2Way(counts[[1]], crmap, params2)$fit td <- read.csv(file = inp$truefile, header = FALSE) @@ -386,19 +391,24 @@ ExternalCounts <- function(inp) { ed[rows, cols] <- marginal[paste(rows, cols, sep = "x"), "Estimate"] } } + ed[is.na(ed)] <- 0 time_taken <- proc.time() - ptm print(TVDistance(td, ed, "TV DISTANCE 2 WAY")) print("PROC.TIME") print(time_taken) + chisq_td <- chisq.test(td)[1][[1]][[1]] chisq_ed <- chisq.test(ed)[1][[1]][[1]] + if(is.nan(chisq_td)) { + chisq_td <- 0 + } if(is.nan(chisq_ed)) { chisq_ed <- 0 } metrics <- list( - td_chisq = chisq.test(td)[1][[1]][[1]], + td_chisq = chisq_td, ed_chisq = chisq_ed, tv = TVDistance(td, ed, ""), time = time_taken[1], @@ -414,14 +424,16 @@ ExternalCounts <- function(inp) { # ------------------------------------------------------------------------ ## ## Externally provided reports -## 2 OR 3 WAY ASSOCIATION +## EM ALGORITHM +## TODO: Also support 3 way association ## ## Inputs: ## ## Outputs: # # ------------------------------------------------------------------------ -ExternalReports <- function(inp) { +ExternalReportsEM <- function(inp) { + ptm <- proc.time() params <- ReadParameterFile(inp$params) # Ensure sufficient maps as required by number of vars stopifnot(inp$numvars == length(inp$maps)) @@ -429,107 +441,68 @@ ExternalReports <- function(inp) { ProcessMap(ReadMapFile(o, params = params), params = params)) - if (read_reports_flag == TRUE) { - # Reports must be of the format - # cohort no, rappor bitstring 1, rappor bitstring 2, ... - reportsObj <- read.csv(inp$reports, - colClasses = c("integer", + # Reports must be of the format + # cohort no, rappor bitstring 1, rappor bitstring 2, ... + reportsObj <- read.csv(inp$reports, + colClasses = c("integer", "integer", rep("character", inp$numvars)), - header = FALSE) + header = TRUE) + # Ignore the first column + reportsObj <- reportsObj[,-1] + # Parsing reportsObj + # ComputeDistributionEM allows for different sets of cohorts + # for each variable. Here, both sets of cohorts are identical + co <- as.list(reportsObj[1])[[1]] + co <- co + 1 # 1 indexing + cohorts <- rep(list(co), inp$numvars) + # Parse reports from reportObj cols 2, 3, ... + reports <- lapply(1:inp$numvars, function(x) as.list(reportsObj[x + 1])) - # Parsing reportsObj - # ComputeDistributionEM allows for different sets of cohorts - # for each variable. Here, both sets of cohorts are identical - co <- as.list(reportsObj[1])[[1]] - cohorts <- rep(list(co), inp$numvars) - # Parse reports from reportObj cols 2, 3, ... - reports <- lapply(1:inp$numvars, function(x) as.list(reportsObj[x + 1])) - - # Split strings into bit arrays (as required by assoc analysis) - reports <- lapply(1:inp$numvars, function(i) { - # apply the following function to each of reports[[1]] and reports[[2]] - lapply(reports[[i]][[1]], function(x) { - # function splits strings and converts them to numeric values - as.numeric(strsplit(x, split = "")[[1]]) - }) + # Split strings into bit arrays (as required by assoc analysis) + reports <- lapply(1:inp$numvars, function(i) { + # apply the following function to each of reports[[1]] and reports[[2]] + lapply(reports[[i]][[1]], function(x) { + # function splits strings and converts them to numeric values + # rev needed for endianness + rev(as.numeric(strsplit(x, split = "")[[1]])) }) + }) - creports <- CombineReports(reports[[1]], reports[[2]]) - } - - params2 <- params - params2$k <- (params$k ** 2) * 4 - # CombineMaps(map[[1]]$map[[1]], map[[2]]$map[[1]]) - cmap <- mapply(CombineMaps, map[[1]]$map, map[[2]]$map) - # Combine cohorts into one map. Needed for Decode2Way - inds <- lapply(cmap, function(x) which(x, arr.ind = TRUE)) - for (i in seq(1, length(inds))) { - inds[[i]][, 1] <- inds[[i]][, 1] + (i-1) * dim(cmap[[1]])[1] - } - inds <- do.call("rbind", inds) - - # inds[[2]][, 1] <- inds[[2]][, 1] + dim(cmap[[1]])[1] - # inds <- rbind(inds[[1]], inds[[2]]) - crmap <- sparseMatrix(inds[, 1], inds[, 2], dims = c( - nrow(cmap[[1]]) * length(cmap), - ncol(cmap[[1]]))) - td <- read.csv(file = inp$truefile) - colnames(crmap) <- colnames(cmap[[1]]) - counts <- ComputeCounts(creports, cohorts[[1]], params2) - marginal <- Decode2Way(counts, crmap, params2)$fit - - also_em = FALSE - ed_em <- list() - if(also_em == TRUE) { - joint_dist <- ComputeDistributionEM(reports, cohorts, map, - ignore_other = TRUE, - quick = TRUE, - params, marginals = NULL, - estimate_var = FALSE, - new_alg = inp$newalg) - ed_em <- joint_dist$orig$fit - if(length(reports) == 3) { - ed_em <- as.data.frame(ed_em) - } - } - - ed <- td - for (cols in colnames(td)) { - for (rows in rownames(td)) { - ed[rows, cols] <- marginal[paste(rows, cols, sep = "x"), "Estimate"] - } - } - + joint_dist <- ComputeDistributionEM(reports, cohorts, map, + ignore_other = TRUE, + quick = TRUE, + params, marginals = NULL, + estimate_var = FALSE, + new_alg = inp$newalg) + em <- joint_dist$orig$fit + td <- read.csv(file = inp$truefile, header = FALSE) + td <- table(td[,2:3]) + td <- td / sum(td) time_taken <- proc.time() - ptm - print("2 WAY RESULTS") - print(signif(ed[order(rowSums(ed)), ], 4)) - print(TVDistance(td, ed, "TV DISTANCE 2 WAY")) + print(TVDistance(td, em, "TV DISTANCE EM")) print("PROC.TIME") print(time_taken) - chisq_ed <- chisq.test(ed)[1][[1]][[1]] + chisq_td <- chisq.test(td)[1][[1]][[1]] + chisq_ed <- chisq.test(em)[1][[1]][[1]] + if(is.nan(chisq_td)) { + chisq_td <- 0 + } if(is.nan(chisq_ed)) { chisq_ed <- 0 } metrics <- list( - td_chisq = chisq.test(td)[1][[1]][[1]], + td_chisq = chisq_td, ed_chisq = chisq_ed, - tv = TVDistance(td, ed, ""), + tv = TVDistance(td, em, ""), time = time_taken[1], - dim1 = length(found_strings[[1]]), - dim2 = length(found_strings[[2]]) + dim1 = dim(em)[[1]], + dim2 = dim(em)[[2]] ) - if(also_em == TRUE) { - # Add EM metrics - metrics <- c(metrics, - list(ed_em_chisq = chisq.test(ed_em)[1][[1]][[1]], - tv_em = TVDistance(td, ed_em, "")/2)) - } - # Write metrics to metrics.csv - filename <- file.path(inp$outdir, 'metrics.csv') + filename <- file.path(inp$outdir, 'metrics_2.csv') write.csv(metrics, file = filename, row.names = FALSE) } @@ -540,21 +513,21 @@ main <- function(opts) { # direct -> direct simulation of reports (without variances) # external-counts -> externally supplied counts for 2 way and marginals # external-reports -> externally supplied reports - if (!(inp$expt %in% c("direct", "external-counts", "external-reports"))) { + if (!(inp$expt %in% c("direct", "external-counts", "external-reports-em"))) { stop("Incorrect experiment in JSON file.") } - if(inp$expt == "direct") { + if("direct" %in% inp$expt) { print("---------- RUNNING EXPERIMENT DIRECT ----------") DirectSimulationOfReports(inp) } - if (inp$expt == "external-counts") { + if ("external-counts" %in% inp$expt) { print("---------- RUNNING EXPERIMENT EXT COUNTS ----------") ExternalCounts(inp) } - if (inp$expt == "external-reports") { + if ("external-reports-em" %in% inp$expt) { print("---------- RUNNING EXPERIMENT EXT REPORTS ----------") - ExternalReports(inp) + ExternalReportsEM(inp) } } diff --git a/tests/make_summary_assoc.py b/tests/make_summary_assoc.py index 67843b4d..5cf29fe0 100755 --- a/tests/make_summary_assoc.py +++ b/tests/make_summary_assoc.py @@ -124,7 +124,7 @@ def MeanOfMeans(dict_of_lists): return None -def ParseSpecFile(spec_filename): +def ParseSpecFile(spec_filename, empty = False): """Parses the spec (parameters) file. Returns: @@ -135,6 +135,8 @@ def ParseSpecFile(spec_filename): spec_row = s.readline().split() spec_in_html = ' '.join('' % cell for cell in spec_row[3:]) + if empty == True: + spec_in_html = ' '.join('' for cell in spec_row[3:]) return spec_in_html @@ -321,6 +323,13 @@ def main(argv): print '{}{}{}'.format(cell1_html, spec_html, metrics_html) + # Printing metrics 2 if available + metrics_file = os.path.join(report_dir, 'metrics_2.csv') + if (os.path.isfile(metrics_file)): + metrics_dict, metrics_html = ParseMetrics(metrics_file, log_file) + print '{}{}'.format(ParseSpecFile(spec_file, empty = + True), metrics_html) + print FormatSummaryRow(metrics) print '' diff --git a/tests/rappor_assoc_sim.py b/tests/rappor_assoc_sim.py new file mode 100755 index 00000000..1c6c026d --- /dev/null +++ b/tests/rappor_assoc_sim.py @@ -0,0 +1,211 @@ +#!/usr/bin/python +# +# Copyright 2014 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tool to run RAPPOR on simulated client input. + +It takes a 3-column CSV file as generated by gen_sim_data.py, and outputs 4 +files: + + - out: 3 column CSV of RAPPOR'd data. + - params: RAPPOR parameters, needed to recover distributions from the output + - true inputs: Can be used to "cheat" and construct candidate strings + - hist: histogram of actual input values. Compare this with the histogram + the RAPPOR analysis infers from the first 3 values. + +Input columns: client,true_value +Ouput coumns: client,cohort,rappor + +See http://google.github.io/rappor/doc/data-flow.html for details. +""" + +import csv +import collections +import optparse +import os +import random +import sys +import time + +import rappor # client library +try: + import fastrand +except ImportError: + print >>sys.stderr, ( + "Native fastrand module not imported; see README for speedups") + fastrand = None + + +def log(msg, *args): + if args: + msg = msg % args + print >>sys.stderr, msg + + +def CreateOptionsParser(): + p = optparse.OptionParser() + + # We are taking a path, and not using stdin, because we read it twice. + p.add_option( + '-i', dest='infile', metavar='PATH', type='str', default='', + help='CSV input path. Header is "client,true_value"') + p.add_option( + '--out-prefix', dest='out_prefix', metavar='PATH', type='str', + default='', + help='Output prefix.') + + p.add_option( + '--num-bits', type='int', metavar='INT', dest='num_bits', default=16, + help='Number of bloom filter bits.') + p.add_option( + '--num-hashes', type='int', metavar='INT', dest='num_hashes', default=2, + help='Number of hashes.') + p.add_option( + '--num-cohorts', type='int', metavar='INT', dest='num_cohorts', + default=64, help='Number of cohorts.') + + p.add_option( + '-p', type='float', metavar='FLOAT', dest='prob_p', default=1, + help='Probability p') + p.add_option( + '-q', type='float', metavar='FLOAT', dest='prob_q', default=1, + help='Probability q') + p.add_option( + '-f', type='float', metavar='FLOAT', dest='prob_f', default=1, + help='Probability f') + + p.add_option( + '--oneprr', dest='oneprr', action='store_true', default=False, + help='Use a consistent PRR.') + + choices = ['simple', 'fast'] + p.add_option( + '-r', type='choice', metavar='STR', + dest='random_mode', default='fast', choices=choices, + help='Random algorithm (%s)' % '|'.join(choices)) + + return p + + +def make_histogram(csv_in): + """Make a histogram of the simulated input file.""" + # TODO: It would be better to share parsing with rappor_encode() + counter = collections.Counter() + for (_, word) in csv_in: + counter[word] += 1 + return dict(counter.most_common()) + + +def print_histogram(word_hist, histfile): + """Write histogram of values to histfile.""" + # Print histograms of distributions + sorted_words = sorted(word_hist.iteritems(), key=lambda pair: pair[1], + reverse=True) + fmt = "%s,%s" + print >>histfile, fmt % ("string", "count") + for pair in sorted_words: + print >>histfile, fmt % pair + + +def bit_string(irr, num_bloombits): + """Like bin(), but uses leading zeroes, and no '0b'.""" + s = '' + bits = [] + for bit_num in xrange(num_bloombits): + if irr & (1 << bit_num): + bits.append('1') + else: + bits.append('0') + return ''.join(reversed(bits)) + + +def main(argv): + (opts, argv) = CreateOptionsParser().parse_args(argv) + if not opts.infile: + raise RuntimeError('-i is required') + if not opts.out_prefix: + raise RuntimeError('--out-prefix is required') + + # Copy flags into params + params = rappor.Params() + params.num_bloombits = opts.num_bits + params.num_hashes = opts.num_hashes + params.num_cohorts = opts.num_cohorts + params.prob_p = opts.prob_p + params.prob_q = opts.prob_q + params.prob_f = opts.prob_f + params.flag_oneprr = opts.oneprr + + prefix = opts.out_prefix + + outfile = prefix + "_out.csv" + + rand = random.Random() # default Mersenne Twister randomness + #rand = random.SystemRandom() # cryptographic randomness from OS + + rand.seed() # Default: seed with sys time + + if opts.random_mode == 'simple': + rand_funcs = rappor.SimpleRandFuncs(params, rand) + elif opts.random_mode == 'fast': + if fastrand: + log('Using fastrand extension') + # NOTE: This doesn't take 'rand' + rand_funcs = fastrand.FastRandFuncs(params) + else: + log('Warning: fastrand module not importable; see README for build ' + 'instructions. Falling back to simple randomness.') + rand_funcs = rappor.SimpleRandFuncs(params, rand) + else: + raise AssertionError + + # Do RAPPOR transformation. + with open(opts.infile) as f_in, open(outfile, 'w') as f_out: + csv_in = csv.reader(f_in) + csv_out = csv.writer(f_out) + + header = ('client', 'cohort', 'rappor_var1', 'rappor_var2') + csv_out.writerow(header) + + cur_client = None # current client + + start_time = time.time() + + for i, (client, true_value_1, true_value_2) in enumerate(csv_in): + if i % 10000 == 0: + elapsed = time.time() - start_time + log('Processed %d inputs in %.2f seconds', i, elapsed) + + # New encoder instance for each client. + if client != cur_client: + cur_client = client + e = rappor.Encoder(params, cur_client, rand_funcs=rand_funcs) + + cohort, irr_1 = e.encode(true_value_1) + cohort_check, irr_2 = e.encode(true_value_2, assigned_cohort = cohort) + # Ensure same cohort used for irr_1, and irr_2 + assert cohort_check == cohort + + # encoded is a list of (cohort, rappor_1, rappor_2) triples + out_row = (client, cohort, bit_string(irr_1, params.num_bloombits), + bit_string(irr_2, params.num_bloombits)) + csv_out.writerow(out_row) + + +if __name__ == "__main__": + try: + main(sys.argv) + except RuntimeError, e: + log('rappor_sim.py: FATAL: %s', e) diff --git a/tests/regtest_spec.py b/tests/regtest_spec.py index 8a8ca60a..68599cf4 100755 --- a/tests/regtest_spec.py +++ b/tests/regtest_spec.py @@ -126,18 +126,30 @@ # privacy params set) # The test config runs a test suite that is the cross product of all the above # sets -#ASSOC_TEST_CONFIG = { -# 'distr': ('fizz-tiny', 'fizz-tiny-bool', -# 'fizz-small', 'fizz-small-bool', -# 'fizz', 'fizz-bool'), -# 'blooms': ('8x16','8x32',),# '16x32'), -# 'privacy': ('eps_small','eps_chrome') -#} - ASSOC_TEST_CONFIG = { - 'distr': ('toy',), - 'blooms': ('16x32',), - 'privacy': ('eps_small',) + 'distr': ('fizz-tiny', + 'fizz-tiny-bool', +# 'fizz-small', +# 'fizz-small-bool', +# 'fizz', +# 'fizz-bool', + 'toy',), +# 'compact-noextra-small', +# 'loose-noextra-small', +# 'compact-noextra-large', +# 'loose-noextra-large', +# 'compact-extra-small', +# 'loose-extra-small', +# 'compact-extra-large', +# 'loose-extra-large', +# 'compact-excess-small', +# 'loose-excess-small', +# 'compact-excess-large', +# 'loose-excess-large'), + 'blooms': ('8x32', + '16x32'), + 'privacy': ('eps_small', + 'eps_chrome') } # From bfb257ed001ba4e6a08cfea66e1c559593338d4d Mon Sep 17 00:00:00 2001 From: Ananth Raghunathan Date: Thu, 25 Jun 2015 12:14:23 -0700 Subject: [PATCH 33/67] Now considering both experiments. Fixing small changes. --- assoctest.sh | 4 ++-- tests/regtest_spec.py | 13 +++++++------ 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/assoctest.sh b/assoctest.sh index 3cbd2f8b..7a4ef8a3 100755 --- a/assoctest.sh +++ b/assoctest.sh @@ -168,7 +168,7 @@ _run-one-instance() { inp = dict(); \ inp['maps'] = ['$case_dir/case_map1.csv',\ '$case_dir/case_map2.csv']; \ - inp['reports'] = '$instance_dir/reports.csv'; \ + inp['reports'] = '$instance_dir/case_out.csv'; \ inp['truefile'] = '$instance_dir/case.csv'; \ inp['outdir'] = '$out_dir'; \ inp['params'] = '$case_dir/case_params.csv'; \ @@ -180,7 +180,7 @@ _run-one-instance() { inp['counts'] = ['$instance_dir/case_2way.csv',\ '$instance_dir/case_marg1.csv',\ '$instance_dir/case_marg2.csv']; \ - inp['expt'] = 'external-counts'; \ + inp['expt'] = ['external-counts', 'external-reports-em']; \ json.dump(inp, f); \ f.close();" diff --git a/tests/regtest_spec.py b/tests/regtest_spec.py index 68599cf4..d029798f 100755 --- a/tests/regtest_spec.py +++ b/tests/regtest_spec.py @@ -127,13 +127,14 @@ # The test config runs a test suite that is the cross product of all the above # sets ASSOC_TEST_CONFIG = { - 'distr': ('fizz-tiny', + 'distr': ( + 'fizz-tiny', 'fizz-tiny-bool', -# 'fizz-small', -# 'fizz-small-bool', -# 'fizz', -# 'fizz-bool', - 'toy',), + 'fizz-small', + 'fizz-small-bool', + 'fizz', + 'fizz-bool',), +# 'toy',), # 'compact-noextra-small', # 'loose-noextra-small', # 'compact-noextra-large', From 9fd30deeb3b42e2f5c76bd22ebc2e5dc154ef207 Mon Sep 17 00:00:00 2001 From: Ananth Raghunathan Date: Thu, 25 Jun 2015 16:01:39 -0700 Subject: [PATCH 34/67] Small changes to assoc suite. --- analysis/R/decode.R | 2 +- tests/analyze_assoc_expt.R | 2 +- tests/make_summary_assoc.py | 11 ++++++++--- tests/regtest_spec.py | 16 +++++++++------- 4 files changed, 19 insertions(+), 12 deletions(-) diff --git a/analysis/R/decode.R b/analysis/R/decode.R index c84a23dd..6e0522ac 100644 --- a/analysis/R/decode.R +++ b/analysis/R/decode.R @@ -383,7 +383,7 @@ Decode <- function(counts, map, params, quick = FALSE, alpha = 0.05, # Only select coefficients more than two standard deviations from 0. May # inflate empirical SD of the estimates. - reported <- which(coefs_ave > 1E-6 + 2 * coefs_ssd) + reported <- which(coefs_ave > 1E-6 + 1 * coefs_ssd) mod <- list(coefs = coefs_ave[reported], stds = coefs_ssd[reported]) diff --git a/tests/analyze_assoc_expt.R b/tests/analyze_assoc_expt.R index 77fc7df3..88f11540 100755 --- a/tests/analyze_assoc_expt.R +++ b/tests/analyze_assoc_expt.R @@ -369,7 +369,7 @@ ExternalCounts <- function(inp) { found_strings <- lapply(1:2, function(i) Decode(counts[[i + 1]], map[[i]]$rmap, - params, quick = FALSE)$fit$strings) + params, quick = TRUE)$fit$strings) if (length(found_strings[[1]]) == 0 || length(found_strings[[2]]) == 0) { print("FOUND_STRINGS") print(found_strings) diff --git a/tests/make_summary_assoc.py b/tests/make_summary_assoc.py index 5cf29fe0..665ef9f9 100755 --- a/tests/make_summary_assoc.py +++ b/tests/make_summary_assoc.py @@ -157,7 +157,7 @@ def ExtractTime(log_filename): return None -def ParseMetrics(metrics_file, log_file): +def ParseMetrics(metrics_file, log_file, italics = False): """Processes the metrics file. Args: @@ -200,7 +200,11 @@ def ParseMetrics(metrics_file, log_file): } # return metrics formatted as HTML table entries - return (metrics_row_dict, + if(italics == True): + return (metrics_row_dict, + ' '.join('' % cell for cell in metrics_row_str)) + else: + return (metrics_row_dict, ' '.join('' % cell for cell in metrics_row_str)) @@ -326,7 +330,8 @@ def main(argv): # Printing metrics 2 if available metrics_file = os.path.join(report_dir, 'metrics_2.csv') if (os.path.isfile(metrics_file)): - metrics_dict, metrics_html = ParseMetrics(metrics_file, log_file) + metrics_dict, metrics_html = ParseMetrics(metrics_file, log_file, + italics = True) print '{}{}'.format(ParseSpecFile(spec_file, empty = True), metrics_html) diff --git a/tests/regtest_spec.py b/tests/regtest_spec.py index d029798f..db8a8566 100755 --- a/tests/regtest_spec.py +++ b/tests/regtest_spec.py @@ -131,9 +131,9 @@ 'fizz-tiny', 'fizz-tiny-bool', 'fizz-small', - 'fizz-small-bool', - 'fizz', - 'fizz-bool',), + 'fizz-small-bool',), +# 'fizz', +# 'fizz-bool',), # 'toy',), # 'compact-noextra-small', # 'loose-noextra-small', @@ -147,10 +147,12 @@ # 'loose-excess-small', # 'compact-excess-large', # 'loose-excess-large'), - 'blooms': ('8x32', - '16x32'), - 'privacy': ('eps_small', - 'eps_chrome') + 'blooms': ( + '8x32', + '16x32',), + 'privacy': ( + 'eps_small', + 'eps_chrome',) } # From 673348699a7a186934088d28b523e0a910c529f2 Mon Sep 17 00:00:00 2001 From: Ananth Raghunathan Date: Mon, 29 Jun 2015 11:04:58 -0700 Subject: [PATCH 35/67] Adding analysis/R/alternative.R. --- analysis/R/alternative.R | 109 ++++++++++++++++++--------------------- 1 file changed, 50 insertions(+), 59 deletions(-) diff --git a/analysis/R/alternative.R b/analysis/R/alternative.R index 3f0e66d3..d7869439 100644 --- a/analysis/R/alternative.R +++ b/analysis/R/alternative.R @@ -1,83 +1,74 @@ # Copyright 2014 Google Inc. All rights reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -library(limSolve) -library(Matrix) +# alternative.R +# +# This is some messy code to test out alternative regression using pcls(). -# The next two functions create a matrix (G) and a vector (H) encoding -# linear inequality constraints that a solution vector (x) must satisfy: -# G * x >= H +library(mgcv) -# Currently represent three sets of constraints on the solution vector: -# - all solution coefficients are nonnegative -# - the sum total of all solution coefficients is no more than 1 -# - in each of the coordinates of the target vector (estimated Bloom filter) -# we don't overshoot by more than three standard deviations. -MakeG <- function(n, X) { - d <- Diagonal(n) - last <- rep(-1, n) - rbind2(rbind2(d, last), -X) -} -MakeH <- function(n, Y, stds) { - # set the floor at 0.01 to avoid degenerate cases - YY <- apply(Y + 3 * stds, # in each bin don't overshoot by more than 3 stds - 1:2, - function(x) min(1, max(0.01, x))) # clamp the bound to [0.01,1] +# uniform vector +makep = function(n) { + rep(1, n) / (n+1) +} - c(rep(0, n), # non-negativity condition - -1, # coefficients sum up to no more than 1 - -as.vector(t(YY)) # t is important! - ) +# diagonal matrix with -1 +makeAin = function(n) { + d = diag(x=1, n, n) + last = rep(-1, n) + rbind(d, last) } -MakeLseiModel <- function(X, Y, stds) { - m <- dim(X)[1] - n <- dim(X)[2] +makebin = function(n) { + #ratio = 172318 / 128 + # NOTE: Hard-coded hacks here + ratio = 70000 / 64 + #ratio = 490000 / 64 -# no slack variables for now -# slack <- Matrix(FALSE, nrow = m, ncol = m, sparse = TRUE) -# colnames(slack) <- 1:m -# diag(slack) <- TRUE -# -# G <- MakeG(n + m) -# H <- MakeH(n + m) -# -# G[n+m+1,n:(n+m)] <- -0.1 -# A = cbind2(X, slack) + print("RATIO") + print(ratio) + + c(rep(0, n), -ratio) +} - w <- as.vector(t(1 / stds)) - w_median <- median(w[!is.infinite(w)]) - if(is.na(w_median)) # all w are infinite - w_median <- 1 - w[w > w_median * 2] <- w_median * 2 - w <- w / mean(w) +makeM = function(X,Y) { + n=dim(X)[2] + p = makep(n) + Ain = makeAin(n) + bin = makebin(n) - list(# coerce sparse Boolean matrix X to sparse numeric matrix - A = Diagonal(x = w) %*% (X + 0), - B = as.vector(t(Y)) * w, # transform to vector in the row-first order - G = MakeG(n, X), - H = MakeH(n, Y, stds), - type = 2) # Since there are no equality constraints, lsei defaults to - # solve.QP anyway, but outputs a warning unless type == 2. + list(X=as.matrix(X), + p=p, + off=array(0,0), + S=list(), + Ain=Ain, + bin=bin, + C=matrix(0,0,0), + sp=array(0,0), + y=Y, + w=rep(1, length(Y)) ) } # CustomLM(X, Y) -ConstrainedLinModel <- function(X,Y) { - model <- MakeLseiModel(X, Y$estimates, Y$stds) - coefs <- do.call(lsei, model)$X - names(coefs) <- colnames(X) +newLM = function(X,Y) { + M = makeM(X,Y) + coefs = pcls(M) + + print("SUM(coefs)") + print(sum(coefs)) + + return(coefs) +} - coefs -} \ No newline at end of file From 37a39ef86c5ba89b4b7b2315d4dd3602b2353cb8 Mon Sep 17 00:00:00 2001 From: Ananth Raghunathan Date: Mon, 29 Jun 2015 16:01:00 -0700 Subject: [PATCH 36/67] Merged changes from drop-QP branch. Code now uses only LASSO for association analysis. --- analysis/R/alternative.R | 74 -------------- analysis/R/analysis_tool.R | 31 +++--- analysis/R/association.R | 19 ++-- analysis/R/decode.R | 196 ++++++++++++++++++++----------------- analysis/R/decode_test.R | 146 +++++++++++++-------------- analysis/R/read_input.R | 5 +- analysis/R/simulation.R | 2 +- analysis/R/test.sh | 2 + analysis/R/util.R | 20 ++++ regtest.sh | 63 ++++++++---- setup.sh | 8 +- tests/analyze.R | 13 +-- tests/analyze_assoc_expt.R | 2 +- tests/gen_counts_test.R | 17 ++-- tests/user_spec.py | 116 ++++++++++++++++++++++ 15 files changed, 405 insertions(+), 309 deletions(-) delete mode 100644 analysis/R/alternative.R create mode 100644 analysis/R/util.R create mode 100755 tests/user_spec.py diff --git a/analysis/R/alternative.R b/analysis/R/alternative.R deleted file mode 100644 index d7869439..00000000 --- a/analysis/R/alternative.R +++ /dev/null @@ -1,74 +0,0 @@ -# Copyright 2014 Google Inc. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# alternative.R -# -# This is some messy code to test out alternative regression using pcls(). - -library(mgcv) - - -# uniform vector -makep = function(n) { - rep(1, n) / (n+1) -} - -# diagonal matrix with -1 -makeAin = function(n) { - d = diag(x=1, n, n) - last = rep(-1, n) - rbind(d, last) -} - -makebin = function(n) { - #ratio = 172318 / 128 - # NOTE: Hard-coded hacks here - ratio = 70000 / 64 - #ratio = 490000 / 64 - - print("RATIO") - print(ratio) - - c(rep(0, n), -ratio) -} - -makeM = function(X,Y) { - n=dim(X)[2] - p = makep(n) - Ain = makeAin(n) - bin = makebin(n) - - list(X=as.matrix(X), - p=p, - off=array(0,0), - S=list(), - Ain=Ain, - bin=bin, - C=matrix(0,0,0), - sp=array(0,0), - y=Y, - w=rep(1, length(Y)) ) -} - -# CustomLM(X, Y) -newLM = function(X,Y) { - M = makeM(X,Y) - coefs = pcls(M) - - print("SUM(coefs)") - print(sum(coefs)) - - return(coefs) -} - diff --git a/analysis/R/analysis_tool.R b/analysis/R/analysis_tool.R index b871b2d5..b0acf2df 100755 --- a/analysis/R/analysis_tool.R +++ b/analysis/R/analysis_tool.R @@ -15,12 +15,12 @@ # days for weekly and 28 days for monthly analyses. library(optparse) +library(RJSONIO) source("analysis/R/analysis_lib.R") source("analysis/R/read_input.R") source("analysis/R/decode.R") - -source("analysis/R/alternative.R") +source("analysis/R/util.R") options(stringsAsFactors = FALSE) @@ -58,13 +58,6 @@ if (!interactive()) { opts <- parse_args(OptionParser(option_list = option_list)) } -# NOTE: This is in tests/analysis.R too -Log <- function(...) { - cat('rappor_analysis.R: ') - cat(sprintf(...)) - cat('\n') -} - # Handle the case of redundant cohorts, i.e. the counts file needs to be # further aggregated to obtain counts for the number of cohorts specified in # the config file. @@ -107,27 +100,27 @@ RunOne <- function(opts) { fit <- res$fit - results_path <- file.path(opts$output_dir, 'results.csv') - write.csv(fit, file = results_path, row.names = FALSE) + # Write analysis results as CSV. + results_csv_path <- file.path(opts$output_dir, 'results.csv') + write.csv(fit, file = results_csv_path, row.names = FALSE) + + # Write summary as JSON (scalar values). + metrics_json_path <- file.path(opts$output_dir, 'metrics.json') + m <- toJSON(res$metrics) + writeLines(m, con = metrics_json_path) # TODO: # - These are in an 2 column 'parameters' and 'values' format. Should these # just be a plain list? - # - Write them to another CSV file or JSON on stdout? - - Log("Fit summary:") - print(res$summary) - cat("\n") + # - Should any of these privacy params be in metrics.json? Log("Privacy summary:") print(res$privacy) cat("\n") # Output metrics as machine-parseable prefix + JSON. - num_rappor <- nrow(fit) - allocated_mass <- sum(fit$proportion) Log('__OUTPUT_METRICS__ {"num_rappor": %d, "allocated_mass": %f}', - num_rappor, allocated_mass) + res$metrics$num_detected, res$metrics$allocated_mass) Log('DONE') } diff --git a/analysis/R/association.R b/analysis/R/association.R index d50bd490..393b0e3a 100644 --- a/analysis/R/association.R +++ b/analysis/R/association.R @@ -39,7 +39,7 @@ GetOtherProbs <- function(counts, map, marginal, params) { p <- params$p # List of known strings that were measured in the marginal. - candidate_strings <- marginal$strings + candidate_strings <- marginal$string # Counts to remove from each cohort. top_counts <- ceiling(marginal$proportion * N / params$m) @@ -421,19 +421,20 @@ ComputeDistributionEM <- function(reports, report_cohorts, joint_conditional = NULL found_strings <- list() cd_for_reports <- list() - + for (j in (1:num_variables)) { ptm <- proc.time() variable_report <- reports[[j]] variable_cohort <- report_cohorts[[j]] map <- maps[[j]] - + # Compute the probability of the "other" category variable_counts <- NULL if (is.null(marginals)) { ptm2 <- proc.time() variable_counts <- ComputeCounts(variable_report, variable_cohort, params) - marginal <- Decode(variable_counts, map$rmap, params, quick)$fit + marginal <- Decode(variable_counts, map$rmap, params, quick, + quiet = TRUE)$fit print("TIME IN MARGINALS") print(proc.time() - ptm2) if (nrow(marginal) == 0) { @@ -442,7 +443,7 @@ ComputeDistributionEM <- function(reports, report_cohorts, } else { marginal <- marginals[[j]] } - found_strings[[j]] <- marginal$strings + found_strings[[j]] <- marginal$string if (ignore_other) { prob_other <- vector(mode = "list", length = params$m) @@ -466,7 +467,7 @@ ComputeDistributionEM <- function(reports, report_cohorts, prob_other[[idx]]) rep }) - + if(new_alg) { # Report conditional distributions as lists if (j == 1) { @@ -485,7 +486,7 @@ ComputeDistributionEM <- function(reports, report_cohorts, print("TIME IN COND_REPORT_DIST") print(proc.time()-ptm) } - + ptm <- proc.time() # Run expectation maximization to find joint distribution if (new_alg) { @@ -498,7 +499,7 @@ ComputeDistributionEM <- function(reports, report_cohorts, print("TIME IN EM") print(proc.time() - ptm) dimnames(em$est) <- found_strings - + # Return results in a usable format list(orig = list(fit = em$est, sd = em$sd, em = em)) -} \ No newline at end of file +} diff --git a/analysis/R/decode.R b/analysis/R/decode.R index 6e0522ac..2d8af344 100644 --- a/analysis/R/decode.R +++ b/analysis/R/decode.R @@ -16,8 +16,7 @@ # This library implements the RAPPOR marginal decoding algorithms using LASSO. library(glmnet) - -source('analysis/R/alternative.R') +library(limSolve) Estimate2WayBloomCounts <- function(params, obs_counts) { p <- params$p @@ -25,20 +24,20 @@ Estimate2WayBloomCounts <- function(params, obs_counts) { f <- params$f m <- params$m k <- params$k - + stopifnot(m == nrow(obs_counts), params$k + 1 == ncol(obs_counts)) - + p11 <- q * (1 - f/2) + p * f / 2 # probability of a true 1 reported as 1 p01 <- p * (1 - f/2) + q * f / 2 # probability of a true 0 reported as 1 p10 <- 1 - p11 # probability of a true 1 reported as 0 p00 <- 1 - p01 # probability of a true 0 reported as 0 - + NoiseMatrix <- matrix(rep(0, 16), 4) NoiseMatrix[1,] <- c(p11**2, p11*p10, p10*p11, p10**2) NoiseMatrix[2,] <- c(p11*p01, p11*p00, p10*p01, p10*p00) NoiseMatrix[3,] <- c(p01*p11, p01*p10, p00*p11, p00*p01) NoiseMatrix[4,] <- c(p01**2, p00*p01, p01*p00, p00**2) - + ests <- apply(obs_counts, 1, function(x) { N <- x[1] inds <- seq(0, (k/4)-1) @@ -47,7 +46,7 @@ Estimate2WayBloomCounts <- function(params, obs_counts) { as.vector(t(Solve(NoiseMatrix)) %*% v[(i*4 + 1):((i+1)*4)]) }) }) - + if(FALSE) { # TODO(pseudorandom): Compute variances variances <- apply(obs_counts, 1, function(x) { @@ -59,17 +58,17 @@ Estimate2WayBloomCounts <- function(params, obs_counts) { N * r * (1 - r) / p2^2 # variance of the binomial }) } - + # Transform counts from absolute values to fractional, removing bias due to # variability of reporting between cohorts. ests <- apply(ests, 1, function(x) x / obs_counts[,1]) # stds <- apply(variances^.5, 1, function(x) x / obs_counts[,1]) - + # Some estimates may be set to infinity, e.g. if f=1. We want to # account for this possibility, and set the corresponding counts # to 0. ests[abs(ests) == Inf] <- 0 - + list(estimates = ests, stds = matrix(rep(100, length(ests[,1]) * length(ests[1,])), length(ests[,1]))) @@ -95,7 +94,7 @@ EstimateBloomCounts <- function(params, obs_counts) { # Output: # ests: a matrix of size m by k with estimated counts for the probability # of each bit set to 1 in the true Bloom filter. - # std: standard deviation of the estimates. + # stds: standard deviation of the estimates. p <- params$p q <- params$q @@ -110,22 +109,23 @@ EstimateBloomCounts <- function(params, obs_counts) { p2 <- p11 - p01 # == (1 - f) * (q - p) ests <- apply(obs_counts, 1, function(x) { - N <- x[1] # sample size for the cohort - v <- x[-1] # counts for individual bits - (v - p01 * N) / p2 # unbiased estimator for individual bits' true counts - # It can be negative or exceed the total. - }) + N <- x[1] # sample size for the cohort + v <- x[-1] # counts for individual bits + (v - p01 * N) / p2 # unbiased estimator for individual bits' + # true counts. It can be negative or + # exceed the total. + }) total <- sum(obs_counts[,1]) variances <- apply(obs_counts, 1, function(x) { - N <- x[1] - v <- x[-1] - p_hats <- (v - p01 * N) / (N * p2) # expectation of a true 1 - p_hats <- pmax(0, pmin(1, p_hats)) # clamp to [0,1] - r <- p_hats * p11 + (1 - p_hats) * p01 # expectation of a reported 1 - N * r * (1 - r) / p2^2 # variance of the binomial - }) + N <- x[1] + v <- x[-1] + p_hats <- (v - p01 * N) / (N * p2) # expectation of a true 1 + p_hats <- pmax(0, pmin(1, p_hats)) # clamp to [0,1] + r <- p_hats * p11 + (1 - p_hats) * p01 # expectation of a reported 1 + N * r * (1 - r) / p2^2 # variance of the binomial + }) # Transform counts from absolute values to fractional, removing bias due to # variability of reporting between cohorts. @@ -145,26 +145,32 @@ FitLasso <- function(X, Y, intercept = TRUE) { # # Input: # X: a design matrix of size km by M (the number of candidate strings). - # Y: a vector of size km with estimated counts from EstimateBloomCounts(). + # Y: a vector of size km with estimated counts from EstimateBloomCounts(), + # representing constraints # intercept: whether to fit with intercept or not. # # Output: # a vector of size ncol(X) of coefficients. # TODO(mironov): Test cv.glmnet instead of glmnet - mod <- try(glmnet(X, Y, standardize = FALSE, intercept = intercept, - lower.limits = 0, - pmax = min(500, length(Y) * .8)), - silent = TRUE) - - # If fitting fails, return an empty data.frame. - if (class(mod)[1] == "try-error") { - coefs <- setNames(rep(0, ncol(X)), colnames(X)) - } else { - coefs <- coef(mod) - coefs <- coefs[-1, ncol(coefs), drop = FALSE] # coefs[1] is the intercept - } - coefs + + # Cap the number of non-zero coefficients to 500 or 80% of the number of + # constraints, whichever is less. The 500 cap is for performance reasons, 80% + # is to avoid overfitting. + cap <- min(500, nrow(X) * .8, ncol(X)) + + mod <- glmnet(X, Y, standardize = FALSE, intercept = intercept, + lower.limits = 0, # outputs are non-negative + pmax = cap) + + coefs <- coef(mod) + coefs <- coefs[-1, , drop = FALSE] # drop the intercept + l1cap <- sum(colSums(coefs) <= 1.0) # find all columns with L1 norm <= 1 + if(l1cap > 0) + distr <- coefs[, l1cap] # return the last set of coefficients with L1 <= 1 + else + distr <- setNames(rep(0, ncol(X)), colnames(X)) + distr } PerformInference <- function(X, Y, N, mod, params, alpha, correction) { @@ -193,7 +199,7 @@ PerformInference <- function(X, Y, N, mod, params, alpha, correction) { # # 1-sided t-test. # p_values <- pnorm(z_values, lower = FALSE) - fit <- data.frame(String = colnames(X), Estimate = betas, + fit <- data.frame(string = colnames(X), Estimate = betas, SD = mod$stds, # z_stat = z_values, pvalue = p_values, stringsAsFactors = FALSE) @@ -212,7 +218,7 @@ PerformInference <- function(X, Y, N, mod, params, alpha, correction) { fit <- fit[order(fit$Estimate, decreasing = TRUE), ] if (nrow(fit) > 0) { - str_names <- fit$String + str_names <- fit$string str_names <- str_names[!is.na(str_names)] if (length(str_names) > 0 && length(str_names) < nrow(X)) { this_data <- as.data.frame(as.matrix(X[, str_names])) @@ -262,7 +268,7 @@ ComputePrivacyGuarantees <- function(params, alpha, N) { privacy } -FitDistribution <- function(estimates_stds, map) { +FitDistribution <- function(estimates_stds, map, quiet = FALSE) { # Find a distribution over rows of map that approximates estimates_stds best # # Input: @@ -275,40 +281,24 @@ FitDistribution <- function(estimates_stds, map) { # according to this vector approximates estimates S <- ncol(map) # total number of candidates + lasso <- FitLasso(map, as.vector(t(estimates_stds$estimates))) - support_coefs <- 1:S + if(!quiet) + cat("LASSO selected ", sum(lasso > 0), " non-zero coefficients.\n") - if (TRUE) { - # if (S > length(estimates_stds$estimates) * .8) { - # the system is close to being underdetermined - lasso <- FitLasso(map, as.vector(t(estimates_stds$estimates))) - - # Select non-zero coefficients. - support_coefs <- which(lasso > 0) - cat("LASSO selected ", length(support_coefs), " coefficients in support.\n") - } - - coefs <- setNames(rep(0, S), colnames(map)) - - if(length(support_coefs) > 0) { # LASSO may return an empty list - constrained_coefs <- ConstrainedLinModel(map[, support_coefs, drop = FALSE], - estimates_stds) - - coefs[support_coefs] <- constrained_coefs - } - - coefs -} + names(lasso) <- colnames(map) + lasso + } Resample <- function(e) { - result <- e - - result$estimates <- matrix(mapply(function(x, y) x + rnorm(1, 0, y), + # Simulate resampling of the Bloom filter estimates by adding Gaussian noise + # with estimated standard deviation. + estimates <- matrix(mapply(function(x, y) x + rnorm(1, 0, y), e$estimates, e$stds), nrow = nrow(e$estimates), ncol = ncol(e$estimates)) - result$stds <- e$stds * 2^.5 + stds <- e$stds * 2^.5 - result + list(estimates = estimates, stds = stds) } Decode2Way <- function(counts, map, params) { @@ -318,17 +308,17 @@ Decode2Way <- function(counts, map, params) { f <- params$f h <- params$h m <- params$m - + S <- ncol(map) # total number of candidates - + N <- sum(counts[, 1]) - + filter_cohorts <- which(counts[, 1] != 0) # exclude cohorts with zero reports - + # stretch cohorts to bits filter_bits <- as.vector( t(matrix(1:nrow(map), nrow = m, byrow = TRUE)[filter_cohorts,])) - + es <- Estimate2WayBloomCounts(params, counts) e <- list(estimates = es$estimates[filter_cohorts, , drop = FALSE], stds = es$stds[filter_cohorts, , drop = FALSE]) @@ -342,7 +332,7 @@ Decode2Way <- function(counts, map, params) { } Decode <- function(counts, map, params, quick = FALSE, alpha = 0.05, - correction = c("Bonferroni"), ...) { + correction = c("Bonferroni"), quiet = FALSE, ...) { k <- params$k p <- params$p q <- params$q @@ -367,6 +357,8 @@ Decode <- function(counts, map, params, quick = FALSE, alpha = 0.05, stds = es$stds[filter_cohorts, , drop = FALSE]) coefs_all <- vector() + # Run the fitting procedure several times (5 seems to be sufficient and not + # too many) to estimate standard deviation of the output. if(quick) {num_reps <- 2} else {num_reps <- 5} for(r in 1:num_reps) { @@ -374,20 +366,21 @@ Decode <- function(counts, map, params, quick = FALSE, alpha = 0.05, e <- Resample(estimates_stds_filtered) else e <- estimates_stds_filtered - + coefs_all <- rbind(coefs_all, - FitDistribution(e, map[filter_bits, , drop = FALSE])) + FitDistribution(e, map[filter_bits, , drop = FALSE], + quiet)) } coefs_ssd <- N * apply(coefs_all, 2, sd) # compute sample standard deviations coefs_ave <- N * apply(coefs_all, 2, mean) - + # Only select coefficients more than two standard deviations from 0. May # inflate empirical SD of the estimates. reported <- which(coefs_ave > 1E-6 + 1 * coefs_ssd) - + mod <- list(coefs = coefs_ave[reported], stds = coefs_ssd[reported]) -# Old code ... +# Old code ... # coefs_all <- FitDistribution(estimates_stds_filtered, # map[filter_bits, , drop = FALSE]) # reported <- which(coefs_all > 1E-6) @@ -410,25 +403,38 @@ Decode <- function(counts, map, params, quick = FALSE, alpha = 0.05, # Estimates from the model are per instance so must be multipled by h. # Standard errors are also adjusted. - fit$Total_Est <- floor(fit$Estimate) - fit$Total_SD <- floor(fit$SD) - fit$Prop <- fit$Total_Est / N - fit$LPB <- fit$Prop - 1.96 * fit$Total_SD / N - fit$UPB <- fit$Prop + 1.96 * fit$Total_SD / N + fit$estimate <- floor(fit$Estimate) + fit$proportion <- fit$estimate / N + + fit$std_error <- floor(fit$SD) + fit$prop_std_error <- fit$std_error / N + + # 1.96 standard deviations gives 95% confidence interval. + fit$prop_low_95 <- fit$proportion - 1.96 * fit$prop_std_error + fit$prop_high_95 <- fit$proportion + 1.96 * fit$prop_std_error - fit <- fit[, c("String", "Total_Est", "Total_SD", "Prop", "LPB", "UPB")] - colnames(fit) <- c("strings", "estimate", "std_dev", "proportion", - "lower_bound", "upper_bound") + fit <- fit[, c("string", "estimate", "std_error", "proportion", + "prop_std_error", "prop_low_95", "prop_high_95")] + + allocated_mass <- sum(fit$proportion) + num_detected <- nrow(fit) + + ss <- round(inf$SS, digits = 3) + explained_var <- ss[[1]] + missing_var <- ss[[2]] + noise_var <- ss[[3]] + + noise_std_dev <- round(inf$resid_sigma, digits = 3) # Compute summary of the fit. - parameters = + parameters <- c("Candidate strings", "Detected strings", "Sample size (N)", "Discovered Prop (out of N)", "Explained Variance", "Missing Variance", "Noise Variance", "Theoretical Noise Std. Dev.") - values <- c(S, nrow(fit), N, round(sum(fit[, 2]) / N, 3), - round(inf$SS, 3), - round(inf$resid_sigma, 3)) + values <- c(S, num_detected, N, allocated_mass, + explained_var, missing_var, noise_var, noise_std_dev) + res_summary <- data.frame(parameters = parameters, values = values) privacy <- ComputePrivacyGuarantees(params, alpha, N) @@ -436,9 +442,17 @@ Decode <- function(counts, map, params, quick = FALSE, alpha = 0.05, c("k", "h", "m", "p", "q", "f", "N", "alpha"), values = c(k, h, m, p, q, f, N, alpha)) + # This is a list of decode stats in a better format than 'summary'. + # TODO: Delete summary. + metrics <- list(sample_size = N, + allocated_mass = allocated_mass, + num_detected = num_detected, + explained_var = explained_var, + missing_var = missing_var) + list(fit = fit, summary = res_summary, privacy = privacy, params = params, lasso = NULL, ests = as.vector(t(estimates_stds_filtered$estimates)), - counts = counts[, -1], resid = NULL) + counts = counts[, -1], resid = NULL, metrics = metrics) } ComputeCounts <- function(reports, cohorts, params) { diff --git a/analysis/R/decode_test.R b/analysis/R/decode_test.R index fe74e691..71db7aa5 100755 --- a/analysis/R/decode_test.R +++ b/analysis/R/decode_test.R @@ -22,20 +22,21 @@ source('tests/gen_counts.R') L1Distance <- function(X, Y) { # Computes the L1 distance between two named vectors common <- intersect(names(X), names(Y)) - union <- rbind(X[common], Y[common]) - (sum(abs(union[1,]-union[2,])) + sum(X[!names(X) %in% common]) - + sum(Y[!names(Y) %in% common])) / 2 + L1_intersect <- sum(abs(X[common] - Y[common])) + L1_X_minus_Y <- sum(X[!names(X) %in% common]) + L1_Y_minus_X <- sum(Y[!names(Y) %in% common]) + + (L1_intersect + L1_X_minus_Y + L1_Y_minus_X) / 2 } LInfDistance <- function(X, Y) { - # Computes the L1 distance between two named vectors + # Computes the L_infinity distance between two named vectors common <- intersect(names(X), names(Y)) - union <- rbind(X[common], Y[common]) - max(abs(union[1,]-union[2,]), - X[!names(X) %in% common], - Y[!names(Y) %in% common]) + max(abs(X[common] - Y[common]), + abs(X[!names(X) %in% common]), + abs(Y[!names(Y) %in% common])) } MatrixVectorMerge <- function(mat, vec) { @@ -63,39 +64,40 @@ MatrixVectorMerge <- function(mat, vec) { } RunMultipleTests <- function(title, fun, repetitions, ...) { - # Run a function with an annotated progress indicator + # Run a function with an annotated progress indicator. The function's outputs + # are concatenated and returned as a list of length repetitions. cat(title, ": ") if(repetitions == 1) { # only run once - fun(...) + results <- list(fun(...)) - cat(" Done.") - } - else { # run multiple times + cat(" Done.\n") + } else { # run multiple times pb <- txtProgressBar(min = 0, max = repetitions, width = getOption("width") - 20 - nchar(title)) + results <- vector(mode = "list", repetitions) for(i in 1:repetitions) { setTxtProgressBar(pb, i) - fun(...) + results[[i]] <- fun(...) } cat(" Done.") close(pb) } + + results } -TestEstimatesAndStdsHelper <- function(params, map, pdf, total) { +CheckEstimatesAndStdsHelper <- function(params, map, pdf, total) { # Helper function for TestEstimateBloomCounts. partition <- RandomPartition(total, pdf) counts <- GenerateCounts(params, map, partition, 1) - e <- EstimateBloomCounts(params, counts) - results$estimates <<- abind(results$estimates, e$estimates, along = 3) - results$stds <<- abind(results$stds, e$stds, along = 3) + EstimateBloomCounts(params, counts) } -TestEstimatesAndStds <- function(repetitions, title, params, map, pdf, total) { +CheckEstimatesAndStds <- function(repetitions, title, params, map, pdf, total) { # Checks that the expectations returned by EstimateBloomCounts on simulated # inputs match the ground truth and the empirical standard deviation matches # EstimateBloomCounts outputs. @@ -108,14 +110,16 @@ TestEstimatesAndStds <- function(repetitions, title, params, map, pdf, total) { # pdf: probability density function of the distribution from which simulated # clients are sampled # total: number of reports - results <<- c(estimates = list(), stds = list()) - RunMultipleTests(title, TestEstimatesAndStdsHelper, repetitions, - params, map, pdf, total) + results <- RunMultipleTests(title, CheckEstimatesAndStdsHelper, repetitions, + params, map, pdf, total) - ave_e <- apply(results$estimates,1:2, mean) - observed_stds <- apply(results$estimates,1:2, sd) - ave_stds <- apply(results$stds,1:2, mean) + estimates <- abind(lapply(results, function(r) r$estimates), along = 3) + stds <- abind(lapply(results, function(r) r$stds), along = 3) + + ave_e <- apply(estimates, 1:2, mean) + observed_stds <- apply(estimates, 1:2, sd) + ave_stds <- apply(stds, 1:2, mean) ground_truth <- matrix(map %*% pdf, nrow = params$m, byrow = TRUE) @@ -148,12 +152,12 @@ TestEstimateBloomCounts <- function() { noise0 <- list(p = 0, q = 1, f = 0) # no noise at all - TestEstimatesAndStds(repetitions = 1000, "Testing estimates and stds (1/3)", - c(report4x2, noise0), map0, pdf0, 100) + CheckEstimatesAndStds(repetitions = 1000, "Testing estimates and stds (1/3)", + c(report4x2, noise0), map0, pdf0, 100) noise1 <- list(p = 0.4, q = .6, f = 0.5) - TestEstimatesAndStds(repetitions = 1000, "Testing estimates and stds (2/3)", - c(report4x2, noise1), map0, pdf0, 100) + CheckEstimatesAndStds(repetitions = 1000, "Testing estimates and stds (2/3)", + c(report4x2, noise1), map0, pdf0, 100) # MEDIUM TEST: 100 values, 32 cohorts, 8 bits each, 10^6 reports values <- 100 @@ -166,51 +170,50 @@ TestEstimateBloomCounts <- function() { pdf1 <- ComputePdf("zipf1", values) - TestEstimatesAndStds(repetitions = 100, "Testing estimates and stds (3/3)", - c(report8x32, noise1), map1, pdf1, 10^9) + CheckEstimatesAndStds(repetitions = 100, "Testing estimates and stds (3/3)", + c(report8x32, noise1), map1, pdf1, 10^9) } -TestDecodeHelper <- function(params, map, pdf, num_clients, +CheckDecodeHelper <- function(params, map, pdf, num_clients, tolerance_l1, tolerance_linf) { # Helper function for TestDecode. Simulates a RAPPOR run and checks results of - # Decode's output against the ground truth. Results are appended to a global - # list. + # Decode's output against the ground truth. Output is returned as a list. partition <- RandomPartition(num_clients, pdf) counts <- GenerateCounts(params, map, partition, 1) total <- sum(partition) - decoded <- Decode(counts, map, params) - - decoded_partition <- setNames(decoded$fit$estimate, decoded$fit$strings) + decoded <- Decode(counts, map, params, quiet = TRUE) - results$estimates <<- MatrixVectorMerge(results$estimates, decoded_partition) - results$stds <<- MatrixVectorMerge(results$stds, - setNames(decoded$fit$std_dev, - decoded$fit$strings)) + decoded_partition <- setNames(decoded$fit$estimate, decoded$fit$string) checkTrue(L1Distance(decoded_partition, partition) < total^.5 * tolerance_l1, "L1 distance is too large") checkTrue(LInfDistance(decoded_partition, partition) < max(partition)^.5 * tolerance_linf, "L_inf distance is too large") + + list(estimates = decoded_partition, + stds = setNames(decoded$fit$std_error, decoded$fit$string)) } -TestDecodeAveAndStds <- function(...) { +CheckDecodeAveAndStds <- function(...) { # Runs Decode multiple times (specified by the repetition argument), checks # individuals runs against the ground truth, and the estimates of the standard # error against empirical observations. - results <<- list(estimates = matrix(nrow = 0, ncol = 0), - stds = matrix(nrow = 0, ncol = 0)) + results <- RunMultipleTests(...) - RunMultipleTests(...) + estimates <- matrix(nrow = 0, ncol = 0) + lapply(results, function(r) MatrixVectorMerge(estimates, r$estimates)) - empirical_stds <- apply(results$estimates, 2, sd, na.rm = TRUE) - estimated_stds <- apply(results$stds, 2, mean, na.rm = TRUE) + stds <- matrix(nrow = 0, ncol = 0) + lapply(results, function(r) MatrixVectorMerge(stds, r$stds)) - if(dim(results$estimates)[1] > 1) - { + empirical_stds <- apply(estimates, 2, sd, na.rm = TRUE) + estimated_stds <- apply(stds, 2, mean, na.rm = TRUE) + + if(dim(estimates)[1] > 1) { checkTrue(any(estimated_stds > empirical_stds / 2), "Our estimate for the standard deviation is too low") @@ -239,22 +242,22 @@ TestDecode <- function() { # match the ground truth. Must be close enough though. noise0 <- list(p = 0, q = 1, f = 0) # no noise whatsoever - TestDecodeAveAndStds("Testing Decode (1/5)", TestDecodeHelper, 100, - c(report4x2, noise0), map0, distribution0, 100, - tolerance_l1 = 5, - tolerance_linf = 3) + CheckDecodeAveAndStds("Testing Decode (1/5)", CheckDecodeHelper, 100, + c(report4x2, noise0), map0, distribution0, 100, + tolerance_l1 = 5, + tolerance_linf = 3) noise1 <- list(p = .4, q = .6, f = .5) # substantial noise, very few reports - TestDecodeAveAndStds("Testing Decode (2/5)", TestDecodeHelper, 100, - c(report4x2, noise1), map0, distribution0, 100, - tolerance_l1 = 20, - tolerance_linf = 20) + CheckDecodeAveAndStds("Testing Decode (2/5)", CheckDecodeHelper, 100, + c(report4x2, noise1), map0, distribution0, 100, + tolerance_l1 = 20, + tolerance_linf = 20) # substantial noise, many reports - TestDecodeAveAndStds("Testing Decode (3/5)", TestDecodeHelper, 100, - c(report4x2, noise1), map0, distribution0, 100000, - tolerance_l1 = 50, - tolerance_linf = 40) + CheckDecodeAveAndStds("Testing Decode (3/5)", CheckDecodeHelper, 100, + c(report4x2, noise1), map0, distribution0, 100000, + tolerance_l1 = 50, + tolerance_linf = 40) # MEDIUM TEST: 100 values, 32 cohorts, 8 bits each, 10^6 reports values <- 100 @@ -267,10 +270,10 @@ TestDecode <- function() { distribution1 <- ComputePdf("zipf1", values) names(distribution1) <- colnames(map1) - TestDecodeAveAndStds("Testing Decode (4/5)", TestDecodeHelper, 100, - c(report8x32, noise1), map1, distribution1, 10^6, - tolerance_l1 = values * 3, - tolerance_linf = 100) + CheckDecodeAveAndStds("Testing Decode (4/5)", CheckDecodeHelper, 100, + c(report8x32, noise1), map1, distribution1, 10^6, + tolerance_l1 = values * 3, + tolerance_linf = 100) # Testing LASSO: 500 values, 32 cohorts, 8 bits each, 10^6 reports values <- 500 @@ -284,16 +287,15 @@ TestDecode <- function() { distribution2 <- ComputePdf("zipf1.5", values) names(distribution2) <- colnames(map2) - TestDecodeAveAndStds("Testing Decode (5/5)", TestDecodeHelper, 1, - c(report8x32, noise1), map2, distribution2, 10^6, - tolerance_l1 = values * 3, - tolerance_linf = 80) - + CheckDecodeAveAndStds("Testing Decode (5/5)", CheckDecodeHelper, 1, + c(report8x32, noise1), map2, distribution2, 10^6, + tolerance_l1 = values * 3, + tolerance_linf = 80) } -TestAll <- function() { +RunAll <- function() { TestEstimateBloomCounts() TestDecode() } -TestAll() +RunAll() diff --git a/analysis/R/read_input.R b/analysis/R/read_input.R index b85a09fd..95ea1b0d 100644 --- a/analysis/R/read_input.R +++ b/analysis/R/read_input.R @@ -102,8 +102,9 @@ ReadMapFile <- function(map_file, params = NULL, quote = "") { } LoadMapFile <- function(map_file, params = NULL, quote = "") { - # Reads the map file and creates an R binary .rda. - # If .rda file already exists, just loads that file. + # Reads the map file and creates an R binary .rda. If the .rda file already + # exists, just loads that file. NOTE: It assumes the map file is + # immutable. rda_file <- sub(".csv", ".rda", map_file, fixed = TRUE) diff --git a/analysis/R/simulation.R b/analysis/R/simulation.R index d7c6e9e1..cd37e74f 100644 --- a/analysis/R/simulation.R +++ b/analysis/R/simulation.R @@ -254,7 +254,7 @@ GenerateSamples <- function(N = 10^5, params, pop_params, alpha = .05, correction = correction) # Add truth column. - fit$fit$Truth <- table(samp)[fit$fit$strings] + fit$fit$Truth <- table(samp)[fit$fit$string] fit$fit$Truth[is.na(fit$fit$Truth)] <- 0 fit$map <- map$map diff --git a/analysis/R/test.sh b/analysis/R/test.sh index 06cf2380..f47eafa3 100755 --- a/analysis/R/test.sh +++ b/analysis/R/test.sh @@ -26,6 +26,8 @@ analysis-tool() { --output_dir _tmp cat _tmp/results.csv + echo + cat _tmp/metrics.json popd } diff --git a/analysis/R/util.R b/analysis/R/util.R new file mode 100644 index 00000000..8679b83d --- /dev/null +++ b/analysis/R/util.R @@ -0,0 +1,20 @@ +#!/usr/bin/Rscript +# +# Common utility library for all R scripts. + +# Log message with timing. Example: +# +# _____ 1.301 My message +# +# The prefix makes it stand out (vs R's print()), and the number is the time so +# far. +# +# NOTE: The shell script log uses hyphens. + +Log <- function(...) { + cat('_____ ') + cat(proc.time()[['elapsed']]) + cat(' ') + cat(sprintf(...)) + cat('\n') +} diff --git a/regtest.sh b/regtest.sh index bfa0a0a0..2430edb1 100755 --- a/regtest.sh +++ b/regtest.sh @@ -144,7 +144,7 @@ _setup-one-case() { # banner "Hashing candidates to get 'map'" analysis/tools/hash_candidates.py \ - $case_dir/case_params.csv \ + $params_path \ < $case_dir/case_candidates.txt \ > $case_dir/case_map.csv } @@ -240,8 +240,6 @@ make-summary() { | sed -e '/TABLE_ROWS/ r rows.html' \ > $filename - rm rows.html - popd >/dev/null log "Wrote $dir/$filename" @@ -273,38 +271,51 @@ _setup-test-instances() { done } +# Print the default number of parallel processes, which is max(#CPUs - 1, 1) +default-processes() { + processors=$(grep -c ^processor /proc/cpuinfo || echo 4) # Linux-specific + if test $processors -gt 1; then # leave one CPU for the OS + processors=$(expr $processors - 1) + fi + echo $processors +} + # Args: -# regexp: A pattern selecting the subset of tests to run +# spec_gen: A program to execute to generate the spec. +# spec_regex: A pattern selecting the subset of tests to run # instances: A number of times each test case is run -# parallel: Whether the tests are run in parallel (T/F) +# parallel: Whether the tests are run in parallel (T/F). Sequential +# runs log to the console; parallel runs log to files. # fast_counts: Whether counts are sampled directly (T/F) -# + _run-tests() { - local spec_regex=$1 # grep -E format on the spec - local instances=$2 - local parallel=$3 - local fast_counts=$4 + local spec_gen=$1 + local spec_regex="$2" # grep -E format on the spec, can be empty + local instances=$3 + local parallel=$4 + local fast_counts=$5 rm -r -f --verbose $REGTEST_DIR mkdir --verbose -p $REGTEST_DIR local func - local processors=1 + local processors if test $parallel = F; then func=_run-one-instance # output to the console + processors=1 else func=_run-one-instance-logged - processors=$(grep -c ^processor /proc/cpuinfo || echo 4) # POSIX-specific - if test $processors -gt 1; then # leave one CPU for the OS - processors=$(expr $processors - 1) - fi + # Let the user override with MAX_PROC, in case they don't have enough + # memory. + processors=${MAX_PROC:-$(default-processes)} log "Running $processors parallel processes" fi local cases_list=$REGTEST_DIR/test-cases.txt - tests/regtest_spec.py | grep -E $spec_regex > $cases_list + # Need -- for regexes that start with - + $spec_gen | grep -E -- "$spec_regex" > $cases_list # Generate parameters for all test cases. cat $cases_list \ @@ -324,13 +335,16 @@ _run-tests() { make-summary $REGTEST_DIR } -# Run tests sequentially +# used for most tests +readonly REGTEST_SPEC=tests/regtest_spec.py + +# Run tests sequentially. NOTE: called by demo.sh. run-seq() { local spec_regex=${1:-'^r-'} # grep -E format on the spec local instances=${2:-1} local fast_counts=${3:-T} - _run-tests $spec_regex $instances F $fast_counts + time _run-tests $REGTEST_SPEC $spec_regex $instances F $fast_counts } # Run tests in parallel @@ -339,15 +353,22 @@ run() { local instances=${2:-1} local fast_counts=${3:-T} - _run-tests $spec_regex $instances T $fast_counts + time _run-tests $REGTEST_SPEC $spec_regex $instances T $fast_counts } -# Run tests in parallel +# Run tests in parallel (7+ minutes on 8 cores) run-all() { local instances=${1:-1} log "Running all tests. Can take a while." - _run-tests '^r-' $instances T T + time _run-tests $REGTEST_SPEC '^r-' $instances T T +} + +run-user() { + local spec_regex=${1:-} + local instances=${2:-1} + local parallel=T # too much memory + time _run-tests tests/user_spec.py "$spec_regex" $instances $parallel T } "$@" diff --git a/setup.sh b/setup.sh index 01e3a16b..729c486f 100755 --- a/setup.sh +++ b/setup.sh @@ -25,15 +25,21 @@ native-packages() { r-packages() { # Install as root so you can write to /usr/local/lib/R. + + # glmnet, limSolve: solvers for decode.R + # RJSONIO: for analysis_tool.R sudo R -e \ - 'install.packages(c("glmnet", "optparse", "limSolve", "RUnit", "abind"), repos="http://cran.rstudio.com/")' + 'install.packages(c("glmnet", "optparse", "RUnit", "abind", "RJSONIO"), repos="http://cran.rstudio.com/")' } # R 3.0.2 on Trusty is out of date with CRAN, so we need this workaround. install-plyr-with-friends() { mkdir -p _tmp + wget --directory _tmp \ + http://cran.r-project.org/src/contrib/Archive/Rcpp/Rcpp_0.11.4.tar.gz wget --directory _tmp \ http://cran.r-project.org/src/contrib/Archive/plyr/plyr_1.8.1.tar.gz + sudo R CMD INSTALL _tmp/Rcpp_0.11.4.tar.gz sudo R CMD INSTALL _tmp/plyr_1.8.1.tar.gz sudo R -e \ 'install.packages(c("reshape2", "ggplot2", "data.table"), repos="http://cran.rstudio.com/")' diff --git a/tests/analyze.R b/tests/analyze.R index fa1e2bf7..9f079405 100755 --- a/tests/analyze.R +++ b/tests/analyze.R @@ -50,14 +50,7 @@ if (library(Cairo, quietly = TRUE, logical.return = TRUE)) { source("analysis/R/analysis_lib.R") source("analysis/R/read_input.R") source("analysis/R/decode.R") - -source("analysis/R/alternative.R") # temporary - -Log <- function(...) { - cat('analyze.R: ') - cat(sprintf(...)) - cat('\n') -} +source("analysis/R/util.R") LoadContext <- function(prefix_case) { # Creates the context, filling it with privacy parameters @@ -126,7 +119,7 @@ CompareRapporVsActual <- function(ctx) { StringToInt <- function(x) as.integer(substring(x, 2)) actual_values <- StringToInt(actual$string) - rappor_values <- StringToInt(rappor$strings) + rappor_values <- StringToInt(rappor$string) # False negatives: AnalyzeRAPPOR failed to find this value (e.g. because it # occurs too rarely) @@ -180,6 +173,8 @@ CompareRapporVsActual <- function(ctx) { Log("False negatives:") str(false_neg) + # NOTE: We should call Decode() directly, and then num_rappor is + # metrics$num_detected, and sum_proportion is metrics$allocated_mass. metrics <- list( num_actual = nrow(actual), # data frames num_rappor = nrow(rappor), diff --git a/tests/analyze_assoc_expt.R b/tests/analyze_assoc_expt.R index 88f11540..dcd48cd3 100755 --- a/tests/analyze_assoc_expt.R +++ b/tests/analyze_assoc_expt.R @@ -369,7 +369,7 @@ ExternalCounts <- function(inp) { found_strings <- lapply(1:2, function(i) Decode(counts[[i + 1]], map[[i]]$rmap, - params, quick = TRUE)$fit$strings) + params, quick = FALSE)$fit[,"string"]) if (length(found_strings[[1]]) == 0 || length(found_strings[[2]]) == 0) { print("FOUND_STRINGS") print(found_strings) diff --git a/tests/gen_counts_test.R b/tests/gen_counts_test.R index 49ad3be5..e91de68e 100755 --- a/tests/gen_counts_test.R +++ b/tests/gen_counts_test.R @@ -51,7 +51,7 @@ TestGenerateCounts <- function() { noise1 <- list(p = .5, q = .5, f = 0) # truly random IRRs counts1 <- GenerateCounts(c(report_params, noise1), map, partition, v) - for(i in 2:4) + for(i in 2:5) for(j in 1:2) pvalues <- c(pvalues, chisq.test(c(counts1[j,1] - counts1[j,i], counts1[j,i]), @@ -64,14 +64,13 @@ TestGenerateCounts <- function() { counts2 <- counts2 / v - for(i in 2:4) + for(i in 2:5) for(j in 1:2) pvalues <- c(pvalues, chisq.test(c(counts2[j,1] - counts2[j,i], counts2[j,i]), p = c(.5, .5))$p.value) - checkTrue(min(pvalues) > 1E-9 && max(pvalues) < 1 - 1E-9, - "Chi-squared test failed") + checkTrue(min(pvalues) > 1E-9, "Chi-squared test failed") } TestRandomPartition <- function() { @@ -97,14 +96,14 @@ TestRandomPartition <- function() { p5 <- RandomPartition(total = 1000000, c(1, 2, 3, 4)) p.value <- chisq.test(p5, p = c(.1, .2, .3, .4))$p.value - # Apply the chi squared test and fail if p.value is too high or too low. - # Probability of failure is 2 * 1E-9, which should never happen. - checkTrue((p.value > 1E-9) && (p.value < 1 - 1E-9)) + # Apply the chi squared test and fail if p.value is too low. + # Probability of failure is 1E-9, which should never happen. + checkTrue(p.value < 1 - 1E-9) } -TestAll <- function(){ +CheckAll <- function(){ TestRandomPartition() TestGenerateCounts() } -TestAll() \ No newline at end of file +CheckAll() \ No newline at end of file diff --git a/tests/user_spec.py b/tests/user_spec.py new file mode 100755 index 00000000..5df58798 --- /dev/null +++ b/tests/user_spec.py @@ -0,0 +1,116 @@ +#!/usr/bin/python +"""Print a test spec on stdout. + +Each line has parmaeters for a test case. The regtest.sh shell script reads +these lines and runs parallel processes. + +We use Python data structures so the test cases are easier to read and edit. +""" + +import sys + +# +# TEST CONFIGURATION +# + +# For gen_sim_input.py +INPUT_PARAMS = { + # distribution, num unique values, num clients, values per client + 'exp-100k': ('exp', 100, 100000, 1), + 'exp-1m': ('exp', 100, 1000000, 1), +} + +# For rappor_sim.py +# 'k, h, m, p, q, f' as in params file. +RAPPOR_PARAMS = { + # Initial chrome params from 2014. + # NOTE: fastrand simulation only supports 64 bits! Make sure to use the + # 'fast_counts' code path. + 'chrome128': (128, 2, 128, 0.25, 0.75, 0.50), + + # Chrome params from early 2015 -- changed to 8 bit reports. + 'chrome8': (8, 2, 128, 0.25, 0.75, 0.50), + + # Original demo params + 'demo': (16, 2, 64, 0.5, 0.75, 0.5), +} + +# For deriving candidates from true inputs. +MAP_PARAMS = { + # 1. Number of extra candidates to add. + # 2. Candidate strings to remove from the map. This FORCES false + # negatives, e.g. for common strings, since a string has to be in the map + # for RAPPOR to choose it. + 'add-100': (100, []), + 'add-1000': (1000, []), + 'add-2000': (2000, []), + # also thrashes on 128 bits + 'add-3000': (3000, []), + 'add-10000': (10000, []), + 'add-15000': (15000, []), # approx number of candidates for eTLD+1 + 'add-100000': (100000, []), + 'remove-top-2': (20, ['v1', 'v2']), +} + +# test case name -> (input params name, RAPPOR params name, map params name) +TEST_CASES = [ + ('chrome128-100k-100', 'exp-100k', 'chrome128', 'add-100'), + ('chrome128-100k-1000', 'exp-100k', 'chrome128', 'add-1000'), + ('chrome128-100k-2000', 'exp-100k', 'chrome128', 'add-2000'), + ('chrome128-100k-3000', 'exp-100k', 'chrome128', 'add-3000'), + # 128 bits and 15k candidates fails on a machine with 8 GB memory. + # Lasso finishes with 7508 non-zero coefficients, and then allocation + # fails. TODO: just take the highest ones? + #('chrome128-100k-15000', 'exp-100k', 'chrome128', 'add-15000'), + #('chrome128-100k-100000', 'exp-100k', 'chrome128', 'add-100000'), + + # NOTE: Adding more candidates exercises LASSO + ('chrome8-100k-100', 'exp-100k', 'chrome8', 'add-100'), + ('chrome8-100k-1000', 'exp-100k', 'chrome8', 'add-1000'), + ('chrome8-100k-2000', 'exp-100k', 'chrome8', 'add-2000'), + ('chrome8-100k-3000', 'exp-100k', 'chrome8', 'add-3000'), + ('chrome8-100k-15000', 'exp-100k', 'chrome8', 'add-15000'), + + # NOTE: This one takes too much memory! More than 4 GB. This is because + # Lasso gets a huge matrix (100,000). We got 1564 non-zero coefficients. + ('chrome8-100k-100000', 'exp-100k', 'chrome8', 'add-100000'), + + # What happens when the the candidates are missing top values? + ('chrome8-badcand', 'exp-100k', 'chrome8', 'remove-top-2'), + + # TODO: Use chrome params with real map from Alexa 1M ? +] + +# +# END TEST CONFIGURATION +# + + +def main(argv): + rows = [] + for test_case, input_name, rappor_name, map_name in TEST_CASES: + input_params = INPUT_PARAMS[input_name] + rappor_params = RAPPOR_PARAMS[rappor_name] + map_params = MAP_PARAMS[map_name] + row = tuple([test_case]) + input_params + rappor_params + map_params + rows.append(row) + + for row in rows: + for cell in row: + if isinstance(cell, list): + if cell: + cell_str = '|'.join(cell) + else: + cell_str = 'NONE' # we don't want an empty string + else: + cell_str = cell + print cell_str, # print it with a space after it + print # new line after row + + +if __name__ == '__main__': + try: + main(sys.argv) + except RuntimeError, e: + print >>sys.stderr, 'FATAL: %s' % e + sys.exit(1) From 3547cf31300b7d94b8e9fdaf245e8e5ce835de26 Mon Sep 17 00:00:00 2001 From: Ananth Raghunathan Date: Tue, 7 Jul 2015 16:47:55 -0700 Subject: [PATCH 37/67] Better summary in assoctest for experiments with 2-way marginals. --- analysis/R/decode.R | 14 ++++++++++++-- assoctest.sh | 5 +++-- tests/analyze_assoc_expt.R | 14 +++++++++++--- tests/make_summary_assoc.py | 28 +++++++++++++++++++++++++++- tests/regtest_spec.py | 24 ++++++++++++------------ 5 files changed, 65 insertions(+), 20 deletions(-) diff --git a/analysis/R/decode.R b/analysis/R/decode.R index 2d8af344..f6a94226 100644 --- a/analysis/R/decode.R +++ b/analysis/R/decode.R @@ -268,6 +268,10 @@ ComputePrivacyGuarantees <- function(params, alpha, N) { privacy } +FitDistribution2 <- function(estimates_stds, map) { + FitDistribution(estimates_stds, map) +} + FitDistribution <- function(estimates_stds, map, quiet = FALSE) { # Find a distribution over rows of map that approximates estimates_stds best # @@ -301,7 +305,7 @@ Resample <- function(e) { list(estimates = estimates, stds = stds) } -Decode2Way <- function(counts, map, params) { +Decode2Way <- function(counts, map, params, new_decode = FALSE) { k <- params$k p <- params$p q <- params$q @@ -322,7 +326,11 @@ Decode2Way <- function(counts, map, params) { es <- Estimate2WayBloomCounts(params, counts) e <- list(estimates = es$estimates[filter_cohorts, , drop = FALSE], stds = es$stds[filter_cohorts, , drop = FALSE]) - coefs <- FitDistribution(e, map[filter_bits, , drop = FALSE]) + if (new_decode == TRUE) { + coefs <- FitDistribution2(e, map[filter_bits, , drop = FALSE]) + } else { + coefs <- FitDistribution(e, map[filter_bits, , drop = FALSE]) + } fit <- data.frame(String = colnames(map[filter_bits, , drop = FALSE]), Estimate = matrix(coefs, ncol = 1), SD = matrix(coefs, ncol = 1), @@ -371,6 +379,8 @@ Decode <- function(counts, map, params, quick = FALSE, alpha = 0.05, FitDistribution(e, map[filter_bits, , drop = FALSE], quiet)) } + + FitDistribution(e, map[filter_bits, , drop = FALSE], quiet) coefs_ssd <- N * apply(coefs_all, 2, sd) # compute sample standard deviations coefs_ave <- N * apply(coefs_all, 2, mean) diff --git a/assoctest.sh b/assoctest.sh index 7a4ef8a3..cd432558 100755 --- a/assoctest.sh +++ b/assoctest.sh @@ -180,7 +180,7 @@ _run-one-instance() { inp['counts'] = ['$instance_dir/case_2way.csv',\ '$instance_dir/case_marg1.csv',\ '$instance_dir/case_marg2.csv']; \ - inp['expt'] = ['external-counts', 'external-reports-em']; \ + inp['expt'] = ['external-counts', 'external-counts-new']; \ json.dump(inp, f); \ f.close();" @@ -206,8 +206,9 @@ _run-one-instance-logged() { make-summary() { local dir=$1 local filename=${2:-results.html} + local instances=${3:-1} - tests/make_summary_assoc.py $dir > $dir/rows.html + tests/make_summary_assoc.py $dir $instances > $dir/rows.html pushd $dir >/dev/null diff --git a/tests/analyze_assoc_expt.R b/tests/analyze_assoc_expt.R index dcd48cd3..46b7b42d 100755 --- a/tests/analyze_assoc_expt.R +++ b/tests/analyze_assoc_expt.R @@ -350,7 +350,7 @@ DirectSimulationOfReports <- function(inp) { ## Outputs: # # ------------------------------------------------------------------------ -ExternalCounts <- function(inp) { +ExternalCounts <- function(inp, new_decode = FALSE) { ptm <- proc.time() params <- ReadParameterFile(inp$params) # Ensure sufficient maps as required by number of vars @@ -381,7 +381,7 @@ ExternalCounts <- function(inp) { pruned <- lapply(1:2, function(i) lapply(map[[i]]$map, function(z) z[,found_strings[[i]], drop = FALSE])) crmap <- CombineMaps(pruned[[1]], pruned[[2]])$crmap - marginal <- Decode2Way(counts[[1]], crmap, params2)$fit + marginal <- Decode2Way(counts[[1]], crmap, params2, new_decode = new_decode)$fit td <- read.csv(file = inp$truefile, header = FALSE) td <- table(td[,2:3]) td <- td / sum(td) @@ -417,7 +417,11 @@ ExternalCounts <- function(inp) { ) # Write metrics to metrics.csv - filename <- file.path(inp$outdir, 'metrics.csv') + if (new_decode == TRUE) { + filename <- file.path(inp$outdir, 'metrics_2.csv') + } else { + filename <- file.path(inp$outdir, 'metrics.csv') + } write.csv(metrics, file = filename, row.names = FALSE) } @@ -525,6 +529,10 @@ main <- function(opts) { print("---------- RUNNING EXPERIMENT EXT COUNTS ----------") ExternalCounts(inp) } + if ("external-counts-new" %in% inp$expt) { + print("---------- RUNNING EXPERIMENT EXT COUNTS ----------") + ExternalCounts(inp, new_decode = TRUE) + } if ("external-reports-em" %in% inp$expt) { print("---------- RUNNING EXPERIMENT EXT REPORTS ----------") ExternalReportsEM(inp) diff --git a/tests/make_summary_assoc.py b/tests/make_summary_assoc.py index 665ef9f9..40c4d635 100755 --- a/tests/make_summary_assoc.py +++ b/tests/make_summary_assoc.py @@ -272,6 +272,7 @@ def FormatPlots(base_dir, test_instances): def main(argv): base_dir = argv[1] + num_instances = int(argv[2]) # This file has the test case names, in the order that they should be # displayed. @@ -293,6 +294,8 @@ def main(argv): # file. Instead, rows' names are links to the corresponding .png files. include_plots = len(test_instances) < 20 include_plots = False + l1d_list = [] + l1d_list2 = [] for instance in test_instances: # A test instance is idenfied by the test name and the test run. @@ -314,10 +317,14 @@ def main(argv): cell1_html = FormatCell1(test_case, test_instance, metrics_file, log_file, plot_file, include_plots) + if(int(test_instance) == 1): + l1d_list = [] + l1d_list2 = [] + if os.path.isfile(metrics_file): # ParseMetrics outputs an HTML table row and also updates lists metrics_dict, metrics_html = ParseMetrics(metrics_file, log_file) - + l1d_list += metrics_dict['l1d'] # Update the metrics structure. Initialize dictionaries if necessary. for m in metrics: if not test_case in metrics[m]: @@ -332,9 +339,28 @@ def main(argv): if (os.path.isfile(metrics_file)): metrics_dict, metrics_html = ParseMetrics(metrics_file, log_file, italics = True) + l1d_list2 += metrics_dict['l1d'] print '{}{}'.format(ParseSpecFile(spec_file, empty = True), metrics_html) + # Print summary of test instances + if(int(test_instance) == num_instances): + row_str = ['', '', '', '', + '%.3f±%.3f' % (Mean(l1d_list), StandardErrorEstimate(l1d_list)), + '', + ] + row_str2 = ['', '', '', '', + '%.3f±%.3f' % (Mean(l1d_list2), StandardErrorEstimate(l1d_list2)), + '', + ] + print '{}{}'.format(ParseSpecFile(spec_file, empty = + True), ' '.join('' % cell for cell in + row_str)) + if (os.path.isfile(metrics_file)): + print '{}{}'.format(ParseSpecFile(spec_file, empty = + True), ' '.join('' % cell for cell in + row_str2)) + print FormatSummaryRow(metrics) print '' diff --git a/tests/regtest_spec.py b/tests/regtest_spec.py index db8a8566..53d1053a 100755 --- a/tests/regtest_spec.py +++ b/tests/regtest_spec.py @@ -128,23 +128,23 @@ # sets ASSOC_TEST_CONFIG = { 'distr': ( - 'fizz-tiny', - 'fizz-tiny-bool', - 'fizz-small', - 'fizz-small-bool',), -# 'fizz', -# 'fizz-bool',), +# 'fizz-tiny', +# 'fizz-tiny-bool', +# 'fizz-small', +# 'fizz-small-bool',), + 'fizz', + 'fizz-bool', # 'toy',), -# 'compact-noextra-small', -# 'loose-noextra-small', -# 'compact-noextra-large', -# 'loose-noextra-large', + 'compact-noextra-small', + 'loose-noextra-small',), # 'compact-extra-small', # 'loose-extra-small', -# 'compact-extra-large', -# 'loose-extra-large', # 'compact-excess-small', # 'loose-excess-small', +# 'compact-noextra-large', +# 'loose-noextra-large', +# 'compact-extra-large', +# 'loose-extra-large', # 'compact-excess-large', # 'loose-excess-large'), 'blooms': ( From f9390ab1660dc61fbc89b615746d8f404edbc3ad Mon Sep 17 00:00:00 2001 From: Ananth Raghunathan Date: Wed, 8 Jul 2015 10:21:59 -0700 Subject: [PATCH 38/67] Tests run sequentially. Trying random projection. --- analysis/R/decode.R | 32 +++++++++++++++++++++++++++++++- assoctest.sh | 4 ++-- tests/regtest_spec.py | 17 ++++++++--------- 3 files changed, 41 insertions(+), 12 deletions(-) diff --git a/analysis/R/decode.R b/analysis/R/decode.R index f6a94226..566ede11 100644 --- a/analysis/R/decode.R +++ b/analysis/R/decode.R @@ -268,8 +268,38 @@ ComputePrivacyGuarantees <- function(params, alpha, N) { privacy } +# Implements lsei +# FitDistribution2 <- function(estimates_stds, map) { +# X <- map +# Y <- as.vector(t(estimates_stds$estimates)) +# m <- dim(X)[1] +# n <- dim(X)[2] +# +# G <- rbind2(Diagonal(n), rep(-1, n)) +# H <- c(rep(0, n), -1) +# lsei(A = X, B = Y, G = G, H = H, type = 2)$X +# } + FitDistribution2 <- function(estimates_stds, map) { - FitDistribution(estimates_stds, map) + X <- map + Y <- as.vector(t(estimates_stds$estimates)) + m <- dim(X)[1] + n <- dim(X)[2] + + # Random projection params + size <- 10 * n + density <- 0.05 + rproj <- matrix(0, size, m) + rproj[sample(length(rproj), size = density * length(rproj))] <- rnorm(density * length(rproj)) + # rproj <- matrix(rnorm(10*n*m), 10*n, m) + Xproj <- rproj %*% X + Yproj <- as.vector(rproj %*% Y) + mproj <- dim(Xproj)[1] + nproj <- dim(Xproj)[2] + + G <- rbind2(Diagonal(nproj), rep(-1, nproj)) + H <- c(rep(0, nproj), -1) + lsei(A = Xproj, B = Yproj, G = G, H = H, type = 2)$X } FitDistribution <- function(estimates_stds, map, quiet = FALSE) { diff --git a/assoctest.sh b/assoctest.sh index cd432558..d5e0ec9f 100755 --- a/assoctest.sh +++ b/assoctest.sh @@ -301,7 +301,7 @@ _run-tests() { log "Done running all test instances" - make-summary $ASSOCTEST_DIR + make-summary $ASSOCTEST_DIR "results.html" $instances } # Run tests sequentially @@ -329,7 +329,7 @@ run-all() { log "Running all tests. Can take a while." # a- for assoc tests # F for sequential - _run-tests '^a-' $instances T T + _run-tests '^a-' $instances F T } "$@" diff --git a/tests/regtest_spec.py b/tests/regtest_spec.py index 53d1053a..d1ee7ebb 100755 --- a/tests/regtest_spec.py +++ b/tests/regtest_spec.py @@ -131,16 +131,16 @@ # 'fizz-tiny', # 'fizz-tiny-bool', # 'fizz-small', -# 'fizz-small-bool',), - 'fizz', - 'fizz-bool', +# 'fizz-small-bool', +# 'fizz', +# 'fizz-bool',), # 'toy',), 'compact-noextra-small', - 'loose-noextra-small',), -# 'compact-extra-small', -# 'loose-extra-small', -# 'compact-excess-small', -# 'loose-excess-small', + 'loose-noextra-small', + 'compact-extra-small', + 'loose-extra-small', + 'compact-excess-small', + 'loose-excess-small',), # 'compact-noextra-large', # 'loose-noextra-large', # 'compact-extra-large', @@ -159,7 +159,6 @@ # END TEST CONFIGURATION # - def main(argv): rows = [] From e5435bb8bcd11b77e67141cf5b6e2240f95d166b Mon Sep 17 00:00:00 2001 From: Ananth Raghunathan Date: Thu, 9 Jul 2015 16:02:46 -0700 Subject: [PATCH 39/67] Marginals constraints for LSEI. --- analysis/R/decode.R | 65 +++++++++++++++++++++++++++---------- assoctest.sh | 6 ++-- tests/analyze_assoc_expt.R | 10 ++++-- tests/make_summary_assoc.py | 8 ++--- 4 files changed, 61 insertions(+), 28 deletions(-) diff --git a/analysis/R/decode.R b/analysis/R/decode.R index 566ede11..f839ee01 100644 --- a/analysis/R/decode.R +++ b/analysis/R/decode.R @@ -269,7 +269,7 @@ ComputePrivacyGuarantees <- function(params, alpha, N) { } # Implements lsei -# FitDistribution2 <- function(estimates_stds, map) { +# FitDistribution <- function(estimates_stds, map, quiet = FALSE) { # X <- map # Y <- as.vector(t(estimates_stds$estimates)) # m <- dim(X)[1] @@ -280,26 +280,55 @@ ComputePrivacyGuarantees <- function(params, alpha, N) { # lsei(A = X, B = Y, G = G, H = H, type = 2)$X # } -FitDistribution2 <- function(estimates_stds, map) { - X <- map +FitDistribution2 <- function(estimates_stds, map, fit) { + + X <- as.matrix(map) Y <- as.vector(t(estimates_stds$estimates)) m <- dim(X)[1] n <- dim(X)[2] + wt <- 1000 # weight to marginal constraints - # Random projection params - size <- 10 * n - density <- 0.05 - rproj <- matrix(0, size, m) - rproj[sample(length(rproj), size = density * length(rproj))] <- rnorm(density * length(rproj)) - # rproj <- matrix(rnorm(10*n*m), 10*n, m) - Xproj <- rproj %*% X - Yproj <- as.vector(rproj %*% Y) - mproj <- dim(Xproj)[1] - nproj <- dim(Xproj)[2] + G <- rbind2(Diagonal(n), rep(-1, n)) + H <- c(rep(0, n), -1) + + # Adding marginals constraints to X and Y + fstrs <- lapply(fit, function(x) x[,"string"]) # found strings + + Y <- c(Y, wt * t(fit[[1]]["proportion"]), wt * t(fit[[2]]["proportion"])) + + for (strs in fstrs[[1]]) { + indices <- which(colnames(map) %in% outer(strs, + fstrs[[2]], + function(x, y) paste(x, y, sep = "x"))) + vec <- rep(0, n) + vec[indices] <- wt + X <- rbind2(X, vec) + } + for (strs in fstrs[[2]]) { + indices <- which(colnames(map) %in% outer(fstrs[[1]], + strs, + function(x, y) paste(x, y, sep = "x"))) + vec <- rep(0, n) + vec[indices] <- wt + X <- rbind2(X, vec) + } + + lsei(A = X, B = Y, G = G, H = H, type = 2)$X - G <- rbind2(Diagonal(nproj), rep(-1, nproj)) - H <- c(rep(0, nproj), -1) - lsei(A = Xproj, B = Yproj, G = G, H = H, type = 2)$X + # Random projection params +# size <- 10 * n +# density <- 0.05 +# rproj <- matrix(0, size, m) +# rproj[sample(length(rproj), size = density * length(rproj))] <- rnorm(density * length(rproj)) +# # rproj <- matrix(rnorm(10*n*m), 10*n, m) +# Xproj <- rproj %*% X +# Yproj <- as.vector(rproj %*% Y) +# mproj <- dim(Xproj)[1] +# nproj <- dim(Xproj)[2] +# +# G <- rbind2(Diagonal(nproj), rep(-1, nproj)) +# H <- c(rep(0, nproj), -1) +# lsei(A = Xproj, B = Yproj, G = G, H = H, type = 2)$X } FitDistribution <- function(estimates_stds, map, quiet = FALSE) { @@ -335,7 +364,7 @@ Resample <- function(e) { list(estimates = estimates, stds = stds) } -Decode2Way <- function(counts, map, params, new_decode = FALSE) { +Decode2Way <- function(counts, map, params, new_decode = FALSE, fit = NULL) { k <- params$k p <- params$p q <- params$q @@ -357,7 +386,7 @@ Decode2Way <- function(counts, map, params, new_decode = FALSE) { e <- list(estimates = es$estimates[filter_cohorts, , drop = FALSE], stds = es$stds[filter_cohorts, , drop = FALSE]) if (new_decode == TRUE) { - coefs <- FitDistribution2(e, map[filter_bits, , drop = FALSE]) + coefs <- FitDistribution2(e, map[filter_bits, , drop = FALSE], fit) } else { coefs <- FitDistribution(e, map[filter_bits, , drop = FALSE]) } diff --git a/assoctest.sh b/assoctest.sh index d5e0ec9f..492d949a 100755 --- a/assoctest.sh +++ b/assoctest.sh @@ -275,8 +275,8 @@ _run-tests() { else func=_run-one-instance-logged processors=$(grep -c ^processor /proc/cpuinfo || echo 4) # POSIX-specific - if test $processors -gt 3; then # leave few CPUs for the OS - processors=$(expr $processors - 3) + if test $processors -gt 6; then # leave few CPUs for the OS + processors=5 else processors=1 fi @@ -329,7 +329,7 @@ run-all() { log "Running all tests. Can take a while." # a- for assoc tests # F for sequential - _run-tests '^a-' $instances F T + _run-tests '^a-' $instances T T } "$@" diff --git a/tests/analyze_assoc_expt.R b/tests/analyze_assoc_expt.R index 46b7b42d..8c2a5ee7 100755 --- a/tests/analyze_assoc_expt.R +++ b/tests/analyze_assoc_expt.R @@ -366,10 +366,13 @@ ExternalCounts <- function(inp, new_decode = FALSE) { params2$k <- (params$k ** 2) * 4 # Prune candidates - found_strings <- lapply(1:2, function(i) + fit <- lapply(1:2, function(i) Decode(counts[[i + 1]], map[[i]]$rmap, - params, quick = FALSE)$fit[,"string"]) + params, quick = FALSE)$fit) + + found_strings = list(fit[[1]][,"string"], fit[[2]][,"string"]) + if (length(found_strings[[1]]) == 0 || length(found_strings[[2]]) == 0) { print("FOUND_STRINGS") print(found_strings) @@ -381,7 +384,7 @@ ExternalCounts <- function(inp, new_decode = FALSE) { pruned <- lapply(1:2, function(i) lapply(map[[i]]$map, function(z) z[,found_strings[[i]], drop = FALSE])) crmap <- CombineMaps(pruned[[1]], pruned[[2]])$crmap - marginal <- Decode2Way(counts[[1]], crmap, params2, new_decode = new_decode)$fit + marginal <- Decode2Way(counts[[1]], crmap, params2, new_decode = new_decode, fit = fit)$fit td <- read.csv(file = inp$truefile, header = FALSE) td <- table(td[,2:3]) td <- td / sum(td) @@ -392,6 +395,7 @@ ExternalCounts <- function(inp, new_decode = FALSE) { } } ed[is.na(ed)] <- 0 + ed[ed<0] <- 0 time_taken <- proc.time() - ptm diff --git a/tests/make_summary_assoc.py b/tests/make_summary_assoc.py index 40c4d635..ad21ea44 100755 --- a/tests/make_summary_assoc.py +++ b/tests/make_summary_assoc.py @@ -349,14 +349,14 @@ def main(argv): '%.3f±%.3f' % (Mean(l1d_list), StandardErrorEstimate(l1d_list)), '', ] - row_str2 = ['', '', '', '', - '%.3f±%.3f' % (Mean(l1d_list2), StandardErrorEstimate(l1d_list2)), - '', - ] print '{}{}'.format(ParseSpecFile(spec_file, empty = True), ' '.join('' % cell for cell in row_str)) if (os.path.isfile(metrics_file)): + row_str2 = ['', '', '', '', + '%.3f±%.3f' % (Mean(l1d_list2), StandardErrorEstimate(l1d_list2)), + '', + ] print '{}{}'.format(ParseSpecFile(spec_file, empty = True), ' '.join('' % cell for cell in row_str2)) From edb44d677535c1edde4aaf5814cda56945526b30 Mon Sep 17 00:00:00 2001 From: Ananth Raghunathan Date: Mon, 13 Jul 2015 10:23:27 -0700 Subject: [PATCH 40/67] Cleaning up expts in association.R --- analysis/R/association.R | 100 --------------------------------------- analysis/R/decode.R | 2 +- 2 files changed, 1 insertion(+), 101 deletions(-) diff --git a/analysis/R/association.R b/analysis/R/association.R index 393b0e3a..aaf0a8a0 100644 --- a/analysis/R/association.R +++ b/analysis/R/association.R @@ -137,33 +137,6 @@ GetJointConditionalProb <- function(cond_x, cond_y) { mapply("outer", cond_x, cond_y, SIMPLIFY = FALSE) } -UpdatePij2 <- function(pij, reports, cohorts, cand_strs, - params, map) { - - accum <- array(0, dim(pij)) - # For each report - for (i in seq(length(reports[[1]]))) { - # For each var - for (var in seq(length(reports))) { - idx <- cohorts[[var]][i] - rep <- GetCondProb(reports[[var]][[i]], - candidate_strings = cand_strs[[var]], - params = params, - map[[var]]$map[[idx]], NULL) - if(var == 1) { - cond_joint_distr <- rep - } else { - cond_joint_distr <- outer(cond_joint_distr, rep) - } - } - z <- cond_joint_distr * pij - z <- z / sum(z) - z[is.nan(z)] <- 0 - accum <- accum + z - } - accum / length(reports[[1]]) -} - UpdatePij <- function(pij, cond_prob) { # Update the probability matrix based on the EM algorithm. # @@ -182,23 +155,6 @@ UpdatePij <- function(pij, cond_prob) { Reduce("+", wcp) / length(wcp) } -UpdatePij3 <- function(pij, cond_prob) { - wcp <- lapply(cond_prob, function(x) { - for (i in seq(length(x))) { - if (i == 1) { - op <- x[[i]] - } else { - op <- outer(op, x[[i]]) - } - } - z <- op * pij - z <- z / sum(z) - z[is.nan(z)] <- 0 - z - }) - Reduce("+", wcp) / length(wcp) -} - NLL <- function(pij, cond_prob) { # Update the probability matrix based on the EM algorithm. # @@ -230,62 +186,6 @@ ComputeVar <- function(cond_prob, est) { list(var_cov = var_cov, sd = sd, inform = inform) } -EM2 <- function(reports, cohorts, cand_strs, starting_pij = NULL, - params, map, - max_iter = 1e03, epsilon = 1e-06) { - - # State space is the product of lengths. - state_space <- sapply(cand_strs, "length") - pij <- array() - if(is.null(starting_pij)) { - pij <- array(1 / prod(state_space), state_space) - } else { - pij <- starting_pij - } - - if (nrow(pij) > 0) { - # Run EM - for (i in 1:max_iter) { - pij_new <- UpdatePij2(pij, reports, cohorts, cand_strs, - params, map) - diff <- max(abs(pij_new - pij)) - pij <- pij_new - if (diff < epsilon) { - break - } - } - } - list(hist = pij) -} - -EM3 <- function(cond_prob, starting_pij = NULL, estimate_var = FALSE, - max_iter = 1e03, epsilon = 1e-06, verbose = FALSE) { - pij <- list() - - # Compute dimensions of conditional distributions. - state_space <- sapply(cond_prob[[1]], length) - if (is.null(starting_pij)) { - pij <- array(1 / prod(state_space), state_space) - } else { - pij <- starting_pij - } - if (nrow(pij) > 0) { - # Run EM - for (i in 1:max_iter) { - if (i == 1) { - ptm_iter <- proc.time() - } - pij_new <- UpdatePij3(pij, cond_prob) - diff <- max(abs(pij_new - pij)) - pij <- pij_new - if (diff < epsilon) { - break - } - } - } - list(est = pij, hist = pij, sd = 0) -} - EM <- function(cond_prob, starting_pij = NULL, estimate_var = FALSE, max_iter = 1000, epsilon = 10^-6, verbose = FALSE) { # Performs estimation. diff --git a/analysis/R/decode.R b/analysis/R/decode.R index f839ee01..adaa0b47 100644 --- a/analysis/R/decode.R +++ b/analysis/R/decode.R @@ -286,7 +286,7 @@ FitDistribution2 <- function(estimates_stds, map, fit) { Y <- as.vector(t(estimates_stds$estimates)) m <- dim(X)[1] n <- dim(X)[2] - wt <- 1000 # weight to marginal constraints + wt <- 10000 # weight to marginal constraints G <- rbind2(Diagonal(n), rep(-1, n)) H <- c(rep(0, n), -1) From 09f22585e86c736945ea5817eca1a362523b124a Mon Sep 17 00:00:00 2001 From: Ananth Raghunathan Date: Mon, 13 Jul 2015 11:01:03 -0700 Subject: [PATCH 41/67] Minor clean up. --- analysis/R/association.R | 28 +++++----------------------- tests/analyze_assoc_expt.R | 8 ++------ 2 files changed, 7 insertions(+), 29 deletions(-) diff --git a/analysis/R/association.R b/analysis/R/association.R index aaf0a8a0..482cf918 100644 --- a/analysis/R/association.R +++ b/analysis/R/association.R @@ -292,8 +292,7 @@ ComputeDistributionEM <- function(reports, report_cohorts, maps, ignore_other = FALSE, params, quick = FALSE, marginals = NULL, - estimate_var = FALSE, - new_alg = FALSE) { + estimate_var = FALSE) { # Computes the distribution of num_variables variables, where # num_variables is chosen by the client, using the EM algorithm. # @@ -368,33 +367,16 @@ ComputeDistributionEM <- function(reports, report_cohorts, rep }) - if(new_alg) { - # Report conditional distributions as lists - if (j == 1) { - # Conditional distribution for reports - joint_conditional <- lapply(cond_report_dist, "list") - } else { - joint_conditional <- mapply(function (x, y) c(x, list(y)), - joint_conditional, cond_report_dist, - SIMPLIFY = FALSE) - } - } else { - # Update the joint conditional distribution of all variables - joint_conditional <- UpdateJointConditional(cond_report_dist, - joint_conditional) - } + # Update the joint conditional distribution of all variables + joint_conditional <- UpdateJointConditional(cond_report_dist, + joint_conditional) print("TIME IN COND_REPORT_DIST") print(proc.time()-ptm) } ptm <- proc.time() # Run expectation maximization to find joint distribution - if (new_alg) { - funct <- EM3 - } else { - funct <- EM - } - em <- funct(joint_conditional, epsilon = 10 ^ -5, verbose = FALSE, + em <- EM(joint_conditional, epsilon = 10 ^ -5, verbose = FALSE, estimate_var = estimate_var) print("TIME IN EM") print(proc.time() - ptm) diff --git a/tests/analyze_assoc_expt.R b/tests/analyze_assoc_expt.R index 8c2a5ee7..20504ff4 100755 --- a/tests/analyze_assoc_expt.R +++ b/tests/analyze_assoc_expt.R @@ -480,8 +480,7 @@ ExternalReportsEM <- function(inp) { ignore_other = TRUE, quick = TRUE, params, marginals = NULL, - estimate_var = FALSE, - new_alg = inp$newalg) + estimate_var = FALSE) em <- joint_dist$orig$fit td <- read.csv(file = inp$truefile, header = FALSE) td <- table(td[,2:3]) @@ -521,10 +520,7 @@ main <- function(opts) { # direct -> direct simulation of reports (without variances) # external-counts -> externally supplied counts for 2 way and marginals # external-reports -> externally supplied reports - if (!(inp$expt %in% c("direct", "external-counts", "external-reports-em"))) { - stop("Incorrect experiment in JSON file.") - } - + if("direct" %in% inp$expt) { print("---------- RUNNING EXPERIMENT DIRECT ----------") DirectSimulationOfReports(inp) From 55b18c38c8d3af5dd07d098184bea950a8ea9018 Mon Sep 17 00:00:00 2001 From: Ananth Raghunathan Date: Mon, 13 Jul 2015 13:08:19 -0700 Subject: [PATCH 42/67] Moving 2 way marginal code to its own file. --- analysis/R/association.R | 4 +- analysis/R/decode.R | 153 ------------------------------------- tests/analyze_assoc_expt.R | 5 +- 3 files changed, 5 insertions(+), 157 deletions(-) diff --git a/analysis/R/association.R b/analysis/R/association.R index 482cf918..2a19656f 100644 --- a/analysis/R/association.R +++ b/analysis/R/association.R @@ -376,12 +376,12 @@ ComputeDistributionEM <- function(reports, report_cohorts, ptm <- proc.time() # Run expectation maximization to find joint distribution - em <- EM(joint_conditional, epsilon = 10 ^ -5, verbose = FALSE, + em <- EM(joint_conditional, epsilon = 10 ^ -6, verbose = FALSE, estimate_var = estimate_var) print("TIME IN EM") print(proc.time() - ptm) dimnames(em$est) <- found_strings # Return results in a usable format - list(orig = list(fit = em$est, sd = em$sd, em = em)) + list(fit = em$est, sd = em$sd, em = em) } diff --git a/analysis/R/decode.R b/analysis/R/decode.R index adaa0b47..1c5c327f 100644 --- a/analysis/R/decode.R +++ b/analysis/R/decode.R @@ -18,62 +18,6 @@ library(glmnet) library(limSolve) -Estimate2WayBloomCounts <- function(params, obs_counts) { - p <- params$p - q <- params$q - f <- params$f - m <- params$m - k <- params$k - - stopifnot(m == nrow(obs_counts), params$k + 1 == ncol(obs_counts)) - - p11 <- q * (1 - f/2) + p * f / 2 # probability of a true 1 reported as 1 - p01 <- p * (1 - f/2) + q * f / 2 # probability of a true 0 reported as 1 - p10 <- 1 - p11 # probability of a true 1 reported as 0 - p00 <- 1 - p01 # probability of a true 0 reported as 0 - - NoiseMatrix <- matrix(rep(0, 16), 4) - NoiseMatrix[1,] <- c(p11**2, p11*p10, p10*p11, p10**2) - NoiseMatrix[2,] <- c(p11*p01, p11*p00, p10*p01, p10*p00) - NoiseMatrix[3,] <- c(p01*p11, p01*p10, p00*p11, p00*p01) - NoiseMatrix[4,] <- c(p01**2, p00*p01, p01*p00, p00**2) - - ests <- apply(obs_counts, 1, function(x) { - N <- x[1] - inds <- seq(0, (k/4)-1) - v <- x[-1] - sapply(inds, function(i){ - as.vector(t(Solve(NoiseMatrix)) %*% v[(i*4 + 1):((i+1)*4)]) - }) - }) - - if(FALSE) { - # TODO(pseudorandom): Compute variances - variances <- apply(obs_counts, 1, function(x) { - N <- x[1] - v <- x[-1] - p_hats <- (v - p01 * N) / (N * p2) # expectation of a true 1 - p_hats <- pmax(0, pmin(1, p_hats)) # clamp to [0,1] - r <- p_hats * p11 + (1 - p_hats) * p01 # expectation of a reported 1 - N * r * (1 - r) / p2^2 # variance of the binomial - }) - } - - # Transform counts from absolute values to fractional, removing bias due to - # variability of reporting between cohorts. - ests <- apply(ests, 1, function(x) x / obs_counts[,1]) - # stds <- apply(variances^.5, 1, function(x) x / obs_counts[,1]) - - # Some estimates may be set to infinity, e.g. if f=1. We want to - # account for this possibility, and set the corresponding counts - # to 0. - ests[abs(ests) == Inf] <- 0 - - list(estimates = ests, - stds = matrix(rep(100, length(ests[,1]) * length(ests[1,])), - length(ests[,1]))) -} - EstimateBloomCounts <- function(params, obs_counts) { # Estimates the number of times each bit in each cohort was set in original # Bloom filters. @@ -268,69 +212,6 @@ ComputePrivacyGuarantees <- function(params, alpha, N) { privacy } -# Implements lsei -# FitDistribution <- function(estimates_stds, map, quiet = FALSE) { -# X <- map -# Y <- as.vector(t(estimates_stds$estimates)) -# m <- dim(X)[1] -# n <- dim(X)[2] -# -# G <- rbind2(Diagonal(n), rep(-1, n)) -# H <- c(rep(0, n), -1) -# lsei(A = X, B = Y, G = G, H = H, type = 2)$X -# } - -FitDistribution2 <- function(estimates_stds, map, fit) { - - X <- as.matrix(map) - Y <- as.vector(t(estimates_stds$estimates)) - m <- dim(X)[1] - n <- dim(X)[2] - wt <- 10000 # weight to marginal constraints - - G <- rbind2(Diagonal(n), rep(-1, n)) - H <- c(rep(0, n), -1) - - # Adding marginals constraints to X and Y - fstrs <- lapply(fit, function(x) x[,"string"]) # found strings - - Y <- c(Y, wt * t(fit[[1]]["proportion"]), wt * t(fit[[2]]["proportion"])) - - for (strs in fstrs[[1]]) { - indices <- which(colnames(map) %in% outer(strs, - fstrs[[2]], - function(x, y) paste(x, y, sep = "x"))) - vec <- rep(0, n) - vec[indices] <- wt - X <- rbind2(X, vec) - } - for (strs in fstrs[[2]]) { - indices <- which(colnames(map) %in% outer(fstrs[[1]], - strs, - function(x, y) paste(x, y, sep = "x"))) - vec <- rep(0, n) - vec[indices] <- wt - X <- rbind2(X, vec) - } - - lsei(A = X, B = Y, G = G, H = H, type = 2)$X - - # Random projection params -# size <- 10 * n -# density <- 0.05 -# rproj <- matrix(0, size, m) -# rproj[sample(length(rproj), size = density * length(rproj))] <- rnorm(density * length(rproj)) -# # rproj <- matrix(rnorm(10*n*m), 10*n, m) -# Xproj <- rproj %*% X -# Yproj <- as.vector(rproj %*% Y) -# mproj <- dim(Xproj)[1] -# nproj <- dim(Xproj)[2] -# -# G <- rbind2(Diagonal(nproj), rep(-1, nproj)) -# H <- c(rep(0, nproj), -1) -# lsei(A = Xproj, B = Yproj, G = G, H = H, type = 2)$X -} - FitDistribution <- function(estimates_stds, map, quiet = FALSE) { # Find a distribution over rows of map that approximates estimates_stds best # @@ -364,40 +245,6 @@ Resample <- function(e) { list(estimates = estimates, stds = stds) } -Decode2Way <- function(counts, map, params, new_decode = FALSE, fit = NULL) { - k <- params$k - p <- params$p - q <- params$q - f <- params$f - h <- params$h - m <- params$m - - S <- ncol(map) # total number of candidates - - N <- sum(counts[, 1]) - - filter_cohorts <- which(counts[, 1] != 0) # exclude cohorts with zero reports - - # stretch cohorts to bits - filter_bits <- as.vector( - t(matrix(1:nrow(map), nrow = m, byrow = TRUE)[filter_cohorts,])) - - es <- Estimate2WayBloomCounts(params, counts) - e <- list(estimates = es$estimates[filter_cohorts, , drop = FALSE], - stds = es$stds[filter_cohorts, , drop = FALSE]) - if (new_decode == TRUE) { - coefs <- FitDistribution2(e, map[filter_bits, , drop = FALSE], fit) - } else { - coefs <- FitDistribution(e, map[filter_bits, , drop = FALSE]) - } - fit <- data.frame(String = colnames(map[filter_bits, , drop = FALSE]), - Estimate = matrix(coefs, ncol = 1), - SD = matrix(coefs, ncol = 1), - stringsAsFactors = FALSE) - rownames(fit) <- fit[,"String"] - list(fit = fit) -} - Decode <- function(counts, map, params, quick = FALSE, alpha = 0.05, correction = c("Bonferroni"), quiet = FALSE, ...) { k <- params$k diff --git a/tests/analyze_assoc_expt.R b/tests/analyze_assoc_expt.R index 20504ff4..d91dad21 100755 --- a/tests/analyze_assoc_expt.R +++ b/tests/analyze_assoc_expt.R @@ -35,6 +35,7 @@ if(!interactive()) { opts <- parse_args(OptionParser(option_list = option_list)) } +source("analysis/R/decode2way.R") source("analysis/R/encode.R") source("analysis/R/decode.R") source("analysis/R/simulation.R") @@ -384,7 +385,7 @@ ExternalCounts <- function(inp, new_decode = FALSE) { pruned <- lapply(1:2, function(i) lapply(map[[i]]$map, function(z) z[,found_strings[[i]], drop = FALSE])) crmap <- CombineMaps(pruned[[1]], pruned[[2]])$crmap - marginal <- Decode2Way(counts[[1]], crmap, params2, new_decode = new_decode, fit = fit)$fit + marginal <- Decode2Way(counts[[1]], crmap, params2, fit = fit)$fit td <- read.csv(file = inp$truefile, header = FALSE) td <- table(td[,2:3]) td <- td / sum(td) @@ -481,7 +482,7 @@ ExternalReportsEM <- function(inp) { quick = TRUE, params, marginals = NULL, estimate_var = FALSE) - em <- joint_dist$orig$fit + em <- joint_dist$fit td <- read.csv(file = inp$truefile, header = FALSE) td <- table(td[,2:3]) td <- td / sum(td) From bcbacbe37daf6205897474783d3e246a1af5b6cd Mon Sep 17 00:00:00 2001 From: Ananth Raghunathan Date: Mon, 13 Jul 2015 14:14:34 -0700 Subject: [PATCH 43/67] Small changes to sum_bits_assoc. --- analysis/tools/sum_bits_assoc.py | 27 ++++++++++++++------------- tests/regtest_spec.py | 14 +++++++------- 2 files changed, 21 insertions(+), 20 deletions(-) diff --git a/analysis/tools/sum_bits_assoc.py b/analysis/tools/sum_bits_assoc.py index acf5ea2c..b339473d 100755 --- a/analysis/tools/sum_bits_assoc.py +++ b/analysis/tools/sum_bits_assoc.py @@ -54,22 +54,23 @@ def SumBits(params, stdin, f_2way, f_1, f_2): # TODO: Extend checking for both reports if not len(irr_1) == params.num_bloombits: raise RuntimeError( - "Expected %d bits, got %r" % (params.num_bloombits, len(irr_1))) + "Expected %d bits in report 1, got %r" % + (params.num_bloombits, len(irr_1))) + if not len(irr_2) == params.num_bloombits: + raise RuntimeError( + "Expected %d bits in report 2, got %r" % + (params.num_bloombits, len(irr_2))) # "Unrolled" joint encoding of both reports + index_array = [[3, 1], [2, 0]] for i, c in enumerate(irr_1): for j, d in enumerate(irr_2): index = 4 * ((num_bloombits - i - 1) * params.num_bloombits + num_bloombits - j - 1) - if (c == '1' and d == '1'): - sums[cohort][index] += 1 - elif (c == '0' and d == '1'): - sums[cohort][index + 1] += 1 - elif (c == '1' and d == '0'): - sums[cohort][index + 2] += 1 - elif (c == '0' and d == '0'): - sums[cohort][index + 3] += 1 - else: - raise RuntimeError('Invalid IRRs -- digits should be 0 or 1') + try: + diff = index_array[int(c)][int(d)] + except IndexError: + raise RuntimeError('Invalid IRRs; digits should be 0/1') + sums[cohort][index + diff] += 1 for i, c in enumerate(irr_1): bit_num = num_bloombits - i - 1 # e.g. char 0 = bit 15, char 15 = bit 0 @@ -77,7 +78,7 @@ def SumBits(params, stdin, f_2way, f_1, f_2): sums_1[cohort][bit_num] += 1 else: if c != '0': - raise RuntimeError('Invalid IRR -- digits should be 0 or 1') + raise RuntimeError('Invalid IRRs; digits should be 0/1') for i, c in enumerate(irr_2): bit_num = num_bloombits - i - 1 # e.g. char 0 = bit 15, char 15 = bit 0 @@ -85,7 +86,7 @@ def SumBits(params, stdin, f_2way, f_1, f_2): sums_2[cohort][bit_num] += 1 else: if c != '0': - raise RuntimeError('Invalid IRR -- digits should be 0 or 1') + raise RuntimeError('Invalid IRRs; digits should be 0/1') for cohort in xrange(num_cohorts): # First column is the total number of reports in the cohort. diff --git a/tests/regtest_spec.py b/tests/regtest_spec.py index d1ee7ebb..6e0a602e 100755 --- a/tests/regtest_spec.py +++ b/tests/regtest_spec.py @@ -134,13 +134,13 @@ # 'fizz-small-bool', # 'fizz', # 'fizz-bool',), -# 'toy',), - 'compact-noextra-small', - 'loose-noextra-small', - 'compact-extra-small', - 'loose-extra-small', - 'compact-excess-small', - 'loose-excess-small',), + 'toy',), +# 'compact-noextra-small', +# 'loose-noextra-small', +# 'compact-extra-small', +# 'loose-extra-small', +# 'compact-excess-small', +# 'loose-excess-small',), # 'compact-noextra-large', # 'loose-noextra-large', # 'compact-extra-large', From a155be8aea4b1dd9ef282392a3fd57acec149b28 Mon Sep 17 00:00:00 2001 From: Ananth Raghunathan Date: Mon, 13 Jul 2015 15:56:04 -0700 Subject: [PATCH 44/67] Merging from master branch. --- analysis/R/decode.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/analysis/R/decode.R b/analysis/R/decode.R index c8f32fd1..86f09303 100644 --- a/analysis/R/decode.R +++ b/analysis/R/decode.R @@ -102,6 +102,8 @@ FitLasso <- function(X, Y, intercept = TRUE) { # is to avoid overfitting. cap <- min(500, nrow(X) * .8, ncol(X)) + # TODO: take care of corner case when ncol(X) == 1 + # currently glmnet() fails mod <- glmnet(X, Y, standardize = FALSE, intercept = intercept, lower.limits = 0, # outputs are non-negative pmax = cap) @@ -244,7 +246,7 @@ Resample <- function(e) { list(estimates = estimates, stds = stds) } -Decode <- function(counts, map, params, alpha = 0.05, +Decode <- function(counts, map, params, alpha = 0.05, quick = FALSE, correction = c("Bonferroni"), quiet = FALSE, ...) { k <- params$k p <- params$p From 00b827b2b0d21ec9a8271d594bd2e262d311fa69 Mon Sep 17 00:00:00 2001 From: Ananth Raghunathan Date: Tue, 14 Jul 2015 14:05:59 -0700 Subject: [PATCH 45/67] Some code refactoring. - moved 2-way association code to its own file - moved specs related to assoctest to its own file - other minor changes --- analysis/R/decode2way.R | 196 +++++++++++++++++++++++++++++++ analysis/tools/sum_bits_assoc.py | 10 +- assoctest.sh | 28 ++--- tests/assoctest_spec.py | 137 +++++++++++++++++++++ tests/regtest_spec.py | 90 -------------- 5 files changed, 350 insertions(+), 111 deletions(-) create mode 100644 analysis/R/decode2way.R create mode 100755 tests/assoctest_spec.py diff --git a/analysis/R/decode2way.R b/analysis/R/decode2way.R new file mode 100644 index 00000000..63bb8f69 --- /dev/null +++ b/analysis/R/decode2way.R @@ -0,0 +1,196 @@ +# Copyright 2015 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# +# This library implements RAPPOR decoding algorithms for 2 way association. +# + +library(limSolve) +source("analysis/R/decode.R") + +EstimateBloomCounts2Way <- function(params, obs_counts) { + # Estimates original bloom filter counts of each pair of bits + # in the original bloom filters of each report + # + # Input: + # params: a list of RAPPOR parameters: + # k - size of a Bloom filter + # h - number of hash functions + # m - number of cohorts + # p - P(IRR = 1 | PRR = 0) + # q - P(IRR = 1 | PRR = 1) + # f - Proportion of bits in the Bloom filter that are set randomly + # to 0 or 1 regardless of the underlying true bit value + # obs_counts: a matrix of size m by (4k**2 + 1). Column one contains sample + # sizes for each cohort. Other counts indicated how many times + # pairs of bits {11, 10, 01, 00} were set across the two + # reports (in a "1st report"-major order) + # + # Output: + # ests: a matrix of size m by 4k**2 with estimated counts + # stds: currently, just a filler value of 100 + + p <- params$p + q <- params$q + f <- params$f + m <- params$m + k <- params$k + + stopifnot(m == nrow(obs_counts), params$k + 1 == ncol(obs_counts)) + + p11 <- q * (1 - f/2) + p * f / 2 # probability of a true 1 reported as 1 + p01 <- p * (1 - f/2) + q * f / 2 # probability of a true 0 reported as 1 + p10 <- 1 - p11 # probability of a true 1 reported as 0 + p00 <- 1 - p01 # probability of a true 0 reported as 0 + + # The NoiseMatrix describes the probability that input pairs of bits + # are mapped to outputs {11, 10, 01, 00} due to noise added by RAPPOR + NoiseMatrix <- matrix(rep(0, 16), 4) + NoiseMatrix[1,] <- c(p11**2, p11*p10, p10*p11, p10**2) + NoiseMatrix[2,] <- c(p11*p01, p11*p00, p10*p01, p10*p00) + NoiseMatrix[3,] <- c(p01*p11, p01*p10, p00*p11, p00*p01) + NoiseMatrix[4,] <- c(p01**2, p00*p01, p01*p00, p00**2) + + # Apply the inverse of NoiseMatrix to get an unbiased estimator for + # the number of times input pairs of bits were seen. + # Apply the matrix to 4 values at a time from obs_counts + ests <- apply(obs_counts, 1, function(x) { + N <- x[1] + inds <- seq(0, (k/4)-1) + v <- x[-1] + sapply(inds, function(i){ + as.vector(t(solve(NoiseMatrix)) %*% v[(i*4 + 1):((i+1)*4)]) + }) + }) + + # Transform counts from absolute values to fractional, removing bias due to + # variability of reporting between cohorts. + ests <- apply(ests, 1, function(x) x / obs_counts[,1]) + # TODO: compute stddev in distribution induced by estimation + # stds <- apply(variances^.5, 1, function(x) x / obs_counts[,1]) + + # Some estimates may be set to infinity, e.g. if f=1. We want to + # account for this possibility, and set the corresponding counts + # to 0. + ests[abs(ests) == Inf] <- 0 + + list(estimates = ests, + stds = matrix(rep(100, length(ests[,1]) * length(ests[1,])), + length(ests[,1]))) +} + +# Implements lsei +FitDistribution2Way <- function(estimates_stds, map, + fit = NULL, + quiet = FALSE) { + X <- map + Y <- as.vector(t(estimates_stds$estimates)) + m <- dim(X)[1] + n <- dim(X)[2] + + G <- rbind2(Diagonal(n), rep(-1, n)) + H <- c(rep(0, n), -1) + lsei(A = X, B = Y, G = G, H = H, type = 2)$X +} + +# FitDistribution2Way <- function(estimates_stds, map, fit) { +# # Find a distribution over rows of map that approximates estimates_stds best +# # +# # Input: +# # estimates_stds: a list of two m x k matrices, one for estimates, another +# # for standard errors +# # map : an (m * k) x S boolean matrix +# # +# # Output: +# # a float vector of length S, so that a distribution over map's rows sampled +# # according to this vector approximates estimates +# +# X <- as.matrix(map) +# Y <- as.vector(t(estimates_stds$estimates)) +# m <- dim(X)[1] +# n <- dim(X)[2] +# wt <- 10000 # weight to marginal constraints +# +# G <- rbind2(Diagonal(n), rep(-1, n)) +# H <- c(rep(0, n), -1) +# +# # Adding marginals constraints to X and Y +# fstrs <- lapply(fit, function(x) x[,"string"]) # found strings +# +# Y <- c(Y, wt * t(fit[[1]]["proportion"]), wt * t(fit[[2]]["proportion"])) +# +# for (strs in fstrs[[1]]) { +# indices <- which(colnames(map) %in% outer(strs, +# fstrs[[2]], +# function(x, y) paste(x, y, sep = "x"))) +# vec <- rep(0, n) +# vec[indices] <- wt +# X <- rbind2(X, vec) +# } +# for (strs in fstrs[[2]]) { +# indices <- which(colnames(map) %in% outer(fstrs[[1]], +# strs, +# function(x, y) paste(x, y, sep = "x"))) +# vec <- rep(0, n) +# vec[indices] <- wt +# X <- rbind2(X, vec) +# } +# +# lsei(A = X, B = Y, G = G, H = H, type = 2)$X + + # Random projection params + # size <- 10 * n + # density <- 0.05 + # rproj <- matrix(0, size, m) + # rproj[sample(length(rproj), size = density * length(rproj))] <- rnorm(density * length(rproj)) + # # rproj <- matrix(rnorm(10*n*m), 10*n, m) + # Xproj <- rproj %*% X + # Yproj <- as.vector(rproj %*% Y) + # mproj <- dim(Xproj)[1] + # nproj <- dim(Xproj)[2] + # + # G <- rbind2(Diagonal(nproj), rep(-1, nproj)) + # H <- c(rep(0, nproj), -1) + # lsei(A = Xproj, B = Yproj, G = G, H = H, type = 2)$X +# } + +Decode2Way <- function(counts, map, params, fit = NULL) { + k <- params$k + p <- params$p + q <- params$q + f <- params$f + h <- params$h + m <- params$m + + S <- ncol(map) # total number of candidates + + N <- sum(counts[, 1]) + + filter_cohorts <- which(counts[, 1] != 0) # exclude cohorts with zero reports + + # stretch cohorts to bits + filter_bits <- as.vector( + t(matrix(1:nrow(map), nrow = m, byrow = TRUE)[filter_cohorts,])) + + es <- EstimateBloomCounts2Way(params, counts) + e <- list(estimates = es$estimates[filter_cohorts, , drop = FALSE], + stds = es$stds[filter_cohorts, , drop = FALSE]) + coefs <- FitDistribution2Way(e, map[filter_bits, , drop = FALSE], fit) + fit <- data.frame(String = colnames(map[filter_bits, , drop = FALSE]), + Estimate = matrix(coefs, ncol = 1), + SD = matrix(coefs, ncol = 1), + stringsAsFactors = FALSE) + rownames(fit) <- fit[,"String"] + list(fit = fit) +} diff --git a/analysis/tools/sum_bits_assoc.py b/analysis/tools/sum_bits_assoc.py index b339473d..9bdd7f95 100755 --- a/analysis/tools/sum_bits_assoc.py +++ b/analysis/tools/sum_bits_assoc.py @@ -1,6 +1,6 @@ #!/usr/bin/python # -# Copyright 2014 Google Inc. All rights reserved. +# Copyright 2015 Google Inc. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,8 +15,12 @@ # limitations under the License. """ -Read the RAPPOR'd values on stdin, and sum the bits to produce a Counting Bloom -filter by cohort. This can then be analyzed by R. +Read RAPPOR values of 2 variables from stdin. +Read parameters from parameter file and a prefix. + +Output counts of bloom filter bits set for each variable (1-way totals) +and counts of pairwise bits set (2-way totals) into files with suffixes +_marg1.csv, _marg2.csv, _2way.csv respectively. """ import csv diff --git a/assoctest.sh b/assoctest.sh index 492d949a..96bf597d 100755 --- a/assoctest.sh +++ b/assoctest.sh @@ -11,9 +11,7 @@ # run [[ []] - run tests matching in # parallel, each times. # -# ## run-seq currently not supported! # run-seq [ []] - ditto, except that tests are run sequentially -# ## -- # # run-all [] - run all tests, in parallel, each times # @@ -27,6 +25,8 @@ # use $ in the pattern, since it matches the whole spec line and not just the # test case name.) The number of processors used in a parallel run is one less # than the number of CPUs on the machine. +# +# fast_counts param inherited from regtest.sh, but currently not used set -o nounset @@ -276,6 +276,8 @@ _run-tests() { func=_run-one-instance-logged processors=$(grep -c ^processor /proc/cpuinfo || echo 4) # POSIX-specific if test $processors -gt 6; then # leave few CPUs for the OS + # Association tests take up a lot of memory; so restricted to a few + # processes at a time processors=5 else processors=1 @@ -284,7 +286,7 @@ _run-tests() { fi local cases_list=$ASSOCTEST_DIR/test-cases.txt - tests/regtest_spec.py | grep -E $spec_regex > $cases_list + tests/assoctest_spec.py | grep -E $spec_regex > $cases_list # Generate parameters for all test cases. cat $cases_list \ @@ -305,22 +307,12 @@ _run-tests() { } # Run tests sequentially -#run-seq() { -# local spec_regex=${1:-'^r-'} # grep -E format on the spec -# local instances=${2:-1} -# local fast_counts=${3:-T} -# -# _run-tests $spec_regex $instances F $fast_counts -#} +run-seq() { + local spec_regex=${1:-'^a-'} # grep -E format on the spec + local instances=${2:-1} -# Run tests in parallel -#run() { -# local spec_regex=${1:-'^r-'} # grep -E format on the spec -# local instances=${2:-1} -# local fast_counts=${3:-T} -# -# _run-tests $spec_regex $instances T $fast_counts -#} + _run-tests $spec_regex $instances F T +} # Run tests in parallel run-all() { diff --git a/tests/assoctest_spec.py b/tests/assoctest_spec.py new file mode 100755 index 00000000..3d1642dd --- /dev/null +++ b/tests/assoctest_spec.py @@ -0,0 +1,137 @@ +#!/usr/bin/python +"""Print a test spec on stdout. + +Each line has parameters for a test case. The assoctest.sh shell script reads +these lines and runs parallel processes. + +We use Python data structures so the test cases are easier to read and edit. +""" + +import optparse +import sys + +DISTRIBUTION_PARAMS_ASSOC = { + # name, num unique values 1, + # num unique values 2, num clients + 'tiny': (100, 2, int(1e03)), # test for insufficient data + 'small': (100, 10, int(1e04)), +# 'fizz-tiny': (100, 20, int(1e03)), +# 'fizz-tiny-bool': (100, 2, int(1e03)), +# 'fizz-small': (100, 20, int(1e04)), +# 'fizz-small-bool': (100, 2, int(1e04)), +# 'fizz': (100, 20, int(1e05)), +# 'fizz-large': (100, 50, int(1e05)), +# 'fizz-2large': (100, 50, int(5e05)), +# 'fizz-bool': (100, 2, int(1e05)), + 'medium': (1000, 10, int(1e05)), + 'medium2': (1000, 2, int(1e05)), + 'large': (10000, 10, int(1e06)), + 'large2': (10000, 2, int(1e06)), + 'largesquared': (int(1e04), 100, int(1e06)), + + # new test names for 2-way marginals + # includes testing for extras + 'fizz-tiny': (100, 20, int(1e03), int(1e04)), + 'fizz-tiny-bool': (100, 2, int(1e03), int(1e04)), + 'fizz-small': (100, 20, int(1e04), int(1e04)), + 'fizz-small-bool': (100, 2, int(1e04), int(1e04)), + 'fizz': (100, 20, int(1e05), int(1e04)), + 'fizz-bool': (100, 2, int(1e05), int(1e04)), + + 'toy': (5, 2, 1e04, 20), # for testing purposes only + 'compact-noextra-small': (40, 5, 1e04, 0), + 'loose-noextra-small': (100, 20, 1e04, 0), + 'compact-noextra-large': (40, 5, 1e06, 0), + 'loose-noextra-large': (100, 20, 1e06, 0), + 'compact-extra-small': (40, 5, int(1e04), int(1e04)), + 'loose-extra-small': (100, 20, int(1e04), int(1e04)), + 'compact-extra-large': (40, 5, int(1e06), int(1e04)), + 'loose-extra-large': (100, 20, int(1e06), int(1e04)), + 'compact-excess-small': (40, 5, int(1e04), int(1e05)), + 'loose-excess-small': (100, 20, int(1e04), int(1e05)), + 'compact-excess-large': (40, 5, int(1e06), int(1e05)), + 'loose-excess-large': (100, 20, int(1e06), int(1e05)), +} + +# 'k, h, m' as in params file. +BLOOMFILTER_PARAMS = { + '8x16': (8, 2, 16), # 16 cohorts, 8 bits each, 2 bits set in each + '8x32': (8, 2, 32), # 32 cohorts, 8 bits each, 2 bits set in each + '16x32': (16, 2, 32), # 32 cohorts, 16 bits each, 2 bits set in each + '8x128': (8, 2, 128), # 128 cohorts, 8 bits each, 2 bits set in each + '128x128': (128, 2, 128), # 8 cohorts, 128 bits each, 2 bits set in each +} + +# 'p, q, f' as in params file. +PRIVACY_PARAMS = { + 'eps_zero': (0, 0.99, 0), # testing purposes only! + 'eps_1_1': (0.39, 0.61, 0.45), # eps_1 = 1, eps_inf = 5: + 'eps_1_5': (0.225, 0.775, 0.0), # eps_1 = 5, no eps_inf + 'eps_verysmall': (0.125, 0.875, 0.125), + 'eps_small': (0.125, 0.875, 0.5), + 'eps_chrome': (0.25, 0.75, 0.5), + 'uma_rappor_type': (0.50, 0.75, 0.5), +} + +# assoc test configuration -> +# (distribution params set, bloomfilter params set, +# privacy params set) +# The test config runs a test suite that is the cross product of all the above +# sets +ASSOC_TEST_CONFIG = { + 'distr': ( +# 'fizz-tiny', +# 'fizz-tiny-bool', +# 'fizz-small', +# 'fizz-small-bool', +# 'fizz', +# 'fizz-bool',), + 'toy',), +# 'compact-noextra-small', +# 'loose-noextra-small', +# 'compact-extra-small', +# 'loose-extra-small', +# 'compact-excess-small', +# 'loose-excess-small',), +# 'compact-noextra-large', +# 'loose-noextra-large', +# 'compact-extra-large', +# 'loose-extra-large', +# 'compact-excess-large', +# 'loose-excess-large'), + 'blooms': ( + '8x32', + '16x32',), + 'privacy': ( + 'eps_small', + 'eps_chrome',) +} + +# +# END TEST CONFIGURATION +# + +def main(argv): + rows = [] + test_case = [] + # Association tests + for distr in ASSOC_TEST_CONFIG['distr']: + for blooms in ASSOC_TEST_CONFIG['blooms']: + for privacy in ASSOC_TEST_CONFIG['privacy']: + print distr, blooms, privacy + test_name = 'a-{}-{}-{}'.format(distr, blooms, privacy) + params = (BLOOMFILTER_PARAMS[blooms] + + PRIVACY_PARAMS[privacy]) + test_case = (test_name,) + DISTRIBUTION_PARAMS_ASSOC[distr] + params + row_str = [str(element) for element in test_case] + rows.append(row_str) + + for row in rows: + print ' '.join(row) + +if __name__ == '__main__': + try: + main(sys.argv) + except RuntimeError, e: + print >>sys.stderr, 'FATAL: %s' % e + sys.exit(1) diff --git a/tests/regtest_spec.py b/tests/regtest_spec.py index 6e0a602e..6350ae7a 100755 --- a/tests/regtest_spec.py +++ b/tests/regtest_spec.py @@ -41,49 +41,6 @@ ('large', 10000, 100000000, 1), ) -DISTRIBUTION_PARAMS_ASSOC = { - # name, num unique values 1, - # num unique values 2, num clients - 'tiny': (100, 2, int(1e03)), # test for insufficient data - 'small': (100, 10, int(1e04)), -# 'fizz-tiny': (100, 20, int(1e03)), -# 'fizz-tiny-bool': (100, 2, int(1e03)), -# 'fizz-small': (100, 20, int(1e04)), -# 'fizz-small-bool': (100, 2, int(1e04)), -# 'fizz': (100, 20, int(1e05)), -# 'fizz-large': (100, 50, int(1e05)), -# 'fizz-2large': (100, 50, int(5e05)), -# 'fizz-bool': (100, 2, int(1e05)), - 'medium': (1000, 10, int(1e05)), - 'medium2': (1000, 2, int(1e05)), - 'large': (10000, 10, int(1e06)), - 'large2': (10000, 2, int(1e06)), - 'largesquared': (int(1e04), 100, int(1e06)), - - # new test names for 2-way marginals - # includes testing for extras - 'fizz-tiny': (100, 20, int(1e03), int(1e04)), - 'fizz-tiny-bool': (100, 2, int(1e03), int(1e04)), - 'fizz-small': (100, 20, int(1e04), int(1e04)), - 'fizz-small-bool': (100, 2, int(1e04), int(1e04)), - 'fizz': (100, 20, int(1e05), int(1e04)), - 'fizz-bool': (100, 2, int(1e05), int(1e04)), - - 'toy': (5, 2, 1e04, 20), # for testing purposes only - 'compact-noextra-small': (40, 5, 1e04, 0), - 'loose-noextra-small': (100, 20, 1e04, 0), - 'compact-noextra-large': (40, 5, 1e06, 0), - 'loose-noextra-large': (100, 20, 1e06, 0), - 'compact-extra-small': (40, 5, int(1e04), int(1e04)), - 'loose-extra-small': (100, 20, int(1e04), int(1e04)), - 'compact-extra-large': (40, 5, int(1e06), int(1e04)), - 'loose-extra-large': (100, 20, int(1e06), int(1e04)), - 'compact-excess-small': (40, 5, int(1e04), int(1e05)), - 'loose-excess-small': (100, 20, int(1e04), int(1e05)), - 'compact-excess-large': (40, 5, int(1e06), int(1e05)), - 'loose-excess-large': (100, 20, int(1e06), int(1e05)), -} - # 'k, h, m' as in params file. BLOOMFILTER_PARAMS = { '8x16': (8, 2, 16), # 16 cohorts, 8 bits each, 2 bits set in each @@ -121,40 +78,6 @@ ('over_x10', '8x128', 'eps_1_1', 10.0, '10%'), # overshoot by x10 ] -# assoc test configuration -> -# (distribution params set, bloomfilter params set, -# privacy params set) -# The test config runs a test suite that is the cross product of all the above -# sets -ASSOC_TEST_CONFIG = { - 'distr': ( -# 'fizz-tiny', -# 'fizz-tiny-bool', -# 'fizz-small', -# 'fizz-small-bool', -# 'fizz', -# 'fizz-bool',), - 'toy',), -# 'compact-noextra-small', -# 'loose-noextra-small', -# 'compact-extra-small', -# 'loose-extra-small', -# 'compact-excess-small', -# 'loose-excess-small',), -# 'compact-noextra-large', -# 'loose-noextra-large', -# 'compact-extra-large', -# 'loose-extra-large', -# 'compact-excess-large', -# 'loose-excess-large'), - 'blooms': ( - '8x32', - '16x32',), - 'privacy': ( - 'eps_small', - 'eps_chrome',) -} - # # END TEST CONFIGURATION # @@ -184,19 +107,6 @@ def main(argv): for params in DEMO: rows.append(params) - # Association tests - for distr in ASSOC_TEST_CONFIG['distr']: - for blooms in ASSOC_TEST_CONFIG['blooms']: - for privacy in ASSOC_TEST_CONFIG['privacy']: - print distr, blooms, privacy - test_name = 'a-{}-{}-{}'.format(distr, blooms, privacy) - params = (BLOOMFILTER_PARAMS[blooms] + - PRIVACY_PARAMS[privacy]) - test_case = (test_name,) + DISTRIBUTION_PARAMS_ASSOC[distr] + params - row_str = [str(element) for element in test_case] - rows.append(row_str) - # End of association tests - for row in rows: print ' '.join(row) From 3e812611f475bb34e04cfa0f6bded64947ffdeb9 Mon Sep 17 00:00:00 2001 From: Ananth Raghunathan Date: Tue, 14 Jul 2015 14:08:27 -0700 Subject: [PATCH 46/67] Updated some documentation. --- assoctest.sh | 9 ++++----- tests/rappor_assoc_sim.py | 2 +- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/assoctest.sh b/assoctest.sh index 96bf597d..65fd9549 100755 --- a/assoctest.sh +++ b/assoctest.sh @@ -16,15 +16,14 @@ # run-all [] - run all tests, in parallel, each times # # Examples: -# $ ./regtest.sh run-seq tiny-8x16- # Sequential run, matches 2 cases -# $ ./regtest.sh run-seq tiny-8x16- 3 # Sequential, each test is run three +# $ ./assoctest.sh run-seq tiny-8x16- # Sequential run, matches 2 cases +# $ ./assoctest.sh run-seq tiny-8x16- 3 # Sequential, each test is run three # times -# $ ./regtest.sh run-all # Run all tests once +# $ ./assoctest.sh run-all # Run all tests once # # The argument is a regex in 'grep -E' format. (Detail: Don't # use $ in the pattern, since it matches the whole spec line and not just the -# test case name.) The number of processors used in a parallel run is one less -# than the number of CPUs on the machine. +# test case name.) The number of processors used in a parallel run is 5. # # fast_counts param inherited from regtest.sh, but currently not used diff --git a/tests/rappor_assoc_sim.py b/tests/rappor_assoc_sim.py index 1c6c026d..178bc509 100755 --- a/tests/rappor_assoc_sim.py +++ b/tests/rappor_assoc_sim.py @@ -1,6 +1,6 @@ #!/usr/bin/python # -# Copyright 2014 Google Inc. All rights reserved. +# Copyright 2015 Google Inc. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From a67151bdb4f9c98f3ad17d3619e02440ded6a4b5 Mon Sep 17 00:00:00 2001 From: Ananth Raghunathan Date: Wed, 15 Jul 2015 15:22:17 -0700 Subject: [PATCH 47/67] A few code changes. - moving some map code to read_input.R - analyze_assoc_expt now includes a verbose mode --- analysis/R/read_input.R | 17 +++++ assoctest.sh | 3 +- tests/analyze_assoc_expt.R | 140 ++++++++++++++++++------------------- tests/assoctest_spec.py | 14 ++-- 4 files changed, 95 insertions(+), 79 deletions(-) diff --git a/analysis/R/read_input.R b/analysis/R/read_input.R index 95ea1b0d..35c2cead 100644 --- a/analysis/R/read_input.R +++ b/analysis/R/read_input.R @@ -101,6 +101,23 @@ ReadMapFile <- function(map_file, params = NULL, quote = "") { list(map = map, strs = strs, map_pos = map_pos) } +# This function processes the maps loaded using ReadMapFile +# Association analysis requires a map object with a map +# field that has the map split into cohorts and an rmap field +# that has all the cohorts combined +# Arguments: +# map = map object with cohorts as sparse matrix in +# object map$map +# This is the expected object from ReadMapFile +# params = data field with parameters +ProcessMap <- function(map, params) { + map$rmap <- map$map + map$map <- lapply(1:params$m, function(i) + map$rmap[seq(from = ((i - 1) * params$k + 1), + length.out = params$k),]) + map +} + LoadMapFile <- function(map_file, params = NULL, quote = "") { # Reads the map file and creates an R binary .rda. If the .rda file already # exists, just loads that file. NOTE: It assumes the map file is diff --git a/assoctest.sh b/assoctest.sh index 65fd9549..e1fe9446 100755 --- a/assoctest.sh +++ b/assoctest.sh @@ -176,10 +176,11 @@ _run-one-instance() { inp['num'] = $num_clients; \ inp['extras'] = $num_extras; \ inp['varcandidates'] = [$num_unique_values, $num_unique_values2]; \ + inp['verbose'] = 'false'; \ inp['counts'] = ['$instance_dir/case_2way.csv',\ '$instance_dir/case_marg1.csv',\ '$instance_dir/case_marg2.csv']; \ - inp['expt'] = ['external-counts', 'external-counts-new']; \ + inp['expt'] = ['external-counts', 'external-reports-em']; \ json.dump(inp, f); \ f.close();" diff --git a/tests/analyze_assoc_expt.R b/tests/analyze_assoc_expt.R index d91dad21..bbda4204 100755 --- a/tests/analyze_assoc_expt.R +++ b/tests/analyze_assoc_expt.R @@ -43,22 +43,11 @@ source("analysis/R/read_input.R") source("analysis/R/association.R") source("tests/gen_counts.R") -# This function processes the maps loaded using ReadMapFile -# Association analysis requires a map object with a map -# field that has the map split into cohorts and an rmap field -# that has all the cohorts combined -# Arguments: -# map = map object with cohorts as sparse matrix in -# object map$map -# This is the expected object from ReadMapFile -# params = data field with parameters -# TODO(pseudorandom): move this functionality to ReadMapFile -ProcessMap <- function(map, params) { - map$rmap <- map$map - map$map <- lapply(1:params$m, function(i) - map$rmap[seq(from = ((i - 1) * params$k + 1), - length.out = params$k),]) - map +# Wrapper function to print strings only if verbose flag is passed in +PrintIfVerbose <- function(string, flag = FALSE) { + if(flag == TRUE) { + print(string) + } } # TV distance = L1 distance / 2 = 1 - sum(min(df1|x, df2|x)) where @@ -90,7 +79,6 @@ CombineReports <- function(reports1, reports2) { function(x) as.vector(sapply(x, function(z) two_bits[[z+1]]))) } - # Given 2 lists of maps, maps1 and maps2, the function # combines the maps by cohort and outputs both # cohort-organized maps and flattened versions @@ -203,19 +191,25 @@ GenerateNoiseMatrix <- function(params) { NoiseMatrix } -# ------------------------------------------------------------------------ +##################################################################### ## -## Direct simulation of reports without simulated variance +## Direct simulation of reports WITHOUT simulated variance ## -## Inputs: +## Inputs: inp object (from parsing JSON) with +## num - # of reports +## params - file containing RAPPOR params +## varcandidates - list containing # of candidates for each var +## numvars - # of vars (>=2 for association) +## extra - # of extra candidates for var 1 +## ## -## Outputs: -# -# ------------------------------------------------------------------------ -DirectSimulationOfReports <- function(inp) { - params <- ReadParameterFile(inp$params) - # TWO WAY ASSOCIATIONS; INPUTS SIMULATED DIRECTLY - +## Outputs: Runs simulation of two-way association analysis by directly +## simulating the counts of one way and two way marginals +## +##################################################################### +DirectSimulationOfReports <- function(inp, verbose = FALSE) { + ptm <- proc.time() + params <- ReadParameterFile(inp$params) strconstant <- c("string", "option") N <- inp$num n1 <- inp$varcandidates[[1]] @@ -268,13 +262,13 @@ DirectSimulationOfReports <- function(inp) { found_strings <- lapply(1:2, function(i) Decode(ow_counts[[i]], map[[i]]$rmap, - params, quick = TRUE)$fit$strings) + params, quick = TRUE)$fit[,"string"]) # -------------- rownames(td) <- uvals[[1]][1:n1] # Don't take into account extras colnames(td) <- uvals[[2]] - print("TRUE DISTRIBUTION") - print(signif(td, 4)) + PrintIfVerbose("TRUE DISTRIBUTION", verbose) + PrintIfVerbose(signif(td, 4), verbose) cohorts <- as.matrix( apply(as.data.frame(final_part), 1, function(count) RandomPartition(count, rep(1, params$m)))) @@ -313,11 +307,11 @@ DirectSimulationOfReports <- function(inp) { ed[is.na(ed)] <- 0 time_taken <- proc.time() - ptm - print("2 WAY RESULTS") - print(signif(ed, 4)) - print(TVDistance(td, ed, "TV DISTANCE 2 WAY ALGORITHM")) - print("PROC.TIME") - print(time_taken) + PrintIfVerbose("2 WAY RESULTS", verbose) + PrintIfVerbose(signif(ed, 4), verbose) + PrintIfVerbose(TVDistance(td, ed, "TV DISTANCE 2 WAY ALGORITHM"), verbose) + PrintIfVerbose("PROC.TIME", verbose) + PrintIfVerbose(time_taken, verbose) chisq_td <- chisq.test(td)[1][[1]][[1]] chisq_ed <- chisq.test(ed)[1][[1]][[1]] if(is.nan(chisq_ed)) { @@ -339,19 +333,21 @@ DirectSimulationOfReports <- function(inp) { write.csv(metrics, file = filename, row.names = FALSE) } -# ------------------------------------------------------------------------ +##################################################################### ## ## Externally provided counts (gen_assoc_counts.R and sum_assoc_reports.py) -## 2 WAY ASSOCIATION ONLY +## new_decode flag allows you to switch between two decode algorithm choices +## Note: Only for two way associations ## -## Inputs: +## Inputs: inp object (from parsing JSON) with ## count files (2 way counts, individual marginal counts) ## map files (2 variables) +## params file with RAPPOR params ## -## Outputs: -# -# ------------------------------------------------------------------------ -ExternalCounts <- function(inp, new_decode = FALSE) { +## Outputs: Runs simulation of two-way association analysis reading inputs +## from counts, maps, and params file. +##################################################################### +ExternalCounts <- function(inp, verbose = FALSE, metrics_filename = "metrics.csv") { ptm <- proc.time() params <- ReadParameterFile(inp$params) # Ensure sufficient maps as required by number of vars @@ -375,8 +371,8 @@ ExternalCounts <- function(inp, new_decode = FALSE) { found_strings = list(fit[[1]][,"string"], fit[[2]][,"string"]) if (length(found_strings[[1]]) == 0 || length(found_strings[[2]]) == 0) { - print("FOUND_STRINGS") - print(found_strings) + PrintIfVerbose("FOUND_STRINGS", verbose) + PrintIfVerbose(found_strings, verbose) stop("No strings found in 1-way marginal.") } @@ -400,9 +396,9 @@ ExternalCounts <- function(inp, new_decode = FALSE) { time_taken <- proc.time() - ptm - print(TVDistance(td, ed, "TV DISTANCE 2 WAY")) - print("PROC.TIME") - print(time_taken) + PrintIfVerbose(TVDistance(td, ed, "TV DISTANCE 2 WAY"), verbose) + PrintIfVerbose("PROC.TIME", verbose) + PrintIfVerbose(time_taken, verbose) chisq_td <- chisq.test(td)[1][[1]][[1]] chisq_ed <- chisq.test(ed)[1][[1]][[1]] if(is.nan(chisq_td)) { @@ -421,16 +417,12 @@ ExternalCounts <- function(inp, new_decode = FALSE) { dim2 = length(found_strings[[2]]) ) - # Write metrics to metrics.csv - if (new_decode == TRUE) { - filename <- file.path(inp$outdir, 'metrics_2.csv') - } else { - filename <- file.path(inp$outdir, 'metrics.csv') - } + # Write metrics to metrics_filename (default: metrics.csv) + filename <- file.path(inp$outdir, metrics_filename) write.csv(metrics, file = filename, row.names = FALSE) } -# ------------------------------------------------------------------------ +##################################################################### ## ## Externally provided reports ## EM ALGORITHM @@ -439,9 +431,9 @@ ExternalCounts <- function(inp, new_decode = FALSE) { ## Inputs: ## ## Outputs: -# -# ------------------------------------------------------------------------ -ExternalReportsEM <- function(inp) { +## +##################################################################### +ExternalReportsEM <- function(inp, verbose = FALSE) { ptm <- proc.time() params <- ReadParameterFile(inp$params) # Ensure sufficient maps as required by number of vars @@ -488,9 +480,9 @@ ExternalReportsEM <- function(inp) { td <- td / sum(td) time_taken <- proc.time() - ptm - print(TVDistance(td, em, "TV DISTANCE EM")) - print("PROC.TIME") - print(time_taken) + PrintIfVerbose(TVDistance(td, em, "TV DISTANCE EM"), verbose) + PrintIfVerbose("PROC.TIME", verbose) + PrintIfVerbose(time_taken, verbose) chisq_td <- chisq.test(td)[1][[1]][[1]] chisq_ed <- chisq.test(em)[1][[1]][[1]] if(is.nan(chisq_td)) { @@ -516,27 +508,33 @@ ExternalReportsEM <- function(inp) { main <- function(opts) { inp <- fromJSON(opts$inp) - + verbose_flag <- inp$verbose # Choose from a set of experiments to run # direct -> direct simulation of reports (without variances) # external-counts -> externally supplied counts for 2 way and marginals # external-reports -> externally supplied reports if("direct" %in% inp$expt) { - print("---------- RUNNING EXPERIMENT DIRECT ----------") - DirectSimulationOfReports(inp) + PrintIfVerbose("Running Experiment Direct", verbose_flag) + DirectSimulationOfReports(inp, verbose = verbose_flag) } if ("external-counts" %in% inp$expt) { - print("---------- RUNNING EXPERIMENT EXT COUNTS ----------") - ExternalCounts(inp) - } - if ("external-counts-new" %in% inp$expt) { - print("---------- RUNNING EXPERIMENT EXT COUNTS ----------") - ExternalCounts(inp, new_decode = TRUE) + PrintIfVerbose("Running Experiment Ext Counts", verbose_flag) + if ("direct" %in% inp$expt) { + # external-counts expt is run to compare results + ExternalCounts(inp, verbose = verbose_flag, metrics_filename = "metrics_2.csv") + } else { + ExternalCounts(inp, verbose = verbose_flag) + } } if ("external-reports-em" %in% inp$expt) { - print("---------- RUNNING EXPERIMENT EXT REPORTS ----------") - ExternalReportsEM(inp) + PrintIfVerbose("Running Experiment Ext Reports", verbose_flag) + if (("direct" %in% inp$expt)||("external-counts" %in% inp$expt)) { + # external-reports-em expt is run to compare results + ExternalCounts(inp, verbose = verbose_flag, metrics_filename = "metrics_2.csv") + } else { + ExternalCounts(inp, verbose = verbose_flag) + } } } diff --git a/tests/assoctest_spec.py b/tests/assoctest_spec.py index 3d1642dd..a10847b4 100755 --- a/tests/assoctest_spec.py +++ b/tests/assoctest_spec.py @@ -86,13 +86,13 @@ # 'fizz-small-bool', # 'fizz', # 'fizz-bool',), - 'toy',), -# 'compact-noextra-small', -# 'loose-noextra-small', -# 'compact-extra-small', -# 'loose-extra-small', -# 'compact-excess-small', -# 'loose-excess-small',), +# 'toy',), + 'compact-noextra-small', + 'loose-noextra-small', + 'compact-extra-small', + 'loose-extra-small', + 'compact-excess-small', + 'loose-excess-small',), # 'compact-noextra-large', # 'loose-noextra-large', # 'compact-extra-large', From 420b6a0c8d747b7a05a8b7b6d02ddcfe5cd0df6e Mon Sep 17 00:00:00 2001 From: Ananth Raghunathan Date: Wed, 15 Jul 2015 16:59:08 -0700 Subject: [PATCH 48/67] Small fixes, updates to assoctest.sh --- assoctest.sh | 2 +- tests/analyze_assoc_expt.R | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/assoctest.sh b/assoctest.sh index e1fe9446..de33c403 100755 --- a/assoctest.sh +++ b/assoctest.sh @@ -176,7 +176,7 @@ _run-one-instance() { inp['num'] = $num_clients; \ inp['extras'] = $num_extras; \ inp['varcandidates'] = [$num_unique_values, $num_unique_values2]; \ - inp['verbose'] = 'false'; \ + inp['verbose'] = 'true'; \ inp['counts'] = ['$instance_dir/case_2way.csv',\ '$instance_dir/case_marg1.csv',\ '$instance_dir/case_marg2.csv']; \ diff --git a/tests/analyze_assoc_expt.R b/tests/analyze_assoc_expt.R index bbda4204..cb0a2087 100755 --- a/tests/analyze_assoc_expt.R +++ b/tests/analyze_assoc_expt.R @@ -433,7 +433,7 @@ ExternalCounts <- function(inp, verbose = FALSE, metrics_filename = "metrics.csv ## Outputs: ## ##################################################################### -ExternalReportsEM <- function(inp, verbose = FALSE) { +ExternalReportsEM <- function(inp, verbose = FALSE, metrics_filename = "metrics.csv") { ptm <- proc.time() params <- ReadParameterFile(inp$params) # Ensure sufficient maps as required by number of vars @@ -501,8 +501,8 @@ ExternalReportsEM <- function(inp, verbose = FALSE) { dim2 = dim(em)[[2]] ) - # Write metrics to metrics.csv - filename <- file.path(inp$outdir, 'metrics_2.csv') + # Write metrics to metrics_filename (default: metrics.csv) + filename <- file.path(inp$outdir, metrics_filename) write.csv(metrics, file = filename, row.names = FALSE) } @@ -531,9 +531,9 @@ main <- function(opts) { PrintIfVerbose("Running Experiment Ext Reports", verbose_flag) if (("direct" %in% inp$expt)||("external-counts" %in% inp$expt)) { # external-reports-em expt is run to compare results - ExternalCounts(inp, verbose = verbose_flag, metrics_filename = "metrics_2.csv") + ExternalReportsEM(inp, verbose = verbose_flag, metrics_filename = "metrics_2.csv") } else { - ExternalCounts(inp, verbose = verbose_flag) + ExternalReportsEM(inp, verbose = verbose_flag) } } } From f73aac4bd65391e5435ed10030d5fbab836230b7 Mon Sep 17 00:00:00 2001 From: Ananth Raghunathan Date: Tue, 21 Jul 2015 09:17:37 -0700 Subject: [PATCH 49/67] Incorporating changes from master - resolved conflicts - modified code to use new Encode interface - modified rappor_assoc_sim.py to use same interface as rappor_sim.py --- analysis/R/association.R | 3 +++ analysis/R/decode.R | 43 +++----------------------------- analysis/tools/sum_bits_assoc.py | 8 ++++-- assoctest.sh | 8 +++--- tests/analyze_assoc_expt.R | 11 ++++---- tests/assoctest_spec.py | 10 ++++---- tests/gen_assoc_reports.R | 10 +++++--- tests/rappor_assoc_sim.py | 8 +++--- 8 files changed, 38 insertions(+), 63 deletions(-) diff --git a/analysis/R/association.R b/analysis/R/association.R index 2a19656f..f2d6f59c 100644 --- a/analysis/R/association.R +++ b/analysis/R/association.R @@ -355,6 +355,9 @@ ComputeDistributionEM <- function(reports, report_cohorts, params) found_strings[[j]] <- c(found_strings[[j]], "Other") } + + GetCondProb(variable_report[[1]], candidate_strings = rownames(marginal), + params = params, map$map[[variable_cohort[1]]], prob_other[[variable_cohort[1]]]) # Get the joint conditional distribution cond_report_dist <- lapply(seq(length(variable_report)), function(i) { diff --git a/analysis/R/decode.R b/analysis/R/decode.R index 9c37094b..ba9eb9c6 100644 --- a/analysis/R/decode.R +++ b/analysis/R/decode.R @@ -227,41 +227,14 @@ FitDistribution <- function(estimates_stds, map, quiet = FALSE) { # according to this vector approximates estimates S <- ncol(map) # total number of candidates -<<<<<<< HEAD lasso <- FitLasso(map, as.vector(t(estimates_stds$estimates))) -======= - - support_coefs <- 1:S - - if (S > length(estimates_stds$estimates) * .8) { - # the system is close to being underdetermined - lasso <- FitLasso(map, as.vector(t(estimates_stds$estimates))) ->>>>>>> master - - # Select non-zero coefficients. - support_coefs <- which(lasso > 0) + + if(!quiet) + cat("LASSO selected ", sum(lasso > 0), " non-zero coefficients.\n") -<<<<<<< HEAD names(lasso) <- colnames(map) lasso - } -======= - if(!quiet) - cat("LASSO selected ", length(support_coefs), " non-zero coefficients.\n") - } - - coefs <- setNames(rep(0, S), colnames(map)) - - if(length(support_coefs) > 0) { # LASSO may return an empty list - constrained_coefs <- ConstrainedLinModel(map[, support_coefs, drop = FALSE], - estimates_stds) - - coefs[support_coefs] <- constrained_coefs - } - - coefs } ->>>>>>> master Resample <- function(e) { # Simulate resampling of the Bloom filter estimates by adding Gaussian noise @@ -302,13 +275,9 @@ Decode <- function(counts, map, params, alpha = 0.05, quick = FALSE, coefs_all <- vector() # Run the fitting procedure several times (5 seems to be sufficient and not # too many) to estimate standard deviation of the output. -<<<<<<< HEAD if(quick) {num_reps <- 2} else {num_reps <- 5} for(r in 1:num_reps) { -======= - for(r in 1:5) { ->>>>>>> master if(r > 1) e <- Resample(estimates_stds_filtered) else @@ -359,17 +328,11 @@ Decode <- function(counts, map, params, alpha = 0.05, quick = FALSE, fit$prop_std_error <- fit$std_error / N # 1.96 standard deviations gives 95% confidence interval. -<<<<<<< HEAD - fit$prop_low_95 <- fit$proportion - 1.96 * fit$prop_std_error - fit$prop_high_95 <- fit$proportion + 1.96 * fit$prop_std_error -======= low_95 <- fit$proportion - 1.96 * fit$prop_std_error high_95 <- fit$proportion + 1.96 * fit$prop_std_error # Clamp estimated proportion. pmin/max: vectorized min and max fit$prop_low_95 <- pmax(low_95, 0.0) fit$prop_high_95 <- pmin(high_95, 1.0) - ->>>>>>> master fit <- fit[, c("string", "estimate", "std_error", "proportion", "prop_std_error", "prop_low_95", "prop_high_95")] allocated_mass <- sum(fit$proportion) diff --git a/analysis/tools/sum_bits_assoc.py b/analysis/tools/sum_bits_assoc.py index 9bdd7f95..a858d78f 100755 --- a/analysis/tools/sum_bits_assoc.py +++ b/analysis/tools/sum_bits_assoc.py @@ -45,7 +45,7 @@ def SumBits(params, stdin, f_2way, f_1, f_2): for i, row in enumerate(csv_in): try: - (user_id, cohort, irr_1, irr_2) = row + (_, cohort, irr_1, irr_2) = row except ValueError: raise RuntimeError('Error parsing row %r' % row) @@ -53,7 +53,11 @@ def SumBits(params, stdin, f_2way, f_1, f_2): continue # skip header cohort = int(cohort) - num_reports[cohort] += 1 + try: + num_reports[cohort] += 1 + except IndexError: + raise RuntimeError('Error indexing cohort number %d (num_cohorts is %d) \ + ' % (cohort, num_cohorts)) # TODO: Extend checking for both reports if not len(irr_1) == params.num_bloombits: diff --git a/assoctest.sh b/assoctest.sh index de33c403..132cd917 100755 --- a/assoctest.sh +++ b/assoctest.sh @@ -125,13 +125,13 @@ _run-one-instance() { -p $p \ -q $q \ -f $f \ - -i $instance_dir/case.csv \ - --out-prefix "$instance_dir/case" + < $instance_dir/case.csv \ + > "$instance_dir/case_reports.csv" analysis/tools/sum_bits_assoc.py \ $case_dir/case_params.csv \ "$instance_dir/case" \ - < $instance_dir/case_out.csv + < $instance_dir/case_reports.csv # Setting up JSON file containing assoc_sim inputs with python @@ -167,7 +167,7 @@ _run-one-instance() { inp = dict(); \ inp['maps'] = ['$case_dir/case_map1.csv',\ '$case_dir/case_map2.csv']; \ - inp['reports'] = '$instance_dir/case_out.csv'; \ + inp['reports'] = '$instance_dir/case_reports.csv'; \ inp['truefile'] = '$instance_dir/case.csv'; \ inp['outdir'] = '$out_dir'; \ inp['params'] = '$case_dir/case_params.csv'; \ diff --git a/tests/analyze_assoc_expt.R b/tests/analyze_assoc_expt.R index cb0a2087..37e65426 100755 --- a/tests/analyze_assoc_expt.R +++ b/tests/analyze_assoc_expt.R @@ -382,8 +382,8 @@ ExternalCounts <- function(inp, verbose = FALSE, metrics_filename = "metrics.csv lapply(map[[i]]$map, function(z) z[,found_strings[[i]], drop = FALSE])) crmap <- CombineMaps(pruned[[1]], pruned[[2]])$crmap marginal <- Decode2Way(counts[[1]], crmap, params2, fit = fit)$fit - td <- read.csv(file = inp$truefile, header = FALSE) - td <- table(td[,2:3]) + td <- read.csv(file = inp$truefile, header = TRUE) + td <- table(td[,3:4]) td <- td / sum(td) ed <- td for (cols in colnames(td)) { @@ -443,13 +443,14 @@ ExternalReportsEM <- function(inp, verbose = FALSE, metrics_filename = "metrics. params = params)) # Reports must be of the format - # cohort no, rappor bitstring 1, rappor bitstring 2, ... + # client name, cohort no, rappor bitstring 1, rappor bitstring 2, ... reportsObj <- read.csv(inp$reports, - colClasses = c("integer", "integer", + colClasses = c("character", "integer", rep("character", inp$numvars)), header = TRUE) # Ignore the first column reportsObj <- reportsObj[,-1] + # Parsing reportsObj # ComputeDistributionEM allows for different sets of cohorts # for each variable. Here, both sets of cohorts are identical @@ -476,7 +477,7 @@ ExternalReportsEM <- function(inp, verbose = FALSE, metrics_filename = "metrics. estimate_var = FALSE) em <- joint_dist$fit td <- read.csv(file = inp$truefile, header = FALSE) - td <- table(td[,2:3]) + td <- table(td[,3:4]) td <- td / sum(td) time_taken <- proc.time() - ptm diff --git a/tests/assoctest_spec.py b/tests/assoctest_spec.py index a10847b4..c798a5ea 100755 --- a/tests/assoctest_spec.py +++ b/tests/assoctest_spec.py @@ -88,11 +88,11 @@ # 'fizz-bool',), # 'toy',), 'compact-noextra-small', - 'loose-noextra-small', - 'compact-extra-small', - 'loose-extra-small', - 'compact-excess-small', - 'loose-excess-small',), + 'loose-noextra-small',), +# 'compact-extra-small', +# 'loose-extra-small', +# 'compact-excess-small', +# 'loose-excess-small',), # 'compact-noextra-large', # 'loose-noextra-large', # 'compact-extra-large', diff --git a/tests/gen_assoc_reports.R b/tests/gen_assoc_reports.R index 41eb045a..fa83e95d 100755 --- a/tests/gen_assoc_reports.R +++ b/tests/gen_assoc_reports.R @@ -14,6 +14,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +# TODO: Rename reports to values (more in line with its usage for histogram +# RAPPOR) source('tests/gen_counts.R') # Usage: @@ -69,13 +71,15 @@ main <- function(argv) { perm <- sample(N) values <- list(values[[1]][perm], values[[2]][perm]) - # Obtain reports by prefixing values with "v"s. Even slower than shuffling. + # Prepend with str and opt reports <- list(sprintf("str%d", values[[1]]), sprintf("opt%d", values[[2]])) - reports <- cbind(1:N, reports[[1]], reports[[2]]) # paste together "1 v342" + # paste together client name, cohort input, report1, report2 + reports <- cbind(sprintf("cli%d", 1:N), 1:N, reports[[1]], reports[[2]]) + colnames(reports) <- c("client", "cohort", "value1", "value2") - write.table(reports, file = out_file, row.names = FALSE, col.names = FALSE, + write.table(reports, file = out_file, row.names = FALSE, col.names = TRUE, sep = ",", quote = FALSE) } diff --git a/tests/rappor_assoc_sim.py b/tests/rappor_assoc_sim.py index 8d3fed41..b46c8436 100755 --- a/tests/rappor_assoc_sim.py +++ b/tests/rappor_assoc_sim.py @@ -119,8 +119,8 @@ def main(argv): # instance up front per client, rather than one per row below. start_time = time.time() - for i, (client_str, cohort_str, true_value_1, true_value_2) in - enumerate(csv_in): + for i, (client_str, cohort_str, true_value_1, + true_value_2) in enumerate(csv_in): if i == 0: if client_str != 'client': raise RuntimeError('Expected client header, got %s' % client_str) @@ -139,7 +139,7 @@ def main(argv): elapsed = time.time() - start_time log('Processed %d inputs in %.2f seconds', i, elapsed) - cohort = int(cohort_str) + cohort = int(cohort_str) % params.num_cohorts secret = client_str e = rappor.Encoder(params, cohort, secret, irr_rand) @@ -150,7 +150,7 @@ def main(argv): irr_1_str = rappor.bit_string(irr_1, params.num_bloombits) irr_2_str = rappor.bit_string(irr_2, params.num_bloombits) - out_row = (cohort_str, irr_1_str, irr_2_str) + out_row = (client_str, cohort, irr_1_str, irr_2_str) csv_out.writerow(out_row) From ab783199b32f1a4194cbfa8a459d7493783c87f7 Mon Sep 17 00:00:00 2001 From: Ananth Raghunathan Date: Tue, 21 Jul 2015 13:02:43 -0700 Subject: [PATCH 50/67] Added a test for gen_assoc_reports.R Also, some minor refactoring. --- assoctest.sh | 6 +++++- tests/analyze_assoc.R | 9 +++++++++ tests/assoc_sim.R | 12 ++++++++++++ tests/assoctest_spec.py | 8 ++++---- tests/gen_assoc_reports.R | 32 +++++++++++++++++++++++--------- tests/gen_assoc_reports_test.R | 34 ++++++++++++++++++++++++++++++++++ 6 files changed, 87 insertions(+), 14 deletions(-) create mode 100755 tests/gen_assoc_reports_test.R diff --git a/assoctest.sh b/assoctest.sh index 132cd917..f9dc392b 100755 --- a/assoctest.sh +++ b/assoctest.sh @@ -115,7 +115,7 @@ _run-one-instance() { banner "Generating input" tests/gen_assoc_reports.R $num_unique_values $num_unique_values2 \ - $num_clients $instance_dir/case.csv + $num_clients $num_cohorts $instance_dir/case.csv banner "Running RAPPOR client" tests/rappor_assoc_sim.py \ @@ -135,6 +135,8 @@ _run-one-instance() { # Setting up JSON file containing assoc_sim inputs with python + # Currently unused as true values and RAPPOR'd reports are generated + # running gen_assoc_reports.R, rappor_assoc_sim.py, and sum_bits_assoc.py python -c "import json; \ f = file('$instance_dir/assoc_inp.json', 'w'); \ inp = dict(); \ @@ -151,6 +153,8 @@ _run-one-instance() { json.dump(inp, f); \ f.close();" + # Currently unused as true values and RAPPOR'd reports are generated + # running gen_assoc_reports.R, rappor_assoc_sim.py, and sum_bits_assoc.py # tests/assoc_sim_expt.R --inp $instance_dir/assoc_inp.json local out_dir=${instance_dir}_report diff --git a/tests/analyze_assoc.R b/tests/analyze_assoc.R index 4e6af972..100d204f 100755 --- a/tests/analyze_assoc.R +++ b/tests/analyze_assoc.R @@ -30,6 +30,15 @@ # intel 0.1 0.3 # google 0.5 0.1 +############################################################################## +############################################################################## +############################################################################## +# D E P R E C A T E D +# Please use analyze_assoc_expt.R to run assoc analysis experiments +############################################################################## +############################################################################## +############################################################################## + library("optparse") options(stringsAsFactors = FALSE) diff --git a/tests/assoc_sim.R b/tests/assoc_sim.R index c1166bc1..1b1726de 100755 --- a/tests/assoc_sim.R +++ b/tests/assoc_sim.R @@ -26,6 +26,18 @@ # reports.csv file containing reports # map_{1, 2, ...}.csv file(s) containing maps of variables +############################################################################## +############################################################################## +############################################################################## +# D E P R E C A T E D +# Please look at workflow to use analyze_assoc_expt.R and +# run gen_assoc_reports.R, rappor_assoc_sim.py, and sum_bits_assoc.py +# to generate inputs to association analysis +# (For more details, see _run-one-instance() in assoctest.sh) +############################################################################## +############################################################################## +############################################################################## + library("optparse") options(stringsAsFactors = FALSE) diff --git a/tests/assoctest_spec.py b/tests/assoctest_spec.py index c798a5ea..b6952730 100755 --- a/tests/assoctest_spec.py +++ b/tests/assoctest_spec.py @@ -87,8 +87,8 @@ # 'fizz', # 'fizz-bool',), # 'toy',), - 'compact-noextra-small', - 'loose-noextra-small',), + 'compact-noextra-small',), +# 'loose-noextra-small',), # 'compact-extra-small', # 'loose-extra-small', # 'compact-excess-small', @@ -100,8 +100,8 @@ # 'compact-excess-large', # 'loose-excess-large'), 'blooms': ( - '8x32', - '16x32',), + '8x32',), +# '16x32',), 'privacy': ( 'eps_small', 'eps_chrome',) diff --git a/tests/gen_assoc_reports.R b/tests/gen_assoc_reports.R index fa83e95d..e6adb7e6 100755 --- a/tests/gen_assoc_reports.R +++ b/tests/gen_assoc_reports.R @@ -16,6 +16,7 @@ # TODO: Rename reports to values (more in line with its usage for histogram # RAPPOR) + source('tests/gen_counts.R') # Usage: @@ -30,10 +31,12 @@ source('tests/gen_counts.R') # Output: # csv file with reports sampled according to the specified distribution. -main <- function(argv) { - n <- list(as.integer(argv[[1]]), as.integer(argv[[2]])) - N <- as.integer(argv[[3]]) - out_file <- argv[[4]] +GenerateAssocReports <- function(n, N, num_cohorts) { + # Inputs: n, a list of supports for vars 1, 2 + # N, the number of reports/clients + # num_cohorts, the number of cohorts + # Output: tuples of values sampled according to a zipf x zipf distr + # with support n[[1]] and n[[2]] respectively # Sample values to compute partition # Resulting distribution is a correlated zipf x zipf @@ -70,15 +73,26 @@ main <- function(argv) { # Shuffle values randomly (may take a few sec for > 10^8 inputs) perm <- sample(N) values <- list(values[[1]][perm], values[[2]][perm]) + cohorts <- rep(1:N) %% num_cohorts + list(cohorts = cohorts, values = values) +} +main <- function(argv) { + n <- list(as.integer(argv[[1]]), as.integer(argv[[2]])) + N <- as.integer(argv[[3]]) + num_cohorts <- as.integer(argv[[4]]) + out_file <- argv[[5]] + + res <- GenerateAssocReports(n, N, num_cohorts) # Prepend with str and opt - reports <- list(sprintf("str%d", values[[1]]), - sprintf("opt%d", values[[2]])) + reports <- list(sprintf("str%d", res$values[[1]]), + sprintf("opt%d", res$values[[2]])) - # paste together client name, cohort input, report1, report2 - reports <- cbind(sprintf("cli%d", 1:N), 1:N, reports[[1]], reports[[2]]) - colnames(reports) <- c("client", "cohort", "value1", "value2") + # Paste together client name, cohort input, report1, report2 + reports <- cbind(sprintf("cli%d", 1:N), + res$cohorts, reports[[1]], reports[[2]]) + colnames(reports) <- c("client", "cohort", "value1", "value2") write.table(reports, file = out_file, row.names = FALSE, col.names = TRUE, sep = ",", quote = FALSE) } diff --git a/tests/gen_assoc_reports_test.R b/tests/gen_assoc_reports_test.R new file mode 100755 index 00000000..10f88c51 --- /dev/null +++ b/tests/gen_assoc_reports_test.R @@ -0,0 +1,34 @@ +#!/usr/bin/Rscript +# +# gen_reports_test.R + +source('analysis/R/util.R') # Log() + +source('tests/gen_assoc_reports.R') # module under test + +library(RUnit) + +TestGenerateAssocReports <- function() { + # list for support of var1, var2, + # total number of reports + # num_cohorts + res <- GenerateAssocReports(list(20, 5), 1000, 32) + # print(res$values) + + # 1000 reports + checkEquals(1000, length(res$values[[1]])) + + # support(var1) <= 20 + # support(var2) <= 5 + checkTrue(max(res$values[[1]]) <= 20) + checkTrue(max(res$values[[2]]) <= 5) + + # Ensure cohorts are filled up + checkEquals(32, length(unique(res$cohort))) +} + +TestAll <- function(){ + TestGenerateAssocReports() +} + +TestAll() From feee5d8b3d74d5519b215ed7504df37f9d5886b4 Mon Sep 17 00:00:00 2001 From: Ananth Raghunathan Date: Tue, 21 Jul 2015 14:19:09 -0700 Subject: [PATCH 51/67] Replaced regtest_spec.py from master branch. --- tests/regtest_spec.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/tests/regtest_spec.py b/tests/regtest_spec.py index 6350ae7a..6774e400 100755 --- a/tests/regtest_spec.py +++ b/tests/regtest_spec.py @@ -45,20 +45,14 @@ BLOOMFILTER_PARAMS = { '8x16': (8, 2, 16), # 16 cohorts, 8 bits each, 2 bits set in each '8x32': (8, 2, 32), # 32 cohorts, 8 bits each, 2 bits set in each - '16x32': (16, 2, 32), # 32 cohorts, 16 bits each, 2 bits set in each '8x128': (8, 2, 128), # 128 cohorts, 8 bits each, 2 bits set in each '128x128': (128, 2, 128), # 8 cohorts, 128 bits each, 2 bits set in each } # 'p, q, f' as in params file. PRIVACY_PARAMS = { - 'eps_zero': (0, 0.99, 0), # testing purposes only! 'eps_1_1': (0.39, 0.61, 0.45), # eps_1 = 1, eps_inf = 5: 'eps_1_5': (0.225, 0.775, 0.0), # eps_1 = 5, no eps_inf - 'eps_verysmall': (0.125, 0.875, 0.125), - 'eps_small': (0.125, 0.875, 0.5), - 'eps_chrome': (0.25, 0.75, 0.5), - 'uma_rappor_type': (0.50, 0.75, 0.5), } # For deriving candidates from true inputs. @@ -82,6 +76,7 @@ # END TEST CONFIGURATION # + def main(argv): rows = [] From 7936fc9faed65267cf3d398ce7c3fac7fb379e2e Mon Sep 17 00:00:00 2001 From: Ananth Raghunathan Date: Tue, 21 Jul 2015 14:25:46 -0700 Subject: [PATCH 52/67] Moving deprecated code to experimental directory --- {tests => experimental/assoc}/analyze_assoc.R | 0 {tests => experimental/assoc}/assoc_sim.R | 0 {tests => experimental/assoc}/assoc_sim_expt.R | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename {tests => experimental/assoc}/analyze_assoc.R (100%) rename {tests => experimental/assoc}/assoc_sim.R (100%) rename {tests => experimental/assoc}/assoc_sim_expt.R (100%) diff --git a/tests/analyze_assoc.R b/experimental/assoc/analyze_assoc.R similarity index 100% rename from tests/analyze_assoc.R rename to experimental/assoc/analyze_assoc.R diff --git a/tests/assoc_sim.R b/experimental/assoc/assoc_sim.R similarity index 100% rename from tests/assoc_sim.R rename to experimental/assoc/assoc_sim.R diff --git a/tests/assoc_sim_expt.R b/experimental/assoc/assoc_sim_expt.R similarity index 100% rename from tests/assoc_sim_expt.R rename to experimental/assoc/assoc_sim_expt.R From 3deceee1f84a26072cc32beab2278ad784dea135 Mon Sep 17 00:00:00 2001 From: Ananth Raghunathan Date: Thu, 23 Jul 2015 09:48:45 -0700 Subject: [PATCH 53/67] A few fixes from code review. - uncommented experimental code in decode2way and documented it - renamed function that processes assoc maps - deleted params.csv --- analysis/R/decode2way.R | 87 ++++++++++++++++++-------------------- analysis/R/read_input.R | 2 +- assoctest.sh | 2 +- tests/analyze_assoc_expt.R | 14 +++--- tests/params.csv | 2 - 5 files changed, 52 insertions(+), 55 deletions(-) delete mode 100644 tests/params.csv diff --git a/analysis/R/decode2way.R b/analysis/R/decode2way.R index 63bb8f69..600c124e 100644 --- a/analysis/R/decode2way.R +++ b/analysis/R/decode2way.R @@ -32,7 +32,7 @@ EstimateBloomCounts2Way <- function(params, obs_counts) { # q - P(IRR = 1 | PRR = 1) # f - Proportion of bits in the Bloom filter that are set randomly # to 0 or 1 regardless of the underlying true bit value - # obs_counts: a matrix of size m by (4k**2 + 1). Column one contains sample + # obs_counts: a matrix of size m by (4k^2 + 1). Column one contains sample # sizes for each cohort. Other counts indicated how many times # pairs of bits {11, 10, 01, 00} were set across the two # reports (in a "1st report"-major order) @@ -104,50 +104,45 @@ FitDistribution2Way <- function(estimates_stds, map, lsei(A = X, B = Y, G = G, H = H, type = 2)$X } -# FitDistribution2Way <- function(estimates_stds, map, fit) { -# # Find a distribution over rows of map that approximates estimates_stds best -# # -# # Input: -# # estimates_stds: a list of two m x k matrices, one for estimates, another -# # for standard errors -# # map : an (m * k) x S boolean matrix -# # -# # Output: -# # a float vector of length S, so that a distribution over map's rows sampled -# # according to this vector approximates estimates -# -# X <- as.matrix(map) -# Y <- as.vector(t(estimates_stds$estimates)) -# m <- dim(X)[1] -# n <- dim(X)[2] -# wt <- 10000 # weight to marginal constraints -# -# G <- rbind2(Diagonal(n), rep(-1, n)) -# H <- c(rep(0, n), -1) -# -# # Adding marginals constraints to X and Y -# fstrs <- lapply(fit, function(x) x[,"string"]) # found strings -# -# Y <- c(Y, wt * t(fit[[1]]["proportion"]), wt * t(fit[[2]]["proportion"])) -# -# for (strs in fstrs[[1]]) { -# indices <- which(colnames(map) %in% outer(strs, -# fstrs[[2]], -# function(x, y) paste(x, y, sep = "x"))) -# vec <- rep(0, n) -# vec[indices] <- wt -# X <- rbind2(X, vec) -# } -# for (strs in fstrs[[2]]) { -# indices <- which(colnames(map) %in% outer(fstrs[[1]], -# strs, -# function(x, y) paste(x, y, sep = "x"))) -# vec <- rep(0, n) -# vec[indices] <- wt -# X <- rbind2(X, vec) -# } -# -# lsei(A = X, B = Y, G = G, H = H, type = 2)$X +FitDistribution2WayAdditionalConstraints <- function(estimates_stds, map, fit) { + # Experimental code + # Computes the same output as FitDistribution by + # additionally throwing in constraints corresponding to + # 1-way marginals + # Requires non-NULL fit as input (with "proportion" containing marginal info) + + X <- as.matrix(map) + Y <- as.vector(t(estimates_stds$estimates)) + m <- dim(X)[1] + n <- dim(X)[2] + wt <- 10000 # weight to marginal constraints + + G <- rbind2(Diagonal(n), rep(-1, n)) + H <- c(rep(0, n), -1) + + # Adding marginals constraints to X and Y + fstrs <- lapply(fit, function(x) x[,"string"]) # found strings + + Y <- c(Y, wt * t(fit[[1]]["proportion"]), wt * t(fit[[2]]["proportion"])) + + for (strs in fstrs[[1]]) { + indices <- which(colnames(map) %in% outer(strs, + fstrs[[2]], + function(x, y) paste(x, y, sep = "x"))) + vec <- rep(0, n) + vec[indices] <- wt + X <- rbind2(X, vec) + } + for (strs in fstrs[[2]]) { + indices <- which(colnames(map) %in% outer(fstrs[[1]], + strs, + function(x, y) paste(x, y, sep = "x"))) + vec <- rep(0, n) + vec[indices] <- wt + X <- rbind2(X, vec) + } + + lsei(A = X, B = Y, G = G, H = H, type = 2)$X # Random projection params # size <- 10 * n @@ -163,7 +158,7 @@ FitDistribution2Way <- function(estimates_stds, map, # G <- rbind2(Diagonal(nproj), rep(-1, nproj)) # H <- c(rep(0, nproj), -1) # lsei(A = Xproj, B = Yproj, G = G, H = H, type = 2)$X -# } +} Decode2Way <- function(counts, map, params, fit = NULL) { k <- params$k diff --git a/analysis/R/read_input.R b/analysis/R/read_input.R index 35c2cead..051b35c4 100644 --- a/analysis/R/read_input.R +++ b/analysis/R/read_input.R @@ -110,7 +110,7 @@ ReadMapFile <- function(map_file, params = NULL, quote = "") { # object map$map # This is the expected object from ReadMapFile # params = data field with parameters -ProcessMap <- function(map, params) { +CorrectMapForAssoc <- function(map, params) { map$rmap <- map$map map$map <- lapply(1:params$m, function(i) map$rmap[seq(from = ((i - 1) * params$k + 1), diff --git a/assoctest.sh b/assoctest.sh index f9dc392b..74e800f5 100755 --- a/assoctest.sh +++ b/assoctest.sh @@ -184,7 +184,7 @@ _run-one-instance() { inp['counts'] = ['$instance_dir/case_2way.csv',\ '$instance_dir/case_marg1.csv',\ '$instance_dir/case_marg2.csv']; \ - inp['expt'] = ['external-counts', 'external-reports-em']; \ + inp['expt'] = ['external-counts']; \ json.dump(inp, f); \ f.close();" diff --git a/tests/analyze_assoc_expt.R b/tests/analyze_assoc_expt.R index 37e65426..86dad21e 100755 --- a/tests/analyze_assoc_expt.R +++ b/tests/analyze_assoc_expt.R @@ -351,10 +351,11 @@ ExternalCounts <- function(inp, verbose = FALSE, metrics_filename = "metrics.csv ptm <- proc.time() params <- ReadParameterFile(inp$params) # Ensure sufficient maps as required by number of vars + # Correct map from ReadMapFile() for assoc analysis stopifnot(inp$numvars == length(inp$maps)) map <- lapply(inp$maps, function(o) - ProcessMap(ReadMapFile(o, params = params), - params = params)) + CorrectMapForAssoc(ReadMapFile(o, params = params), + params = params)) # (2 way counts, marginal 1 counts, marginal 2 counts) counts <- lapply(1:3, function(i) ReadCountsFile(inp$counts[[i]])) @@ -433,14 +434,17 @@ ExternalCounts <- function(inp, verbose = FALSE, metrics_filename = "metrics.csv ## Outputs: ## ##################################################################### -ExternalReportsEM <- function(inp, verbose = FALSE, metrics_filename = "metrics.csv") { +ExternalReportsEM <- function(inp, + verbose = FALSE, + metrics_filename = "metrics.csv") { ptm <- proc.time() params <- ReadParameterFile(inp$params) # Ensure sufficient maps as required by number of vars stopifnot(inp$numvars == length(inp$maps)) + # Correct map from ReadMapFile() for assoc analysis map <- lapply(inp$maps, function(o) - ProcessMap(ReadMapFile(o, params = params), - params = params)) + CorrectMapForAssoc(ReadMapFile(o, params = params), + params = params)) # Reports must be of the format # client name, cohort no, rappor bitstring 1, rappor bitstring 2, ... diff --git a/tests/params.csv b/tests/params.csv deleted file mode 100644 index 0dd2c58c..00000000 --- a/tests/params.csv +++ /dev/null @@ -1,2 +0,0 @@ -k, h, m, p, q, f -16, 2, 64, 0.1, 0.9, 0.2 From c0ea8cf3aa40134030c05f55f5a27ca1f6a1b159 Mon Sep 17 00:00:00 2001 From: Ananth Raghunathan Date: Thu, 23 Jul 2015 10:36:01 -0700 Subject: [PATCH 54/67] Addressing more review comments. - inverted noise matrix outside loop - renamed gen_assoc_reports - added its test to test.sh - make-summary now shows original dimensions for variables --- analysis/R/decode2way.R | 4 +- assoctest.sh | 2 +- test.sh | 2 + tests/analyze_assoc_expt.R | 548 ------------------ tests/assoctest.html | 20 +- ...ssoc_reports.R => gen_true_values_assoc.R} | 9 +- ...ts_test.R => gen_true_values_assoc_test.R} | 11 +- tests/make_summary_assoc.py | 22 +- 8 files changed, 34 insertions(+), 584 deletions(-) delete mode 100755 tests/analyze_assoc_expt.R rename tests/{gen_assoc_reports.R => gen_true_values_assoc.R} (93%) rename tests/{gen_assoc_reports_test.R => gen_true_values_assoc_test.R} (61%) diff --git a/analysis/R/decode2way.R b/analysis/R/decode2way.R index 600c124e..ce52d341 100644 --- a/analysis/R/decode2way.R +++ b/analysis/R/decode2way.R @@ -61,6 +61,8 @@ EstimateBloomCounts2Way <- function(params, obs_counts) { NoiseMatrix[2,] <- c(p11*p01, p11*p00, p10*p01, p10*p00) NoiseMatrix[3,] <- c(p01*p11, p01*p10, p00*p11, p00*p01) NoiseMatrix[4,] <- c(p01**2, p00*p01, p01*p00, p00**2) + # Invert NoiseMatrix for estimator + InvNoiseMatrix <- t(solve(NoiseMatrix)) # Apply the inverse of NoiseMatrix to get an unbiased estimator for # the number of times input pairs of bits were seen. @@ -70,7 +72,7 @@ EstimateBloomCounts2Way <- function(params, obs_counts) { inds <- seq(0, (k/4)-1) v <- x[-1] sapply(inds, function(i){ - as.vector(t(solve(NoiseMatrix)) %*% v[(i*4 + 1):((i+1)*4)]) + as.vector(InvNoiseMatrix %*% v[(i*4 + 1):((i+1)*4)]) }) }) diff --git a/assoctest.sh b/assoctest.sh index 74e800f5..6fbec00a 100755 --- a/assoctest.sh +++ b/assoctest.sh @@ -189,7 +189,7 @@ _run-one-instance() { f.close();" time { - tests/analyze_assoc_expt.R --inp $instance_dir/analyze_inp.json + tests/compare_assoc.R --inp $instance_dir/analyze_inp.json } } diff --git a/test.sh b/test.sh index 37ef0f14..22df91f5 100755 --- a/test.sh +++ b/test.sh @@ -112,6 +112,8 @@ r-unit() { tests/gen_true_values_test.R + tests/gen_true_values_assoc_test.R + analysis/R/decode_test.R analysis/test/run_tests.R diff --git a/tests/analyze_assoc_expt.R b/tests/analyze_assoc_expt.R deleted file mode 100755 index 86dad21e..00000000 --- a/tests/analyze_assoc_expt.R +++ /dev/null @@ -1,548 +0,0 @@ -#!/usr/bin/env Rscript -# -# Copyright 2015 Google Inc. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Reads map files, report files, and RAPPOR parameters to run -# an EM algorithm to estimate joint distribution over two or more variables -# -# Usage: -# $ ./analyze_assoc_expt.R --inp -# -# Input file: -# Outputs: - -library("jsonlite") -library("optparse") - -options(stringsAsFactors = FALSE) - -if(!interactive()) { - option_list <- list( - make_option(c("--inp"), default = "analyze_inp.json", - help = "JSON file with inputs for analyze_assoc_expt")) - opts <- parse_args(OptionParser(option_list = option_list)) -} - -source("analysis/R/decode2way.R") -source("analysis/R/encode.R") -source("analysis/R/decode.R") -source("analysis/R/simulation.R") -source("analysis/R/read_input.R") -source("analysis/R/association.R") -source("tests/gen_counts.R") - -# Wrapper function to print strings only if verbose flag is passed in -PrintIfVerbose <- function(string, flag = FALSE) { - if(flag == TRUE) { - print(string) - } -} - -# TV distance = L1 distance / 2 = 1 - sum(min(df1|x, df2|x)) where -# df1|x / df2|x projects the distribution to the intersection x of the -# supports of df1 and df2 -TVDistance <- function(df1, df2, statement = "TV DISTANCE") { - rowsi <- intersect(rownames(df1), rownames(df2)) - colsi <- intersect(colnames(df1), colnames(df2)) - print(statement) - 1 - sum(mapply(min, - unlist(as.data.frame(df1[rowsi, colsi]), use.names = FALSE), - unlist(as.data.frame(df2[rowsi, colsi]), use.names = FALSE))) -} - -# Function to combine reports -# Currently assume 2-way marginals -CombineReports <- function(reports1, reports2) { - # Encoding (var1, var2) \in {(0, 0), (0, 1), (1, 0), (1, 1)} - two_bits <- list(c(0, 0, 0, 1), c(0, 1, 0, 0), c(0, 0, 1, 0), c(1, 0, 0, 0)) - OuterProd <- function(x, y) { - as.vector(outer(x, y, - function(z, t) z + 2 * t)) - } - # "report1-major" order - creports <- mapply(OuterProd, reports2, reports1, - SIMPLIFY = FALSE) - # Collapse counts to bit vector according to two_bits - lapply(creports, - function(x) as.vector(sapply(x, function(z) two_bits[[z+1]]))) -} - -# Given 2 lists of maps, maps1 and maps2, the function -# combines the maps by cohort and outputs both -# cohort-organized maps and flattened versions -CombineMaps <- function(maps1, maps2) { - # Combine maps - cmap <- mapply(CombineMapsInternal, maps1, maps2) - - # Flatten map - inds <- lapply(cmap, function(x) which(x, arr.ind = TRUE)) - for (i in seq(1, length(inds))) { - inds[[i]][, 1] <- inds[[i]][, 1] + (i-1) * dim(cmap[[1]])[1] - } - inds <- do.call("rbind", inds) - crmap <- sparseMatrix(inds[, 1], inds[, 2], dims = c( - nrow(cmap[[1]]) * length(cmap), - ncol(cmap[[1]]))) - colnames(crmap) <- colnames(cmap[[1]]) - list(cmap = cmap, crmap = crmap) -} - -# Function to combine maps -# Using map1-major order for both candidates and bits of the report -# to be consistent with how CombineReports works -# Currently assume 2-way marginals -CombineMapsInternal <- function(map1, map2) { - # Retrieve set indices and dimensions - rows1 <- which(map1, arr.ind = TRUE)[,1] - cols1 <- which(map1, arr.ind = TRUE)[,2] - length1 <- dim(map1)[[1]] - width1 <- dim(map1)[[2]] - rows2 <- which(map2, arr.ind = TRUE)[,1] - cols2 <- which(map2, arr.ind = TRUE)[,2] - length2 <- dim(map2)[[1]] - width2 <- dim(map2)[[2]] - - # Now process map1 - map1fn <- function(i, j) { - i1 <- seq(1, length2) + ((i-1) * length2) - j1 <- seq(1, width2) + ((j-1) * width2) - expand.grid(i1, j1) - } - map1indices <- do.call(rbind, - mapply(map1fn, rows1, cols1, SIMPLIFY = FALSE)) - map1_big <- sparseMatrix(map1indices[,"Var1"], - map1indices[,"Var2"], - dims = c(length1 * length2, - width1 * width2)) - colnames(map1_big) <- t(outer(colnames(map1), - colnames(map2), - function(x, y) paste(x, y, sep = "x"))) - - # Now process map2 - map2fn <- function(i, j) { - i2 <- i + (seq(0, length1 - 1) * length2) - j2 <- j + (seq(0, width1 - 1) * width2) - expand.grid(i2, j2) - } - map2indices <- do.call(rbind, - mapply(map2fn, rows2, cols2, SIMPLIFY = FALSE)) - map2_big <- sparseMatrix(map2indices[,"Var1"], - map2indices[,"Var2"], - dims = c(length1 * length2, - width1 * width2)) - colnames(map2_big) <- t(outer(colnames(map1), - colnames(map2), - function(x, y) paste(x, y, sep = "x"))) - - # Now collate two maps with entries in (1000, 0100, 0010, 0001) - # (m1&m2, !m1 & m2, m1 & !m2, !(m1 & m2)) respectively - findices <- which(map1_big & map2_big, arr.ind = TRUE) - # 1000 - findices[, 1] <- findices[, 1] * 4 - 3 - # 0100 - indices_0100 <- which((!map1_big) & map2_big, arr.ind = TRUE) - indices_0100[, 1] <- indices_0100[, 1] * 4 - 2 - findices <- rbind(findices, indices_0100) - # 0010 - indices_0010 <- which(map1_big & (!map2_big), arr.ind = TRUE) - indices_0010[, 1] <- indices_0010[, 1] * 4 - 1 - findices <- rbind(findices, indices_0010) - # 0001 - indices_0001 <- which((!map1_big) & (!map2_big), arr.ind = TRUE) - indices_0001[, 1] <- indices_0001[, 1] * 4 - findices <- rbind(findices, indices_0001) - sm <- sparseMatrix(findices[, 1], findices[, 2], - dims = c(4 * length1 * length2, - width1 * width2)) - colnames(sm) <- colnames(map1_big) - sm -} - -GenerateNoiseMatrix <- function(params) { - p <- params$p - q <- params$q - f <- params$f - m <- params$m - k <- params$k - - p11 <- q * (1 - f/2) + p * f / 2 # probability of a true 1 reported as 1 - p01 <- p * (1 - f/2) + q * f / 2 # probability of a true 0 reported as 1 - p10 <- 1 - p11 # probability of a true 1 reported as 0 - p00 <- 1 - p01 # probability of a true 0 reported as 0 - - NoiseMatrix <- matrix(rep(0, 16), 4) - NoiseMatrix[1,] <- c(p11**2, p11*p10, p10*p11, p10**2) - NoiseMatrix[2,] <- c(p11*p01, p11*p00, p10*p01, p10*p00) - NoiseMatrix[3,] <- c(p01*p11, p01*p10, p00*p11, p00*p01) - NoiseMatrix[4,] <- c(p01**2, p00*p01, p01*p00, p00**2) - - NoiseMatrix -} - -##################################################################### -## -## Direct simulation of reports WITHOUT simulated variance -## -## Inputs: inp object (from parsing JSON) with -## num - # of reports -## params - file containing RAPPOR params -## varcandidates - list containing # of candidates for each var -## numvars - # of vars (>=2 for association) -## extra - # of extra candidates for var 1 -## -## -## Outputs: Runs simulation of two-way association analysis by directly -## simulating the counts of one way and two way marginals -## -##################################################################### -DirectSimulationOfReports <- function(inp, verbose = FALSE) { - ptm <- proc.time() - params <- ReadParameterFile(inp$params) - strconstant <- c("string", "option") - N <- inp$num - n1 <- inp$varcandidates[[1]] - n2 <- inp$varcandidates[[2]] - - # Construct unique vals for each variable using strconstant - stopifnot(length(strconstant) == inp$numvars) - uvals <- lapply(1:inp$numvars, - function(i) { - apply(as.matrix(1:inp$varcandidates[[i]]), - 1, - function(z) sprintf("%s%d", strconstant[[i]], z)) - }) - - # Add extras if any - if(inp$extras > 0) { - uvals[[1]] <- c(uvals[[1]], apply(as.matrix(1:inp$extras), 1, - function(z) sprintf("%s%d", strconstant[[1]], z + n1))) - } - - # Compute map - map <- lapply(uvals, function(u) CreateMap(u, params)) - - # Trim maps to real # of candidates - # Use extras only for decoding - tmap <- lapply(map[[1]]$map, function(i) i[, 1:n1]) - crmap_trimmed <- CombineMaps(tmap, map[[2]]$map)$crmap - - # Sample values to compute partition - # Zipfian over n1 strings - v1_part <- RandomPartition(N, ComputePdf("zipf1.5", n1)) - # Zipfian over n2 strings for each of variable 1 - # Distr. are correlated as in assoc_sim.R - final_part <- as.vector(sapply(1:n1, - function(i) { - v2_part <- RandomPartition(v1_part[[i]], - ComputePdf("zipf1.5", n2)) - if (i %% 2 == 0) {v2_part} else {rev(v2_part)} - })) - - td <- matrix(final_part/sum(final_part), nrow = n1, ncol = n2, byrow = TRUE) - v2_part <- RandomPartition(N, apply(td, 2, sum)) - ow_parts <- list(v1_part, v2_part) - ow_parts[[1]] <- c(ow_parts[[1]], rep(0, inp$extra)) - - # -------------- - # Generate 1-way counts - ow_counts <- lapply(1:2, function(i) - GenerateCounts(params, map[[i]]$rmap, ow_parts[[i]], 1)) - found_strings <- lapply(1:2, function(i) - Decode(ow_counts[[i]], - map[[i]]$rmap, - params, quick = TRUE)$fit[,"string"]) - # -------------- - - rownames(td) <- uvals[[1]][1:n1] # Don't take into account extras - colnames(td) <- uvals[[2]] - PrintIfVerbose("TRUE DISTRIBUTION", verbose) - PrintIfVerbose(signif(td, 4), verbose) - cohorts <- as.matrix( - apply(as.data.frame(final_part), 1, - function(count) RandomPartition(count, rep(1, params$m)))) - expanded <- apply(cohorts, 2, function(vec) rep(vec, each = ((params$k)**2)*4)) - true_ones <- apply(expanded * crmap_trimmed, 1, sum) - - NoiseMatrix <- GenerateNoiseMatrix(params) - after_noise <- as.vector(sapply(1:(length(true_ones)/4), - function(x) - t(NoiseMatrix) %*% true_ones[((x-1)*4+1):(x*4)])) - counts <- cbind(apply(cohorts, 1, sum), - matrix(after_noise, - nrow = params$m, - ncol = 4 * (params$k**2), - byrow = TRUE)) - - params2 <- params - params2$k <- (params$k ** 2) * 4 - - # Combine maps to feed into Decode2Way - # Prune first to found_strings from Decode on 1-way counts - pruned <- lapply(1:2, function(i) - lapply(map[[i]]$map, function(z) z[,found_strings[[i]]])) - crmap <- CombineMaps(pruned[[1]], pruned[[2]])$crmap - marginal <- Decode2Way(counts, crmap, params2)$fit - - # Fill in estimated results with rows and cols from td - ed <- matrix(0, nrow = (n1+inp$extra), ncol = n2) - rownames(ed) <- uvals[[1]] - colnames(ed) <- uvals[[2]] - for (cols in colnames(td)) { - for (rows in rownames(td)) { - ed[rows, cols] <- marginal[paste(rows, cols, sep = "x"), "Estimate"] - } - } - ed[is.na(ed)] <- 0 - time_taken <- proc.time() - ptm - - PrintIfVerbose("2 WAY RESULTS", verbose) - PrintIfVerbose(signif(ed, 4), verbose) - PrintIfVerbose(TVDistance(td, ed, "TV DISTANCE 2 WAY ALGORITHM"), verbose) - PrintIfVerbose("PROC.TIME", verbose) - PrintIfVerbose(time_taken, verbose) - chisq_td <- chisq.test(td)[1][[1]][[1]] - chisq_ed <- chisq.test(ed)[1][[1]][[1]] - if(is.nan(chisq_ed)) { - chisq_ed <- 0 - } - if(is.nan(chisq_td)) { - chisq_td <- 0 - } - - metrics <- list( - td_chisq = chisq_td, - ed_chisq = chisq_ed, - tv = TVDistance(td, ed, ""), - time = time_taken[1], - dim1 = length(found_strings[[1]]), - dim2 = length(found_strings[[2]]) - ) - filename <- file.path(inp$outdir, 'metrics.csv') - write.csv(metrics, file = filename, row.names = FALSE) -} - -##################################################################### -## -## Externally provided counts (gen_assoc_counts.R and sum_assoc_reports.py) -## new_decode flag allows you to switch between two decode algorithm choices -## Note: Only for two way associations -## -## Inputs: inp object (from parsing JSON) with -## count files (2 way counts, individual marginal counts) -## map files (2 variables) -## params file with RAPPOR params -## -## Outputs: Runs simulation of two-way association analysis reading inputs -## from counts, maps, and params file. -##################################################################### -ExternalCounts <- function(inp, verbose = FALSE, metrics_filename = "metrics.csv") { - ptm <- proc.time() - params <- ReadParameterFile(inp$params) - # Ensure sufficient maps as required by number of vars - # Correct map from ReadMapFile() for assoc analysis - stopifnot(inp$numvars == length(inp$maps)) - map <- lapply(inp$maps, function(o) - CorrectMapForAssoc(ReadMapFile(o, params = params), - params = params)) - - # (2 way counts, marginal 1 counts, marginal 2 counts) - counts <- lapply(1:3, function(i) ReadCountsFile(inp$counts[[i]])) - - params2 <- params - params2$k <- (params$k ** 2) * 4 - - # Prune candidates - fit <- lapply(1:2, function(i) - Decode(counts[[i + 1]], - map[[i]]$rmap, - params, quick = FALSE)$fit) - - found_strings = list(fit[[1]][,"string"], fit[[2]][,"string"]) - - if (length(found_strings[[1]]) == 0 || length(found_strings[[2]]) == 0) { - PrintIfVerbose("FOUND_STRINGS", verbose) - PrintIfVerbose(found_strings, verbose) - stop("No strings found in 1-way marginal.") - } - - # Combine maps to feed into Decode2Way - # Prune first to found_strings from Decode on 1-way counts - pruned <- lapply(1:2, function(i) - lapply(map[[i]]$map, function(z) z[,found_strings[[i]], drop = FALSE])) - crmap <- CombineMaps(pruned[[1]], pruned[[2]])$crmap - marginal <- Decode2Way(counts[[1]], crmap, params2, fit = fit)$fit - td <- read.csv(file = inp$truefile, header = TRUE) - td <- table(td[,3:4]) - td <- td / sum(td) - ed <- td - for (cols in colnames(td)) { - for (rows in rownames(td)) { - ed[rows, cols] <- marginal[paste(rows, cols, sep = "x"), "Estimate"] - } - } - ed[is.na(ed)] <- 0 - ed[ed<0] <- 0 - - time_taken <- proc.time() - ptm - - PrintIfVerbose(TVDistance(td, ed, "TV DISTANCE 2 WAY"), verbose) - PrintIfVerbose("PROC.TIME", verbose) - PrintIfVerbose(time_taken, verbose) - chisq_td <- chisq.test(td)[1][[1]][[1]] - chisq_ed <- chisq.test(ed)[1][[1]][[1]] - if(is.nan(chisq_td)) { - chisq_td <- 0 - } - if(is.nan(chisq_ed)) { - chisq_ed <- 0 - } - - metrics <- list( - td_chisq = chisq_td, - ed_chisq = chisq_ed, - tv = TVDistance(td, ed, ""), - time = time_taken[1], - dim1 = length(found_strings[[1]]), - dim2 = length(found_strings[[2]]) - ) - - # Write metrics to metrics_filename (default: metrics.csv) - filename <- file.path(inp$outdir, metrics_filename) - write.csv(metrics, file = filename, row.names = FALSE) -} - -##################################################################### -## -## Externally provided reports -## EM ALGORITHM -## TODO: Also support 3 way association -## -## Inputs: -## -## Outputs: -## -##################################################################### -ExternalReportsEM <- function(inp, - verbose = FALSE, - metrics_filename = "metrics.csv") { - ptm <- proc.time() - params <- ReadParameterFile(inp$params) - # Ensure sufficient maps as required by number of vars - stopifnot(inp$numvars == length(inp$maps)) - # Correct map from ReadMapFile() for assoc analysis - map <- lapply(inp$maps, function(o) - CorrectMapForAssoc(ReadMapFile(o, params = params), - params = params)) - - # Reports must be of the format - # client name, cohort no, rappor bitstring 1, rappor bitstring 2, ... - reportsObj <- read.csv(inp$reports, - colClasses = c("character", "integer", - rep("character", inp$numvars)), - header = TRUE) - # Ignore the first column - reportsObj <- reportsObj[,-1] - - # Parsing reportsObj - # ComputeDistributionEM allows for different sets of cohorts - # for each variable. Here, both sets of cohorts are identical - co <- as.list(reportsObj[1])[[1]] - co <- co + 1 # 1 indexing - cohorts <- rep(list(co), inp$numvars) - # Parse reports from reportObj cols 2, 3, ... - reports <- lapply(1:inp$numvars, function(x) as.list(reportsObj[x + 1])) - - # Split strings into bit arrays (as required by assoc analysis) - reports <- lapply(1:inp$numvars, function(i) { - # apply the following function to each of reports[[1]] and reports[[2]] - lapply(reports[[i]][[1]], function(x) { - # function splits strings and converts them to numeric values - # rev needed for endianness - rev(as.numeric(strsplit(x, split = "")[[1]])) - }) - }) - - joint_dist <- ComputeDistributionEM(reports, cohorts, map, - ignore_other = TRUE, - quick = TRUE, - params, marginals = NULL, - estimate_var = FALSE) - em <- joint_dist$fit - td <- read.csv(file = inp$truefile, header = FALSE) - td <- table(td[,3:4]) - td <- td / sum(td) - time_taken <- proc.time() - ptm - - PrintIfVerbose(TVDistance(td, em, "TV DISTANCE EM"), verbose) - PrintIfVerbose("PROC.TIME", verbose) - PrintIfVerbose(time_taken, verbose) - chisq_td <- chisq.test(td)[1][[1]][[1]] - chisq_ed <- chisq.test(em)[1][[1]][[1]] - if(is.nan(chisq_td)) { - chisq_td <- 0 - } - if(is.nan(chisq_ed)) { - chisq_ed <- 0 - } - - metrics <- list( - td_chisq = chisq_td, - ed_chisq = chisq_ed, - tv = TVDistance(td, em, ""), - time = time_taken[1], - dim1 = dim(em)[[1]], - dim2 = dim(em)[[2]] - ) - - # Write metrics to metrics_filename (default: metrics.csv) - filename <- file.path(inp$outdir, metrics_filename) - write.csv(metrics, file = filename, row.names = FALSE) -} - -main <- function(opts) { - inp <- fromJSON(opts$inp) - verbose_flag <- inp$verbose - # Choose from a set of experiments to run - # direct -> direct simulation of reports (without variances) - # external-counts -> externally supplied counts for 2 way and marginals - # external-reports -> externally supplied reports - - if("direct" %in% inp$expt) { - PrintIfVerbose("Running Experiment Direct", verbose_flag) - DirectSimulationOfReports(inp, verbose = verbose_flag) - } - if ("external-counts" %in% inp$expt) { - PrintIfVerbose("Running Experiment Ext Counts", verbose_flag) - if ("direct" %in% inp$expt) { - # external-counts expt is run to compare results - ExternalCounts(inp, verbose = verbose_flag, metrics_filename = "metrics_2.csv") - } else { - ExternalCounts(inp, verbose = verbose_flag) - } - } - if ("external-reports-em" %in% inp$expt) { - PrintIfVerbose("Running Experiment Ext Reports", verbose_flag) - if (("direct" %in% inp$expt)||("external-counts" %in% inp$expt)) { - # external-reports-em expt is run to compare results - ExternalReportsEM(inp, verbose = verbose_flag, metrics_filename = "metrics_2.csv") - } else { - ExternalReportsEM(inp, verbose = verbose_flag) - } - } -} - -if(!interactive()) { - main(opts) -} diff --git a/tests/assoctest.html b/tests/assoctest.html index 0c839c86..e4b23875 100644 --- a/tests/assoctest.html +++ b/tests/assoctest.html @@ -25,9 +25,9 @@

RAPPOR assoctest.sh

Test Case + Input Params @@ -48,8 +48,9 @@

RAPPOR assoctest.sh

+ n: num reports
+ e: num extras
k: report bits
@@ -71,6 +72,7 @@

RAPPOR assoctest.sh

ne k h %s
%s%s
%s
%s
%s
%s
- + - + @@ -35,20 +35,22 @@

RAPPOR assoctest.sh

- - - @@ -58,11 +60,9 @@

RAPPOR assoctest.sh

m: cohorts
p, q, f: probabilities
- @@ -71,6 +71,8 @@

RAPPOR assoctest.sh

+ + @@ -83,8 +85,6 @@

RAPPOR assoctest.sh

- - diff --git a/tests/gen_assoc_reports.R b/tests/gen_true_values_assoc.R similarity index 93% rename from tests/gen_assoc_reports.R rename to tests/gen_true_values_assoc.R index e6adb7e6..779fe398 100755 --- a/tests/gen_assoc_reports.R +++ b/tests/gen_true_values_assoc.R @@ -14,14 +14,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -# TODO: Rename reports to values (more in line with its usage for histogram -# RAPPOR) - source('tests/gen_counts.R') # Usage: # -# $ ./gen_assoc_reports.R 100 20 10000 foo.csv +# $ ./gen_true_values_assoc.R 100 20 10000 foo.csv # # Inputs: # size of the distribution's support for var 1 @@ -31,7 +28,7 @@ source('tests/gen_counts.R') # Output: # csv file with reports sampled according to the specified distribution. -GenerateAssocReports <- function(n, N, num_cohorts) { +GenerateTrueValuesAssoc <- function(n, N, num_cohorts) { # Inputs: n, a list of supports for vars 1, 2 # N, the number of reports/clients # num_cohorts, the number of cohorts @@ -83,7 +80,7 @@ main <- function(argv) { num_cohorts <- as.integer(argv[[4]]) out_file <- argv[[5]] - res <- GenerateAssocReports(n, N, num_cohorts) + res <- GenerateTrueValuesAssoc(n, N, num_cohorts) # Prepend with str and opt reports <- list(sprintf("str%d", res$values[[1]]), sprintf("opt%d", res$values[[2]])) diff --git a/tests/gen_assoc_reports_test.R b/tests/gen_true_values_assoc_test.R similarity index 61% rename from tests/gen_assoc_reports_test.R rename to tests/gen_true_values_assoc_test.R index 10f88c51..ebef1e77 100755 --- a/tests/gen_assoc_reports_test.R +++ b/tests/gen_true_values_assoc_test.R @@ -4,15 +4,15 @@ source('analysis/R/util.R') # Log() -source('tests/gen_assoc_reports.R') # module under test +source('tests/gen_true_values_assoc.R') # module under test library(RUnit) -TestGenerateAssocReports <- function() { +TestGenerateTrueValuesAssoc <- function() { # list for support of var1, var2, # total number of reports # num_cohorts - res <- GenerateAssocReports(list(20, 5), 1000, 32) + res <- GenerateTrueValuesAssoc(list(20, 5), 1000, 32) # print(res$values) # 1000 reports @@ -25,10 +25,13 @@ TestGenerateAssocReports <- function() { # Ensure cohorts are filled up checkEquals(32, length(unique(res$cohort))) + + # TODO: Add tests to confirm (w.h.p.?) that certain distribution aspects are + # as expected (such as the zipfian on marginals) } TestAll <- function(){ - TestGenerateAssocReports() + TestGenerateTrueValuesAssoc() } TestAll() diff --git a/tests/make_summary_assoc.py b/tests/make_summary_assoc.py index ad21ea44..f1b0d1ca 100755 --- a/tests/make_summary_assoc.py +++ b/tests/make_summary_assoc.py @@ -19,6 +19,8 @@ + + @@ -31,8 +33,6 @@ - - @@ -125,18 +125,14 @@ def MeanOfMeans(dict_of_lists): def ParseSpecFile(spec_filename, empty = False): - """Parses the spec (parameters) file. + #Parses the spec (parameters) file. - Returns: - An integer and a string. The integer is the number of bogus candidates - and the string is parameters in the HTML format. - """ with open(spec_filename) as s: spec_row = s.readline().split() - spec_in_html = ' '.join('' % cell for cell in spec_row[3:]) + spec_in_html = ' '.join('' % cell for cell in spec_row[1:]) if empty == True: - spec_in_html = ' '.join('' for cell in spec_row[3:]) + spec_in_html = ' '.join('' for cell in spec_row[1:]) return spec_in_html @@ -185,8 +181,6 @@ def ParseMetrics(metrics_file, log_file, italics = False): metrics_row_str = [ '%s' % d1, '%s' % d2, - '%.3f' % td_chisq, - '%.3f' % ed_chisq, '%.3f' % l1d, str(rtime), ] @@ -249,7 +243,7 @@ def FormatSummaryRow(metrics_lists): summary = { 'name': 'Means', 'mean_l1d': FormatMeanWithSem(means_with_sem['l1d'], percent=False), - 'mean_chisqdiff': FormatMeanWithSem(means_with_sem['chisqdiff'], percent=False), + # 'mean_chisqdiff': FormatMeanWithSem(means_with_sem['chisqdiff'], percent=False), 'mean_rtime': FormatMeanWithSem(means_with_sem['rtime']), } return SUMMARY_ROW % summary @@ -345,7 +339,7 @@ def main(argv): # Print summary of test instances if(int(test_instance) == num_instances): - row_str = ['', '', '', '', + row_str = ['', '', '%.3f±%.3f' % (Mean(l1d_list), StandardErrorEstimate(l1d_list)), '', ] @@ -353,7 +347,7 @@ def main(argv): True), ' '.join('' % cell for cell in row_str)) if (os.path.isfile(metrics_file)): - row_str2 = ['', '', '', '', + row_str2 = ['', '', '%.3f±%.3f' % (Mean(l1d_list2), StandardErrorEstimate(l1d_list2)), '', ] From 870ee045e5002b40b98a2dbd3a53f239b9c67bb5 Mon Sep 17 00:00:00 2001 From: Ananth Raghunathan Date: Thu, 23 Jul 2015 12:41:48 -0700 Subject: [PATCH 55/67] Adding sum_bits_assoc_test and fixing small error in assoctest.sh --- analysis/tools/sum_bits_assoc.py | 16 ++-- analysis/tools/sum_bits_assoc_test.py | 116 ++++++++++++++++++++++++++ assoctest.sh | 2 +- 3 files changed, 127 insertions(+), 7 deletions(-) create mode 100755 analysis/tools/sum_bits_assoc_test.py diff --git a/analysis/tools/sum_bits_assoc.py b/analysis/tools/sum_bits_assoc.py index a858d78f..2263b671 100755 --- a/analysis/tools/sum_bits_assoc.py +++ b/analysis/tools/sum_bits_assoc.py @@ -31,9 +31,12 @@ def SumBits(params, stdin, f_2way, f_1, f_2): csv_in = csv.reader(stdin) - csv_out_two_way = csv.writer(open(f_2way, "w")) - csv_out_1 = csv.writer(open(f_1, "w")) - csv_out_2 = csv.writer(open(f_2, "w")) + csv_out_two_way = csv.writer(f_2way) + csv_out_1 = csv.writer(f_1) + csv_out_2 = csv.writer(f_2) +# csv_out_two_way = csv.writer(open(f_2way, "w")) +# csv_out_1 = csv.writer(open(f_1, "w")) +# csv_out_2 = csv.writer(open(f_2, "w")) num_cohorts = params.num_cohorts num_bloombits = params.num_bloombits @@ -59,7 +62,6 @@ def SumBits(params, stdin, f_2way, f_1, f_2): raise RuntimeError('Error indexing cohort number %d (num_cohorts is %d) \ ' % (cohort, num_cohorts)) - # TODO: Extend checking for both reports if not len(irr_1) == params.num_bloombits: raise RuntimeError( "Expected %d bits in report 1, got %r" % @@ -118,8 +120,10 @@ def main(argv): except rappor.Error as e: raise RuntimeError(e) - SumBits(params, sys.stdin, prefix + "_2way.csv", - prefix + "_marg1.csv", prefix + "_marg2.csv") + with open(prefix + "_2way.csv", "w") as f_2way: + with open(prefix + "_marg1.csv", "w") as f_1: + with open(prefix + "_marg2.csv", "w") as f_2: + SumBits(params, sys.stdin, f_2way, f_1, f_2) if __name__ == '__main__': diff --git a/analysis/tools/sum_bits_assoc_test.py b/analysis/tools/sum_bits_assoc_test.py new file mode 100755 index 00000000..fe37f1ce --- /dev/null +++ b/analysis/tools/sum_bits_assoc_test.py @@ -0,0 +1,116 @@ +#!/usr/bin/python -S +# +# Copyright 2015 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +sum_bits_assoc_test.py: Tests for sum_bits_assoc.py +""" + +import cStringIO +import unittest + +import rappor +import sum_bits_assoc # module under test + + +# The header doesn't matter +CSV_IN = """\ +user_id,cohort,irr1,irr2 +5,1,0011,1010 +5,1,0011,1010 +5,1,0000,0000 +""" + +# ############################### +# EXPECTED_F_2WAY +# +# NOTE: bit order is reversed. +# First row is 65 zeroes +EXPECTED_F_2WAY = """\ +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,\ +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0\r +""" + +# Cohort 1 +# Total # of reports +EXPECTED_F_2WAY = EXPECTED_F_2WAY + "3," + +# Looking at LSBs of both irrs +# Total # of (11, 01, 10, 00) that appear +EXPECTED_F_2WAY = EXPECTED_F_2WAY + "0,0,2,1," + +# Report 1-major order. So looking at LSB of irr1 and 2nd LSB of irr2 +EXPECTED_F_2WAY = EXPECTED_F_2WAY + "2,0,0,1," + +# And so on ... +EXPECTED_F_2WAY = EXPECTED_F_2WAY + "0,0,2,1," +EXPECTED_F_2WAY = EXPECTED_F_2WAY + "2,0,0,1," + +# Now moving on to 2nd LSB of irr1 +EXPECTED_F_2WAY = EXPECTED_F_2WAY + ("0,0,2,1,2,0,0,1," * 2) + +# Now moving on to 3rd LSB of irr1 +# Note that for 3rd LSB of irr1 and LSB of irr2, there are three 00s +EXPECTED_F_2WAY = EXPECTED_F_2WAY + ("0,0,0,3,0,2,0,1," * 2) +# MSB of irr1 +EXPECTED_F_2WAY = EXPECTED_F_2WAY + "0,0,0,3,0,2,0,1," + "0,0,0,3,0,2,0,1" + +# end of EXPECTED_F_2WAY +# ############################### + +# NOTE: bit order is reversed. +EXPECTED_F_1 = """\ +0,0,0,0,0\r +3,2,2,0,0\r +""" + +# NOTE: bit order is reversed. +EXPECTED_F_2 = """\ +0,0,0,0,0\r +3,0,2,0,2\r +""" + +class SumBitsAssocTest(unittest.TestCase): + + def setUp(self): + self.params = rappor.Params() + self.params.num_bloombits = 4 + self.params.num_cohorts = 2 + self.maxDiff = None + + def testSum(self): + stdin = cStringIO.StringIO(CSV_IN) + f_2way = cStringIO.StringIO() + f_1 = cStringIO.StringIO() + f_2 = cStringIO.StringIO() + + sum_bits_assoc.SumBits(self.params, stdin, f_2way, f_1, f_2) + print f_2way.getvalue() + print EXPECTED_F_2WAY + + self.assertMultiLineEqual(EXPECTED_F_1, f_1.getvalue()) + self.assertMultiLineEqual(EXPECTED_F_2, f_2.getvalue()) + self.assertMultiLineEqual(EXPECTED_F_2WAY, f_2way.getvalue()) + +# def testErrors(self): +# stdin = cStringIO.StringIO(TOO_MANY_COLUMNS) +# stdout = cStringIO.StringIO() +# +# self.assertRaises( +# RuntimeError, sum_bits.SumBits, self.params, stdin, stdout) + + +if __name__ == '__main__': + unittest.main() diff --git a/assoctest.sh b/assoctest.sh index 6fbec00a..17d02df6 100755 --- a/assoctest.sh +++ b/assoctest.sh @@ -114,7 +114,7 @@ _run-one-instance() { banner "Generating input" - tests/gen_assoc_reports.R $num_unique_values $num_unique_values2 \ + tests/gen_true_values_assoc.R $num_unique_values $num_unique_values2 \ $num_clients $num_cohorts $instance_dir/case.csv banner "Running RAPPOR client" From 964f8a9de7edf48e08895718bc8ff869dac7ba3f Mon Sep 17 00:00:00 2001 From: Ananth Raghunathan Date: Thu, 23 Jul 2015 12:46:28 -0700 Subject: [PATCH 56/67] Adding sum_bits_assoc_test.py --- analysis/tools/sum_bits_assoc_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/analysis/tools/sum_bits_assoc_test.py b/analysis/tools/sum_bits_assoc_test.py index fe37f1ce..3a5fb208 100755 --- a/analysis/tools/sum_bits_assoc_test.py +++ b/analysis/tools/sum_bits_assoc_test.py @@ -65,7 +65,7 @@ # Note that for 3rd LSB of irr1 and LSB of irr2, there are three 00s EXPECTED_F_2WAY = EXPECTED_F_2WAY + ("0,0,0,3,0,2,0,1," * 2) # MSB of irr1 -EXPECTED_F_2WAY = EXPECTED_F_2WAY + "0,0,0,3,0,2,0,1," + "0,0,0,3,0,2,0,1" +EXPECTED_F_2WAY = EXPECTED_F_2WAY + "0,0,0,3,0,2,0,1," + "0,0,0,3,0,2,0,1\r\n" # end of EXPECTED_F_2WAY # ############################### From a4accc9042a88f05ed6968ce0902d7fdddb338ee Mon Sep 17 00:00:00 2001 From: Ananth Raghunathan Date: Thu, 23 Jul 2015 13:23:51 -0700 Subject: [PATCH 57/67] Added a couple more tests to sum_bits_assoc_test --- analysis/tools/sum_bits_assoc_test.py | 33 +++++++++++++++++++-------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/analysis/tools/sum_bits_assoc_test.py b/analysis/tools/sum_bits_assoc_test.py index 3a5fb208..11ac2ff1 100755 --- a/analysis/tools/sum_bits_assoc_test.py +++ b/analysis/tools/sum_bits_assoc_test.py @@ -82,6 +82,16 @@ 3,0,2,0,2\r """ +WRONG_IRR_BITS = """\ +user_id,cohort,irr1,irr2 +cli1,1,00123,11223 +""" + +WRONG_COHORT = """\ +user_id,cohort,irr1,irr2 +cli1,3,0011,0001 +""" + class SumBitsAssocTest(unittest.TestCase): def setUp(self): @@ -97,19 +107,24 @@ def testSum(self): f_2 = cStringIO.StringIO() sum_bits_assoc.SumBits(self.params, stdin, f_2way, f_1, f_2) - print f_2way.getvalue() - print EXPECTED_F_2WAY - self.assertMultiLineEqual(EXPECTED_F_1, f_1.getvalue()) self.assertMultiLineEqual(EXPECTED_F_2, f_2.getvalue()) self.assertMultiLineEqual(EXPECTED_F_2WAY, f_2way.getvalue()) -# def testErrors(self): -# stdin = cStringIO.StringIO(TOO_MANY_COLUMNS) -# stdout = cStringIO.StringIO() -# -# self.assertRaises( -# RuntimeError, sum_bits.SumBits, self.params, stdin, stdout) + def testErrors(self): + f_2way = cStringIO.StringIO() + f_1 = cStringIO.StringIO() + f_2 = cStringIO.StringIO() + + stdin = cStringIO.StringIO(WRONG_IRR_BITS) + self.assertRaises( + RuntimeError, sum_bits_assoc.SumBits, self.params, stdin, + f_2way, f_1, f_2) + + stdin = cStringIO.StringIO(WRONG_COHORT) + self.assertRaises( + RuntimeError, sum_bits_assoc.SumBits, self.params, stdin, + f_2way, f_1, f_2) if __name__ == '__main__': From e66ffd13f90bbed265b4c38e157dfb79899b317f Mon Sep 17 00:00:00 2001 From: Ananth Raghunathan Date: Thu, 23 Jul 2015 16:47:08 -0700 Subject: [PATCH 58/67] Adding compare_assoc.R instead of analyze_assoc_expt.R --- tests/compare_assoc.R | 548 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 548 insertions(+) create mode 100755 tests/compare_assoc.R diff --git a/tests/compare_assoc.R b/tests/compare_assoc.R new file mode 100755 index 00000000..86dad21e --- /dev/null +++ b/tests/compare_assoc.R @@ -0,0 +1,548 @@ +#!/usr/bin/env Rscript +# +# Copyright 2015 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Reads map files, report files, and RAPPOR parameters to run +# an EM algorithm to estimate joint distribution over two or more variables +# +# Usage: +# $ ./analyze_assoc_expt.R --inp +# +# Input file: +# Outputs: + +library("jsonlite") +library("optparse") + +options(stringsAsFactors = FALSE) + +if(!interactive()) { + option_list <- list( + make_option(c("--inp"), default = "analyze_inp.json", + help = "JSON file with inputs for analyze_assoc_expt")) + opts <- parse_args(OptionParser(option_list = option_list)) +} + +source("analysis/R/decode2way.R") +source("analysis/R/encode.R") +source("analysis/R/decode.R") +source("analysis/R/simulation.R") +source("analysis/R/read_input.R") +source("analysis/R/association.R") +source("tests/gen_counts.R") + +# Wrapper function to print strings only if verbose flag is passed in +PrintIfVerbose <- function(string, flag = FALSE) { + if(flag == TRUE) { + print(string) + } +} + +# TV distance = L1 distance / 2 = 1 - sum(min(df1|x, df2|x)) where +# df1|x / df2|x projects the distribution to the intersection x of the +# supports of df1 and df2 +TVDistance <- function(df1, df2, statement = "TV DISTANCE") { + rowsi <- intersect(rownames(df1), rownames(df2)) + colsi <- intersect(colnames(df1), colnames(df2)) + print(statement) + 1 - sum(mapply(min, + unlist(as.data.frame(df1[rowsi, colsi]), use.names = FALSE), + unlist(as.data.frame(df2[rowsi, colsi]), use.names = FALSE))) +} + +# Function to combine reports +# Currently assume 2-way marginals +CombineReports <- function(reports1, reports2) { + # Encoding (var1, var2) \in {(0, 0), (0, 1), (1, 0), (1, 1)} + two_bits <- list(c(0, 0, 0, 1), c(0, 1, 0, 0), c(0, 0, 1, 0), c(1, 0, 0, 0)) + OuterProd <- function(x, y) { + as.vector(outer(x, y, + function(z, t) z + 2 * t)) + } + # "report1-major" order + creports <- mapply(OuterProd, reports2, reports1, + SIMPLIFY = FALSE) + # Collapse counts to bit vector according to two_bits + lapply(creports, + function(x) as.vector(sapply(x, function(z) two_bits[[z+1]]))) +} + +# Given 2 lists of maps, maps1 and maps2, the function +# combines the maps by cohort and outputs both +# cohort-organized maps and flattened versions +CombineMaps <- function(maps1, maps2) { + # Combine maps + cmap <- mapply(CombineMapsInternal, maps1, maps2) + + # Flatten map + inds <- lapply(cmap, function(x) which(x, arr.ind = TRUE)) + for (i in seq(1, length(inds))) { + inds[[i]][, 1] <- inds[[i]][, 1] + (i-1) * dim(cmap[[1]])[1] + } + inds <- do.call("rbind", inds) + crmap <- sparseMatrix(inds[, 1], inds[, 2], dims = c( + nrow(cmap[[1]]) * length(cmap), + ncol(cmap[[1]]))) + colnames(crmap) <- colnames(cmap[[1]]) + list(cmap = cmap, crmap = crmap) +} + +# Function to combine maps +# Using map1-major order for both candidates and bits of the report +# to be consistent with how CombineReports works +# Currently assume 2-way marginals +CombineMapsInternal <- function(map1, map2) { + # Retrieve set indices and dimensions + rows1 <- which(map1, arr.ind = TRUE)[,1] + cols1 <- which(map1, arr.ind = TRUE)[,2] + length1 <- dim(map1)[[1]] + width1 <- dim(map1)[[2]] + rows2 <- which(map2, arr.ind = TRUE)[,1] + cols2 <- which(map2, arr.ind = TRUE)[,2] + length2 <- dim(map2)[[1]] + width2 <- dim(map2)[[2]] + + # Now process map1 + map1fn <- function(i, j) { + i1 <- seq(1, length2) + ((i-1) * length2) + j1 <- seq(1, width2) + ((j-1) * width2) + expand.grid(i1, j1) + } + map1indices <- do.call(rbind, + mapply(map1fn, rows1, cols1, SIMPLIFY = FALSE)) + map1_big <- sparseMatrix(map1indices[,"Var1"], + map1indices[,"Var2"], + dims = c(length1 * length2, + width1 * width2)) + colnames(map1_big) <- t(outer(colnames(map1), + colnames(map2), + function(x, y) paste(x, y, sep = "x"))) + + # Now process map2 + map2fn <- function(i, j) { + i2 <- i + (seq(0, length1 - 1) * length2) + j2 <- j + (seq(0, width1 - 1) * width2) + expand.grid(i2, j2) + } + map2indices <- do.call(rbind, + mapply(map2fn, rows2, cols2, SIMPLIFY = FALSE)) + map2_big <- sparseMatrix(map2indices[,"Var1"], + map2indices[,"Var2"], + dims = c(length1 * length2, + width1 * width2)) + colnames(map2_big) <- t(outer(colnames(map1), + colnames(map2), + function(x, y) paste(x, y, sep = "x"))) + + # Now collate two maps with entries in (1000, 0100, 0010, 0001) + # (m1&m2, !m1 & m2, m1 & !m2, !(m1 & m2)) respectively + findices <- which(map1_big & map2_big, arr.ind = TRUE) + # 1000 + findices[, 1] <- findices[, 1] * 4 - 3 + # 0100 + indices_0100 <- which((!map1_big) & map2_big, arr.ind = TRUE) + indices_0100[, 1] <- indices_0100[, 1] * 4 - 2 + findices <- rbind(findices, indices_0100) + # 0010 + indices_0010 <- which(map1_big & (!map2_big), arr.ind = TRUE) + indices_0010[, 1] <- indices_0010[, 1] * 4 - 1 + findices <- rbind(findices, indices_0010) + # 0001 + indices_0001 <- which((!map1_big) & (!map2_big), arr.ind = TRUE) + indices_0001[, 1] <- indices_0001[, 1] * 4 + findices <- rbind(findices, indices_0001) + sm <- sparseMatrix(findices[, 1], findices[, 2], + dims = c(4 * length1 * length2, + width1 * width2)) + colnames(sm) <- colnames(map1_big) + sm +} + +GenerateNoiseMatrix <- function(params) { + p <- params$p + q <- params$q + f <- params$f + m <- params$m + k <- params$k + + p11 <- q * (1 - f/2) + p * f / 2 # probability of a true 1 reported as 1 + p01 <- p * (1 - f/2) + q * f / 2 # probability of a true 0 reported as 1 + p10 <- 1 - p11 # probability of a true 1 reported as 0 + p00 <- 1 - p01 # probability of a true 0 reported as 0 + + NoiseMatrix <- matrix(rep(0, 16), 4) + NoiseMatrix[1,] <- c(p11**2, p11*p10, p10*p11, p10**2) + NoiseMatrix[2,] <- c(p11*p01, p11*p00, p10*p01, p10*p00) + NoiseMatrix[3,] <- c(p01*p11, p01*p10, p00*p11, p00*p01) + NoiseMatrix[4,] <- c(p01**2, p00*p01, p01*p00, p00**2) + + NoiseMatrix +} + +##################################################################### +## +## Direct simulation of reports WITHOUT simulated variance +## +## Inputs: inp object (from parsing JSON) with +## num - # of reports +## params - file containing RAPPOR params +## varcandidates - list containing # of candidates for each var +## numvars - # of vars (>=2 for association) +## extra - # of extra candidates for var 1 +## +## +## Outputs: Runs simulation of two-way association analysis by directly +## simulating the counts of one way and two way marginals +## +##################################################################### +DirectSimulationOfReports <- function(inp, verbose = FALSE) { + ptm <- proc.time() + params <- ReadParameterFile(inp$params) + strconstant <- c("string", "option") + N <- inp$num + n1 <- inp$varcandidates[[1]] + n2 <- inp$varcandidates[[2]] + + # Construct unique vals for each variable using strconstant + stopifnot(length(strconstant) == inp$numvars) + uvals <- lapply(1:inp$numvars, + function(i) { + apply(as.matrix(1:inp$varcandidates[[i]]), + 1, + function(z) sprintf("%s%d", strconstant[[i]], z)) + }) + + # Add extras if any + if(inp$extras > 0) { + uvals[[1]] <- c(uvals[[1]], apply(as.matrix(1:inp$extras), 1, + function(z) sprintf("%s%d", strconstant[[1]], z + n1))) + } + + # Compute map + map <- lapply(uvals, function(u) CreateMap(u, params)) + + # Trim maps to real # of candidates + # Use extras only for decoding + tmap <- lapply(map[[1]]$map, function(i) i[, 1:n1]) + crmap_trimmed <- CombineMaps(tmap, map[[2]]$map)$crmap + + # Sample values to compute partition + # Zipfian over n1 strings + v1_part <- RandomPartition(N, ComputePdf("zipf1.5", n1)) + # Zipfian over n2 strings for each of variable 1 + # Distr. are correlated as in assoc_sim.R + final_part <- as.vector(sapply(1:n1, + function(i) { + v2_part <- RandomPartition(v1_part[[i]], + ComputePdf("zipf1.5", n2)) + if (i %% 2 == 0) {v2_part} else {rev(v2_part)} + })) + + td <- matrix(final_part/sum(final_part), nrow = n1, ncol = n2, byrow = TRUE) + v2_part <- RandomPartition(N, apply(td, 2, sum)) + ow_parts <- list(v1_part, v2_part) + ow_parts[[1]] <- c(ow_parts[[1]], rep(0, inp$extra)) + + # -------------- + # Generate 1-way counts + ow_counts <- lapply(1:2, function(i) + GenerateCounts(params, map[[i]]$rmap, ow_parts[[i]], 1)) + found_strings <- lapply(1:2, function(i) + Decode(ow_counts[[i]], + map[[i]]$rmap, + params, quick = TRUE)$fit[,"string"]) + # -------------- + + rownames(td) <- uvals[[1]][1:n1] # Don't take into account extras + colnames(td) <- uvals[[2]] + PrintIfVerbose("TRUE DISTRIBUTION", verbose) + PrintIfVerbose(signif(td, 4), verbose) + cohorts <- as.matrix( + apply(as.data.frame(final_part), 1, + function(count) RandomPartition(count, rep(1, params$m)))) + expanded <- apply(cohorts, 2, function(vec) rep(vec, each = ((params$k)**2)*4)) + true_ones <- apply(expanded * crmap_trimmed, 1, sum) + + NoiseMatrix <- GenerateNoiseMatrix(params) + after_noise <- as.vector(sapply(1:(length(true_ones)/4), + function(x) + t(NoiseMatrix) %*% true_ones[((x-1)*4+1):(x*4)])) + counts <- cbind(apply(cohorts, 1, sum), + matrix(after_noise, + nrow = params$m, + ncol = 4 * (params$k**2), + byrow = TRUE)) + + params2 <- params + params2$k <- (params$k ** 2) * 4 + + # Combine maps to feed into Decode2Way + # Prune first to found_strings from Decode on 1-way counts + pruned <- lapply(1:2, function(i) + lapply(map[[i]]$map, function(z) z[,found_strings[[i]]])) + crmap <- CombineMaps(pruned[[1]], pruned[[2]])$crmap + marginal <- Decode2Way(counts, crmap, params2)$fit + + # Fill in estimated results with rows and cols from td + ed <- matrix(0, nrow = (n1+inp$extra), ncol = n2) + rownames(ed) <- uvals[[1]] + colnames(ed) <- uvals[[2]] + for (cols in colnames(td)) { + for (rows in rownames(td)) { + ed[rows, cols] <- marginal[paste(rows, cols, sep = "x"), "Estimate"] + } + } + ed[is.na(ed)] <- 0 + time_taken <- proc.time() - ptm + + PrintIfVerbose("2 WAY RESULTS", verbose) + PrintIfVerbose(signif(ed, 4), verbose) + PrintIfVerbose(TVDistance(td, ed, "TV DISTANCE 2 WAY ALGORITHM"), verbose) + PrintIfVerbose("PROC.TIME", verbose) + PrintIfVerbose(time_taken, verbose) + chisq_td <- chisq.test(td)[1][[1]][[1]] + chisq_ed <- chisq.test(ed)[1][[1]][[1]] + if(is.nan(chisq_ed)) { + chisq_ed <- 0 + } + if(is.nan(chisq_td)) { + chisq_td <- 0 + } + + metrics <- list( + td_chisq = chisq_td, + ed_chisq = chisq_ed, + tv = TVDistance(td, ed, ""), + time = time_taken[1], + dim1 = length(found_strings[[1]]), + dim2 = length(found_strings[[2]]) + ) + filename <- file.path(inp$outdir, 'metrics.csv') + write.csv(metrics, file = filename, row.names = FALSE) +} + +##################################################################### +## +## Externally provided counts (gen_assoc_counts.R and sum_assoc_reports.py) +## new_decode flag allows you to switch between two decode algorithm choices +## Note: Only for two way associations +## +## Inputs: inp object (from parsing JSON) with +## count files (2 way counts, individual marginal counts) +## map files (2 variables) +## params file with RAPPOR params +## +## Outputs: Runs simulation of two-way association analysis reading inputs +## from counts, maps, and params file. +##################################################################### +ExternalCounts <- function(inp, verbose = FALSE, metrics_filename = "metrics.csv") { + ptm <- proc.time() + params <- ReadParameterFile(inp$params) + # Ensure sufficient maps as required by number of vars + # Correct map from ReadMapFile() for assoc analysis + stopifnot(inp$numvars == length(inp$maps)) + map <- lapply(inp$maps, function(o) + CorrectMapForAssoc(ReadMapFile(o, params = params), + params = params)) + + # (2 way counts, marginal 1 counts, marginal 2 counts) + counts <- lapply(1:3, function(i) ReadCountsFile(inp$counts[[i]])) + + params2 <- params + params2$k <- (params$k ** 2) * 4 + + # Prune candidates + fit <- lapply(1:2, function(i) + Decode(counts[[i + 1]], + map[[i]]$rmap, + params, quick = FALSE)$fit) + + found_strings = list(fit[[1]][,"string"], fit[[2]][,"string"]) + + if (length(found_strings[[1]]) == 0 || length(found_strings[[2]]) == 0) { + PrintIfVerbose("FOUND_STRINGS", verbose) + PrintIfVerbose(found_strings, verbose) + stop("No strings found in 1-way marginal.") + } + + # Combine maps to feed into Decode2Way + # Prune first to found_strings from Decode on 1-way counts + pruned <- lapply(1:2, function(i) + lapply(map[[i]]$map, function(z) z[,found_strings[[i]], drop = FALSE])) + crmap <- CombineMaps(pruned[[1]], pruned[[2]])$crmap + marginal <- Decode2Way(counts[[1]], crmap, params2, fit = fit)$fit + td <- read.csv(file = inp$truefile, header = TRUE) + td <- table(td[,3:4]) + td <- td / sum(td) + ed <- td + for (cols in colnames(td)) { + for (rows in rownames(td)) { + ed[rows, cols] <- marginal[paste(rows, cols, sep = "x"), "Estimate"] + } + } + ed[is.na(ed)] <- 0 + ed[ed<0] <- 0 + + time_taken <- proc.time() - ptm + + PrintIfVerbose(TVDistance(td, ed, "TV DISTANCE 2 WAY"), verbose) + PrintIfVerbose("PROC.TIME", verbose) + PrintIfVerbose(time_taken, verbose) + chisq_td <- chisq.test(td)[1][[1]][[1]] + chisq_ed <- chisq.test(ed)[1][[1]][[1]] + if(is.nan(chisq_td)) { + chisq_td <- 0 + } + if(is.nan(chisq_ed)) { + chisq_ed <- 0 + } + + metrics <- list( + td_chisq = chisq_td, + ed_chisq = chisq_ed, + tv = TVDistance(td, ed, ""), + time = time_taken[1], + dim1 = length(found_strings[[1]]), + dim2 = length(found_strings[[2]]) + ) + + # Write metrics to metrics_filename (default: metrics.csv) + filename <- file.path(inp$outdir, metrics_filename) + write.csv(metrics, file = filename, row.names = FALSE) +} + +##################################################################### +## +## Externally provided reports +## EM ALGORITHM +## TODO: Also support 3 way association +## +## Inputs: +## +## Outputs: +## +##################################################################### +ExternalReportsEM <- function(inp, + verbose = FALSE, + metrics_filename = "metrics.csv") { + ptm <- proc.time() + params <- ReadParameterFile(inp$params) + # Ensure sufficient maps as required by number of vars + stopifnot(inp$numvars == length(inp$maps)) + # Correct map from ReadMapFile() for assoc analysis + map <- lapply(inp$maps, function(o) + CorrectMapForAssoc(ReadMapFile(o, params = params), + params = params)) + + # Reports must be of the format + # client name, cohort no, rappor bitstring 1, rappor bitstring 2, ... + reportsObj <- read.csv(inp$reports, + colClasses = c("character", "integer", + rep("character", inp$numvars)), + header = TRUE) + # Ignore the first column + reportsObj <- reportsObj[,-1] + + # Parsing reportsObj + # ComputeDistributionEM allows for different sets of cohorts + # for each variable. Here, both sets of cohorts are identical + co <- as.list(reportsObj[1])[[1]] + co <- co + 1 # 1 indexing + cohorts <- rep(list(co), inp$numvars) + # Parse reports from reportObj cols 2, 3, ... + reports <- lapply(1:inp$numvars, function(x) as.list(reportsObj[x + 1])) + + # Split strings into bit arrays (as required by assoc analysis) + reports <- lapply(1:inp$numvars, function(i) { + # apply the following function to each of reports[[1]] and reports[[2]] + lapply(reports[[i]][[1]], function(x) { + # function splits strings and converts them to numeric values + # rev needed for endianness + rev(as.numeric(strsplit(x, split = "")[[1]])) + }) + }) + + joint_dist <- ComputeDistributionEM(reports, cohorts, map, + ignore_other = TRUE, + quick = TRUE, + params, marginals = NULL, + estimate_var = FALSE) + em <- joint_dist$fit + td <- read.csv(file = inp$truefile, header = FALSE) + td <- table(td[,3:4]) + td <- td / sum(td) + time_taken <- proc.time() - ptm + + PrintIfVerbose(TVDistance(td, em, "TV DISTANCE EM"), verbose) + PrintIfVerbose("PROC.TIME", verbose) + PrintIfVerbose(time_taken, verbose) + chisq_td <- chisq.test(td)[1][[1]][[1]] + chisq_ed <- chisq.test(em)[1][[1]][[1]] + if(is.nan(chisq_td)) { + chisq_td <- 0 + } + if(is.nan(chisq_ed)) { + chisq_ed <- 0 + } + + metrics <- list( + td_chisq = chisq_td, + ed_chisq = chisq_ed, + tv = TVDistance(td, em, ""), + time = time_taken[1], + dim1 = dim(em)[[1]], + dim2 = dim(em)[[2]] + ) + + # Write metrics to metrics_filename (default: metrics.csv) + filename <- file.path(inp$outdir, metrics_filename) + write.csv(metrics, file = filename, row.names = FALSE) +} + +main <- function(opts) { + inp <- fromJSON(opts$inp) + verbose_flag <- inp$verbose + # Choose from a set of experiments to run + # direct -> direct simulation of reports (without variances) + # external-counts -> externally supplied counts for 2 way and marginals + # external-reports -> externally supplied reports + + if("direct" %in% inp$expt) { + PrintIfVerbose("Running Experiment Direct", verbose_flag) + DirectSimulationOfReports(inp, verbose = verbose_flag) + } + if ("external-counts" %in% inp$expt) { + PrintIfVerbose("Running Experiment Ext Counts", verbose_flag) + if ("direct" %in% inp$expt) { + # external-counts expt is run to compare results + ExternalCounts(inp, verbose = verbose_flag, metrics_filename = "metrics_2.csv") + } else { + ExternalCounts(inp, verbose = verbose_flag) + } + } + if ("external-reports-em" %in% inp$expt) { + PrintIfVerbose("Running Experiment Ext Reports", verbose_flag) + if (("direct" %in% inp$expt)||("external-counts" %in% inp$expt)) { + # external-reports-em expt is run to compare results + ExternalReportsEM(inp, verbose = verbose_flag, metrics_filename = "metrics_2.csv") + } else { + ExternalReportsEM(inp, verbose = verbose_flag) + } + } +} + +if(!interactive()) { + main(opts) +} From 75120b98390e43869f6338aa1bcda1fc8d090d17 Mon Sep 17 00:00:00 2001 From: Ananth Raghunathan Date: Fri, 24 Jul 2015 11:23:56 -0700 Subject: [PATCH 59/67] Code review changes - threw fitdistribution experimental code into separate function that is now only called by a flag passed to FitDistribution - flag added to assoctest.sh to run comparisons to EM - added package jsonlite to setup - further documentation added in sum_bits_assoc --- analysis/R/decode2way.R | 41 ++++-------- analysis/tools/sum_bits_assoc.py | 24 ++++++++ analysis/tools/sum_bits_assoc_test.py | 2 +- assoctest.sh | 89 ++++++++++++++++----------- setup.sh | 2 +- tests/compare_assoc.R | 2 +- 6 files changed, 91 insertions(+), 69 deletions(-) diff --git a/analysis/R/decode2way.R b/analysis/R/decode2way.R index ce52d341..e8b546fa 100644 --- a/analysis/R/decode2way.R +++ b/analysis/R/decode2way.R @@ -95,7 +95,8 @@ EstimateBloomCounts2Way <- function(params, obs_counts) { # Implements lsei FitDistribution2Way <- function(estimates_stds, map, fit = NULL, - quiet = FALSE) { + quiet = FALSE, + add_constraints = FALSE) { X <- map Y <- as.vector(t(estimates_stds$estimates)) m <- dim(X)[1] @@ -103,25 +104,21 @@ FitDistribution2Way <- function(estimates_stds, map, G <- rbind2(Diagonal(n), rep(-1, n)) H <- c(rep(0, n), -1) - lsei(A = X, B = Y, G = G, H = H, type = 2)$X + if (add_constraints == TRUE) { + res <- AddConstraints(fit, X, Y, m, n, G, H) + lsei(A = res$X, B = res$Y, G = res$G, H = res$H, type = 2)$X + } else { + lsei(A = X, B = Y, G = G, H = H, type = 2)$X + } } -FitDistribution2WayAdditionalConstraints <- function(estimates_stds, map, fit) { +AddConstraints <- function(fit, X, Y, m, n, G, H) { # Experimental code # Computes the same output as FitDistribution by # additionally throwing in constraints corresponding to # 1-way marginals # Requires non-NULL fit as input (with "proportion" containing marginal info) - X <- as.matrix(map) - Y <- as.vector(t(estimates_stds$estimates)) - m <- dim(X)[1] - n <- dim(X)[2] - wt <- 10000 # weight to marginal constraints - - G <- rbind2(Diagonal(n), rep(-1, n)) - H <- c(rep(0, n), -1) - # Adding marginals constraints to X and Y fstrs <- lapply(fit, function(x) x[,"string"]) # found strings @@ -143,24 +140,8 @@ FitDistribution2WayAdditionalConstraints <- function(estimates_stds, map, fit) { vec[indices] <- wt X <- rbind2(X, vec) } - - lsei(A = X, B = Y, G = G, H = H, type = 2)$X - - # Random projection params - # size <- 10 * n - # density <- 0.05 - # rproj <- matrix(0, size, m) - # rproj[sample(length(rproj), size = density * length(rproj))] <- rnorm(density * length(rproj)) - # # rproj <- matrix(rnorm(10*n*m), 10*n, m) - # Xproj <- rproj %*% X - # Yproj <- as.vector(rproj %*% Y) - # mproj <- dim(Xproj)[1] - # nproj <- dim(Xproj)[2] - # - # G <- rbind2(Diagonal(nproj), rep(-1, nproj)) - # H <- c(rep(0, nproj), -1) - # lsei(A = Xproj, B = Yproj, G = G, H = H, type = 2)$X -} + list(X = X, Y = Y, G = G, H = H) +} Decode2Way <- function(counts, map, params, fit = NULL) { k <- params$k diff --git a/analysis/tools/sum_bits_assoc.py b/analysis/tools/sum_bits_assoc.py index 2263b671..8e01d669 100755 --- a/analysis/tools/sum_bits_assoc.py +++ b/analysis/tools/sum_bits_assoc.py @@ -21,6 +21,30 @@ Output counts of bloom filter bits set for each variable (1-way totals) and counts of pairwise bits set (2-way totals) into files with suffixes _marg1.csv, _marg2.csv, _2way.csv respectively. + +The file formats for each of the files are as follows: +_marg1.csv, _marg2.csv +Each row corresponds to a cohort with: +num reports, total count for bit 1, total count for bit 2, ... + +_2way.csv +Each row corresponds to a cohort +The first entry corresponds to total number of reports in that cohort +The next set of values indicate 2 way counts grouped 4 elements at a time: + the first 4 refer to information about bit 1 of irr1 and bit 1 of irr2 + the next 4 refer to information about bit 1 of irr1 and bit 2 of irr2 + ... + the next 4 refer to information about bit 1 of irr1 and bit k of irr2 + the next 4 refer to information about bit 2 of irr1 and bit 1 of irr2 + (pairwise information about tuples is stored in a "1st report"-major order) + ... + the last 4 refer to information about bit k of irr1 and bit k of irr2 + + for each 4-tuple, the values represents the counts for the pair of bits from + irr1 and irr2 having the value: + 11, 01, 10, and 00, respectively. + + See sum_bits_assoc_test.py for an example """ import csv diff --git a/analysis/tools/sum_bits_assoc_test.py b/analysis/tools/sum_bits_assoc_test.py index 11ac2ff1..e5ebb467 100755 --- a/analysis/tools/sum_bits_assoc_test.py +++ b/analysis/tools/sum_bits_assoc_test.py @@ -37,7 +37,7 @@ # EXPECTED_F_2WAY # # NOTE: bit order is reversed. -# First row is 65 zeroes +# First row is 65 zeroes because there are no reports with cohort 0 EXPECTED_F_2WAY = """\ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,\ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0\r diff --git a/assoctest.sh b/assoctest.sh index 17d02df6..06f0c7d4 100755 --- a/assoctest.sh +++ b/assoctest.sh @@ -8,18 +8,25 @@ # At the end, it will print an HTML summary. # # Three main functions are -# run [[ []] - run tests matching in -# parallel, each times. +# run [[ [ []]] - run tests matching in +# parallel, each times, additionally +# running the EM algorithm if = T # -# run-seq [ []] - ditto, except that tests are run sequentially +# run-seq [ [ []]] - ditto, except that tests are run +# sequentially # -# run-all [] - run all tests, in parallel, each times +# run-all [ []] - run all tests, in parallel, +# each times +# +# Note: Patterns always start with a-. # # Examples: -# $ ./assoctest.sh run-seq tiny-8x16- # Sequential run, matches 2 cases -# $ ./assoctest.sh run-seq tiny-8x16- 3 # Sequential, each test is run three -# times -# $ ./assoctest.sh run-all # Run all tests once +# $ ./assoctest.sh run-seq a-toy # Sequential run, matches 2 cases +# $ ./assoctest.sh run-seq a-fizz 3 # Sequential, each test is run three +# times +# $ ./assoctest.sh run-all # Run all tests once +# $ ./assoctest.sh run-all 5 T # Run all tests five times with EM +# comparisons # # The argument is a regex in 'grep -E' format. (Detail: Don't # use $ in the pattern, since it matches the whole spec line and not just the @@ -27,7 +34,6 @@ # # fast_counts param inherited from regtest.sh, but currently not used - set -o nounset set -o pipefail set -o errexit @@ -107,7 +113,7 @@ _run-one-instance() { read -r case_name num_unique_values num_unique_values2 \ num_clients num_extras \ - num_bits num_hashes num_cohorts p q f < $case_dir/spec.txt + num_bits num_hashes num_cohorts p q f compare < $case_dir/spec.txt local instance_dir=$ASSOCTEST_DIR/$test_case/$test_instance mkdir --verbose -p $instance_dir @@ -165,29 +171,34 @@ _run-one-instance() { # substantial) map file. Timing below is more inclusive. TIMEFORMAT='Running analyze.R took %R seconds' - # Setting up JSON file with python - python -c "import json; \ - f = file('$instance_dir/analyze_inp.json', 'w'); \ - inp = dict(); \ - inp['maps'] = ['$case_dir/case_map1.csv',\ - '$case_dir/case_map2.csv']; \ - inp['reports'] = '$instance_dir/case_reports.csv'; \ - inp['truefile'] = '$instance_dir/case.csv'; \ - inp['outdir'] = '$out_dir'; \ - inp['params'] = '$case_dir/case_params.csv'; \ - inp['newalg'] = 'false'; \ - inp['numvars'] = 2; \ - inp['num'] = $num_clients; \ - inp['extras'] = $num_extras; \ - inp['varcandidates'] = [$num_unique_values, $num_unique_values2]; \ - inp['verbose'] = 'true'; \ - inp['counts'] = ['$instance_dir/case_2way.csv',\ - '$instance_dir/case_marg1.csv',\ - '$instance_dir/case_marg2.csv']; \ - inp['expt'] = ['external-counts']; \ - json.dump(inp, f); \ - f.close();" - + # Setting up JSON file + json_file="{\ + \"maps\": [\"$case_dir/case_map1.csv\",\ + \"$case_dir/case_map2.csv\"],\ + \"reports\": \"$instance_dir/case_reports.csv\",\ + \"truefile\": \"$instance_dir/case.csv\",\ + \"outdir\": \"$out_dir\",\ + \"params\": \"$case_dir/case_params.csv\",\ + \"newalg\": \"false\",\ + \"numvars\": 2,\ + \"num\": $num_clients,\ + \"extras\": $num_extras,\ + \"varcandidates\": [$num_unique_values, $num_unique_values2],\ + \"verbose\": \"true\",\ + \"counts\": [\"$instance_dir/case_2way.csv\",\ + \"$instance_dir/case_marg1.csv\",\ + \"$instance_dir/case_marg2.csv\"]," + + # Adding EM comparison depending on $compare flag + if test $compare = F; then + json_file=$json_file"\"expt\": [\"external-counts\"]" + else + json_file=$json_file"\"expt\": [\"external-counts\", \ + \"external-reports-em\"]" + fi + json_file=$json_file"}" + echo $json_file > $instance_dir/analyze_inp.json + time { tests/compare_assoc.R --inp $instance_dir/analyze_inp.json } @@ -254,12 +265,15 @@ _setup-test-instances() { # instances: A number of times each test case is run # parallel: Whether the tests are run in parallel (T/F) # fast_counts: Whether counts are sampled directly (T/F) +# compare: Whether the tests run comparisons between EM and Marginal +# algorithms or not # _run-tests() { local spec_regex=$1 # grep -E format on the spec local instances=$2 local parallel=$3 local fast_counts=$4 + local $compare=$5 rm -r -f --verbose $ASSOCTEST_DIR @@ -270,6 +284,7 @@ _run-tests() { echo $instances echo $parallel echo $fast_counts + echo $compare local func local processors=1 @@ -290,7 +305,7 @@ _run-tests() { fi local cases_list=$ASSOCTEST_DIR/test-cases.txt - tests/assoctest_spec.py | grep -E $spec_regex > $cases_list + tests/assoctest_spec.py | grep -E $spec_regex | sed "s/$/ $compare/" > $cases_list # Generate parameters for all test cases. cat $cases_list \ @@ -314,18 +329,20 @@ _run-tests() { run-seq() { local spec_regex=${1:-'^a-'} # grep -E format on the spec local instances=${2:-1} + local compare=${3:-F} - _run-tests $spec_regex $instances F T + _run-tests $spec_regex $instances F T $compare } # Run tests in parallel run-all() { local instances=${1:-1} + local compare=${2:-F} log "Running all tests. Can take a while." # a- for assoc tests # F for sequential - _run-tests '^a-' $instances T T + _run-tests '^a-' $instances T T $compare } "$@" diff --git a/setup.sh b/setup.sh index 90b6537f..fbaaff41 100755 --- a/setup.sh +++ b/setup.sh @@ -30,7 +30,7 @@ r-packages() { # glmnet, limSolve: solvers for decode.R # RJSONIO: for analysis_tool.R sudo R -e \ - 'install.packages(c("glmnet", "optparse", "limSolve", "RUnit", "abind", "RJSONIO"), repos="http://cran.rstudio.com/")' + 'install.packages(c("glmnet", "optparse", "limSolve", "RUnit", "abind", "RJSONIO", "jsonlite"), repos="http://cran.rstudio.com/")' } # R 3.0.2 on Trusty is out of date with CRAN, so we need this workaround. diff --git a/tests/compare_assoc.R b/tests/compare_assoc.R index 86dad21e..b0b3a718 100755 --- a/tests/compare_assoc.R +++ b/tests/compare_assoc.R @@ -18,7 +18,7 @@ # an EM algorithm to estimate joint distribution over two or more variables # # Usage: -# $ ./analyze_assoc_expt.R --inp +# $ ./compare_assoc.R --inp # # Input file: # Outputs: From 92590b815c5795066c517042f7f4c19b343b120d Mon Sep 17 00:00:00 2001 From: Ananth Raghunathan Date: Fri, 24 Jul 2015 13:03:58 -0700 Subject: [PATCH 60/67] Remove display of compare flag in results. --- tests/make_summary_assoc.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/make_summary_assoc.py b/tests/make_summary_assoc.py index f1b0d1ca..0f1af04d 100755 --- a/tests/make_summary_assoc.py +++ b/tests/make_summary_assoc.py @@ -130,9 +130,9 @@ def ParseSpecFile(spec_filename, empty = False): with open(spec_filename) as s: spec_row = s.readline().split() - spec_in_html = ' '.join('' % cell for cell in spec_row[1:]) + spec_in_html = ' '.join('' % cell for cell in spec_row[1:len(spec_row)-1]) if empty == True: - spec_in_html = ' '.join('' for cell in spec_row[1:]) + spec_in_html = ' '.join('' for cell in spec_row[1:len(spec_row)-1]) return spec_in_html From 21080617e215906ce8779ebdc18cdf71c15df3a2 Mon Sep 17 00:00:00 2001 From: Ananth Raghunathan Date: Mon, 27 Jul 2015 11:16:01 -0700 Subject: [PATCH 61/67] Reconciled with old decode.R for assoc pruning. --- analysis/R/decode.R | 85 ++++++++++++++++++++++++--------------------- 1 file changed, 46 insertions(+), 39 deletions(-) diff --git a/analysis/R/decode.R b/analysis/R/decode.R index ba9eb9c6..fe314cd9 100644 --- a/analysis/R/decode.R +++ b/analysis/R/decode.R @@ -97,26 +97,22 @@ FitLasso <- function(X, Y, intercept = TRUE) { # a vector of size ncol(X) of coefficients. # TODO(mironov): Test cv.glmnet instead of glmnet - - # Cap the number of non-zero coefficients to 500 or 80% of the number of - # constraints, whichever is less. The 500 cap is for performance reasons, 80% - # is to avoid overfitting. - cap <- min(500, nrow(X) * .8, ncol(X)) - - # TODO: take care of corner case when ncol(X) == 1 - # currently glmnet() fails - mod <- glmnet(X, Y, standardize = FALSE, intercept = intercept, - lower.limits = 0, # outputs are non-negative - pmax = cap) - - coefs <- coef(mod) - coefs <- coefs[-1, , drop = FALSE] # drop the intercept - l1cap <- sum(colSums(coefs) <= 1.0) # find all columns with L1 norm <= 1 - if(l1cap > 0) - distr <- coefs[, l1cap] # return the last set of coefficients with L1 <= 1 - else - distr <- setNames(rep(0, ncol(X)), colnames(X)) - distr + mod <- try(glmnet(X, Y, standardize = FALSE, intercept = intercept, + lower.limits = 0, # outputs are non-negative + # Cap the number of non-zero coefficients to 500 or + # 80% of the length of Y, whichever is less. The 500 cap + # is for performance reasons, 80% is to avoid overfitting. + pmax = min(500, length(Y) * .8)), + silent = TRUE) + + # If fitting fails, return an empty data.frame. + if (class(mod)[1] == "try-error") { + coefs <- setNames(rep(0, ncol(X)), colnames(X)) + } else { + coefs <- coef(mod) + coefs <- coefs[-1, ncol(coefs), drop = FALSE] # coefs[1] is the intercept + } + coefs } PerformInference <- function(X, Y, N, mod, params, alpha, correction) { @@ -227,13 +223,30 @@ FitDistribution <- function(estimates_stds, map, quiet = FALSE) { # according to this vector approximates estimates S <- ncol(map) # total number of candidates - lasso <- FitLasso(map, as.vector(t(estimates_stds$estimates))) - - if(!quiet) - cat("LASSO selected ", sum(lasso > 0), " non-zero coefficients.\n") - names(lasso) <- colnames(map) - lasso + support_coefs <- 1:S + + if (S > length(estimates_stds$estimates) * .8) { + # the system is close to being underdetermined + lasso <- FitLasso(map, as.vector(t(estimates_stds$estimates))) + + # Select non-zero coefficients. + support_coefs <- which(lasso > 0) + + if(!quiet) + cat("LASSO selected ", length(support_coefs), " non-zero coefficients.\n") + } + + coefs <- setNames(rep(0, S), colnames(map)) + + if(length(support_coefs) > 0) { # LASSO may return an empty list + constrained_coefs <- ConstrainedLinModel(map[, support_coefs, drop = FALSE], + estimates_stds) + + coefs[support_coefs] <- constrained_coefs + } + + coefs } Resample <- function(e) { @@ -247,7 +260,7 @@ Resample <- function(e) { list(estimates = estimates, stds = stds) } -Decode <- function(counts, map, params, alpha = 0.05, quick = FALSE, +Decode <- function(counts, map, params, alpha = 0.05, correction = c("Bonferroni"), quiet = FALSE, ...) { k <- params$k p <- params$p @@ -273,11 +286,10 @@ Decode <- function(counts, map, params, alpha = 0.05, quick = FALSE, stds = es$stds[filter_cohorts, , drop = FALSE]) coefs_all <- vector() + # Run the fitting procedure several times (5 seems to be sufficient and not # too many) to estimate standard deviation of the output. - if(quick) {num_reps <- 2} else {num_reps <- 5} - for(r in 1:num_reps) - { + for(r in 1:5) { if(r > 1) e <- Resample(estimates_stds_filtered) else @@ -287,23 +299,16 @@ Decode <- function(counts, map, params, alpha = 0.05, quick = FALSE, FitDistribution(e, map[filter_bits, , drop = FALSE], quiet)) } - - FitDistribution(e, map[filter_bits, , drop = FALSE], quiet) + coefs_ssd <- N * apply(coefs_all, 2, sd) # compute sample standard deviations coefs_ave <- N * apply(coefs_all, 2, mean) # Only select coefficients more than two standard deviations from 0. May # inflate empirical SD of the estimates. - reported <- which(coefs_ave > 1E-6 + 1 * coefs_ssd) + reported <- which(coefs_ave > 1E-6 + 2 * coefs_ssd) mod <- list(coefs = coefs_ave[reported], stds = coefs_ssd[reported]) -# Old code ... -# coefs_all <- FitDistribution(estimates_stds_filtered, -# map[filter_bits, , drop = FALSE]) -# reported <- which(coefs_all > 1E-6) -# mod <- list(coefs = coefs_all[reported], stds = rep(0, length(reported))) - if (correction == "Bonferroni") { alpha <- alpha / S } @@ -333,8 +338,10 @@ Decode <- function(counts, map, params, alpha = 0.05, quick = FALSE, # Clamp estimated proportion. pmin/max: vectorized min and max fit$prop_low_95 <- pmax(low_95, 0.0) fit$prop_high_95 <- pmin(high_95, 1.0) + fit <- fit[, c("string", "estimate", "std_error", "proportion", "prop_std_error", "prop_low_95", "prop_high_95")] + allocated_mass <- sum(fit$proportion) num_detected <- nrow(fit) From 22fa769365ece1ceec9497b0926cdf616ea598b8 Mon Sep 17 00:00:00 2001 From: Ananth Raghunathan Date: Mon, 27 Jul 2015 11:19:08 -0700 Subject: [PATCH 62/67] Fixed expected_f_2way in sum bits assoc test --- analysis/tools/sum_bits_assoc_test.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/analysis/tools/sum_bits_assoc_test.py b/analysis/tools/sum_bits_assoc_test.py index e5ebb467..e19b9fed 100755 --- a/analysis/tools/sum_bits_assoc_test.py +++ b/analysis/tools/sum_bits_assoc_test.py @@ -38,34 +38,37 @@ # # NOTE: bit order is reversed. # First row is 65 zeroes because there are no reports with cohort 0 -EXPECTED_F_2WAY = """\ +expected_f_2way = """\ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,\ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0\r """ # Cohort 1 # Total # of reports -EXPECTED_F_2WAY = EXPECTED_F_2WAY + "3," +expected_f_2way = expected_f_2way + "3," # Looking at LSBs of both irrs # Total # of (11, 01, 10, 00) that appear -EXPECTED_F_2WAY = EXPECTED_F_2WAY + "0,0,2,1," +expected_f_2way = expected_f_2way + "0,0,2,1," # Report 1-major order. So looking at LSB of irr1 and 2nd LSB of irr2 -EXPECTED_F_2WAY = EXPECTED_F_2WAY + "2,0,0,1," +expected_f_2way = expected_f_2way + "2,0,0,1," # And so on ... -EXPECTED_F_2WAY = EXPECTED_F_2WAY + "0,0,2,1," -EXPECTED_F_2WAY = EXPECTED_F_2WAY + "2,0,0,1," +expected_f_2way = expected_f_2way + "0,0,2,1," +expected_f_2way = expected_f_2way + "2,0,0,1," # Now moving on to 2nd LSB of irr1 -EXPECTED_F_2WAY = EXPECTED_F_2WAY + ("0,0,2,1,2,0,0,1," * 2) +expected_f_2way = expected_f_2way + ("0,0,2,1,2,0,0,1," * 2) # Now moving on to 3rd LSB of irr1 # Note that for 3rd LSB of irr1 and LSB of irr2, there are three 00s -EXPECTED_F_2WAY = EXPECTED_F_2WAY + ("0,0,0,3,0,2,0,1," * 2) +expected_f_2way = expected_f_2way + ("0,0,0,3,0,2,0,1," * 2) # MSB of irr1 -EXPECTED_F_2WAY = EXPECTED_F_2WAY + "0,0,0,3,0,2,0,1," + "0,0,0,3,0,2,0,1\r\n" +expected_f_2way = expected_f_2way + "0,0,0,3,0,2,0,1," + "0,0,0,3,0,2,0,1\r\n" + +# EXPECTED_F_2WAY is a constant +EXPECTED_F_2WAY = expected_f_2way # end of EXPECTED_F_2WAY # ############################### From 217417204c27e946038cb0f62ac2edb2e6a4d450 Mon Sep 17 00:00:00 2001 From: Ananth Raghunathan Date: Mon, 27 Jul 2015 14:02:59 -0700 Subject: [PATCH 63/67] Wrapper for running quick analysis. --- analysis/R/assoc.R | 171 +++++++++++++++++++++++++++++++++++++++ analysis/R/association.R | 19 ++--- quick_assoc.sh | 75 +++++++++++++++++ 3 files changed, 256 insertions(+), 9 deletions(-) create mode 100755 analysis/R/assoc.R create mode 100755 quick_assoc.sh diff --git a/analysis/R/assoc.R b/analysis/R/assoc.R new file mode 100755 index 00000000..a7dc63d6 --- /dev/null +++ b/analysis/R/assoc.R @@ -0,0 +1,171 @@ +#!/usr/bin/env Rscript +# +# Copyright 2015 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Reads map files, report files, and RAPPOR parameters to run +# an EM algorithm to estimate joint distribution over two or more variables +# +# Usage: +# $ ./assoc.R --inp +# +# Input: JSON file with the following fields +# "maps" for map files of each var +# "reports" for a list of reports +# "counts" for 2 way marginal counts, individual marginal counts +# respectively +# "params" for params file with RAPPOR params +# "csv_out" for a file name into which results will be written +# as comma separated values +# +# Output: A table with joint distribution to stdout and csv file with results + +library("jsonlite") +library("optparse") + +options(stringsAsFactors = FALSE) + +if(!interactive()) { + option_list <- list( + make_option(c("--inp"), default = "inp.json", + help = "JSON file with inputs for assoc.R")) + opts <- parse_args(OptionParser(option_list = option_list)) +} + +source("analysis/R/decode2way.R") +source("analysis/R/encode.R") +source("analysis/R/decode.R") +source("analysis/R/simulation.R") +source("analysis/R/read_input.R") +source("analysis/R/association.R") +source("tests/gen_counts.R") +source("tests/compare_assoc.R") # For CombineMaps; it should be moved elsewhere + +TwoWayAlg <- function(inp) { + ptm <- proc.time() + params <- ReadParameterFile(inp$params) + # Ensure sufficient maps as required by number of vars + # Correct map from ReadMapFile() for assoc analysis + stopifnot(inp$numvars == length(inp$maps)) + map <- lapply(inp$maps, function(o) + CorrectMapForAssoc(ReadMapFile(o, params = params), + params = params)) + + # (2 way counts, marginal 1 counts, marginal 2 counts) + counts <- lapply(1:3, function(i) ReadCountsFile(inp$counts[[i]])) + + # TODO: account for different parameters across different variables + params2 <- params + params2$k <- (params$k ** 2) * 4 + + # Prune candidates + fit <- lapply(1:2, function(i) + Decode(counts[[i + 1]], + map[[i]]$rmap, + params, quick = FALSE)$fit) + + found_strings = list(fit[[1]][,"string"], fit[[2]][,"string"]) + + if (length(found_strings[[1]]) == 0 || length(found_strings[[2]]) == 0) { + stop("No strings found in 1-way marginal.") + } + + # Combine maps to feed into Decode2Way + # Prune first to found_strings from Decode on 1-way counts + pruned <- lapply(1:2, function(i) + lapply(map[[i]]$map, function(z) z[,found_strings[[i]], drop = FALSE])) + crmap <- CombineMaps(pruned[[1]], pruned[[2]])$crmap + marginal <- Decode2Way(counts[[1]], crmap, params2, fit = fit)$fit + + # Reconstruct 2-way table from marginals + ed <- matrix(0, nrow = length(found_strings[[1]]), ncol = length(found_strings[[2]])) + colnames(ed) <- found_strings[[2]] + rownames(ed) <- found_strings[[1]] + for (cols in found_strings[[2]]) { + for (rows in found_strings[[1]]) { + ed[rows, cols] <- marginal[paste(rows, cols, sep = "x"), "Estimate"] + } + } + ed[is.na(ed)] <- 0 + ed[ed<0] <- 0 + + time_taken <- proc.time() - ptm + print("Two Way Algorithm Results") + print(ed[order(-rowSums(ed)), order(-colSums(ed))]) + if(inp$time == TRUE) + print(time_taken) +} + +EMAlg <- function(inp) { + ptm <- proc.time() + params <- ReadParameterFile(inp$params) + # Ensure sufficient maps as required by number of vars + stopifnot(inp$numvars == length(inp$maps)) + # Correct map from ReadMapFile() for assoc analysis + map <- lapply(inp$maps, function(o) + CorrectMapForAssoc(ReadMapFile(o, params = params), + params = params)) + + # Reports must be of the format + # client name, cohort no, rappor bitstring 1, rappor bitstring 2, ... + reportsObj <- read.csv(inp$reports, + colClasses = c("character", "integer", + rep("character", inp$numvars)), + header = TRUE) + # Ignore the first column + reportsObj <- reportsObj[,-1] + + # Parsing reportsObj + # ComputeDistributionEM allows for different sets of cohorts + # for each variable. Here, both sets of cohorts are identical + co <- as.list(reportsObj[1])[[1]] + co <- co + 1 # 1 indexing + cohorts <- rep(list(co), inp$numvars) + # Parse reports from reportObj cols 2, 3, ... + reports <- lapply(1:inp$numvars, function(x) as.list(reportsObj[x + 1])) + + # Split strings into bit arrays (as required by assoc analysis) + reports <- lapply(1:inp$numvars, function(i) { + # apply the following function to each of reports[[1]] and reports[[2]] + lapply(reports[[i]][[1]], function(x) { + # function splits strings and converts them to numeric values + # rev needed for endianness + rev(as.numeric(strsplit(x, split = "")[[1]])) + }) + }) + + joint_dist <- ComputeDistributionEM(reports, cohorts, map, + ignore_other = TRUE, + quick = TRUE, + params, marginals = NULL, + estimate_var = FALSE, + verbose = inp$time) + em <- joint_dist$fit + time_taken <- proc.time() - ptm + print("EM Algorithm Results") + print(em[order(-rowSums(em)), order(-colSums(em))]) + if(inp$time == TRUE) + print(time_taken) +} + +main <- function(opts) { + inp <- fromJSON(opts$inp) + TwoWayAlg(inp) + if(inp$also_em == TRUE) + EMAlg(inp) +} + +if(!interactive()) { + main(opts) +} \ No newline at end of file diff --git a/analysis/R/association.R b/analysis/R/association.R index f2d6f59c..eb561267 100644 --- a/analysis/R/association.R +++ b/analysis/R/association.R @@ -219,8 +219,8 @@ EM <- function(cond_prob, starting_pij = NULL, estimate_var = FALSE, pij[[i + 1]] <- UpdatePij(pij[[i]], cond_prob) dif <- max(abs(pij[[i + 1]] - pij[[i]])) if (i == 1) { - print("ONE ITERATION") - print(proc.time() - ptm_iter) + PrintIfVerbose("ONE ITERATION", verbose) + PrintIfVerbose(proc.time() - ptm_iter, verbose) } if (dif < epsilon) { break @@ -292,7 +292,8 @@ ComputeDistributionEM <- function(reports, report_cohorts, maps, ignore_other = FALSE, params, quick = FALSE, marginals = NULL, - estimate_var = FALSE) { + estimate_var = FALSE, + verbose = FALSE) { # Computes the distribution of num_variables variables, where # num_variables is chosen by the client, using the EM algorithm. # @@ -334,8 +335,8 @@ ComputeDistributionEM <- function(reports, report_cohorts, variable_counts <- ComputeCounts(variable_report, variable_cohort, params) marginal <- Decode(variable_counts, map$rmap, params, quick, quiet = TRUE)$fit - print("TIME IN MARGINALS") - print(proc.time() - ptm2) + PrintIfVerbose("TIME IN MARGINALS", verbose) + PrintIfVerbose(proc.time() - ptm2, verbose) if (nrow(marginal) == 0) { return (NULL) } @@ -373,16 +374,16 @@ ComputeDistributionEM <- function(reports, report_cohorts, # Update the joint conditional distribution of all variables joint_conditional <- UpdateJointConditional(cond_report_dist, joint_conditional) - print("TIME IN COND_REPORT_DIST") - print(proc.time()-ptm) + PrintIfVerbose("TIME IN COND_REPORT_DIST", verbose) + PrintIfVerbose(proc.time()-ptm, verbose) } ptm <- proc.time() # Run expectation maximization to find joint distribution em <- EM(joint_conditional, epsilon = 10 ^ -6, verbose = FALSE, estimate_var = estimate_var) - print("TIME IN EM") - print(proc.time() - ptm) + PrintIfVerbose("TIME IN EM", verbose) + PrintIfVerbose(proc.time() - ptm, verbose) dimnames(em$est) <- found_strings # Return results in a usable format diff --git a/quick_assoc.sh b/quick_assoc.sh new file mode 100755 index 00000000..024e321e --- /dev/null +++ b/quick_assoc.sh @@ -0,0 +1,75 @@ +#!/bin/bash +# +# Quick script to wrap assoc.R +# +# Usage: +# ./quick_assoc.sh [] +# +# For directory name $dir, quick_assoc.sh expects the following files: +# $dir/map1.csv -- map files +# $dir/map2.csv +# $dir/reports.csv -- these are the raw reports +# $dir/params.csv -- parameters file +# +# At the end, it will output results of the Two Way Algorithm and EM algorithm +# (if EM also is set to T) to stdout +# +# Examples: +# $ ./quick_assoc.sh . T + +readonly THIS_DIR=$(dirname $0) +readonly REPO_ROOT=$THIS_DIR +readonly CLIENT_DIR=$REPO_ROOT/client/python +readonly MAP_SUFFIX=map +readonly COUNT_SUFFIX=count + +# All the Python tools need this +export PYTHONPATH=$CLIENT_DIR + +_run-input() { + + # Read reports and compute two way counts + analysis/tools/sum_bits_assoc.py \ + $1/params.csv \ + "$1/$COUNT_SUFFIX" \ + < $1/reports.csv + + # Currently, the summary file shows and aggregates timing of the inference + # engine, which excludes R's loading time and reading of the (possibly + # substantial) map file. Timing below is more inclusive. + TIMEFORMAT='Running analyze.R took %R seconds' + + # Setting up JSON file inp.json in current directory + json_file="{\ + \"time\": false, + \"maps\": [\"$1/${MAP_SUFFIX}1.csv\",\ + \"$1/${MAP_SUFFIX}2.csv\"],\ + \"reports\": \"$1/reports.csv\",\ + \"params\": \"$1/params.csv\",\ + \"numvars\": 2,\ + \"verbose\": \"false\",\ + \"counts\": [\"$1/${COUNT_SUFFIX}_2way.csv\",\ + \"${COUNT_SUFFIX}_marg1.csv\",\ + \"${COUNT_SUFFIX}_marg2.csv\"]," + + # Adding EM comparison depending on flag + if test $2 = T; then + json_file=$json_file"\"also_em\": true" + else + json_file=$json_file"\"also_em\": false" + fi + json_file=$json_file"}" + echo $json_file > inp.json + + time { + analysis/R/assoc.R --inp inp.json + } +} + +main() { + dir=$1 + also_em=${2:-F} + _run-input $dir $also_em +} + +main "$@" From bda727535befb5e9e2b7294f3ef5a8b16d295611 Mon Sep 17 00:00:00 2001 From: Ananth Raghunathan Date: Mon, 27 Jul 2015 14:29:53 -0700 Subject: [PATCH 64/67] Clean up in assoctest.sh --- assoctest.sh | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/assoctest.sh b/assoctest.sh index 06f0c7d4..42e1799f 100755 --- a/assoctest.sh +++ b/assoctest.sh @@ -140,29 +140,6 @@ _run-one-instance() { < $instance_dir/case_reports.csv - # Setting up JSON file containing assoc_sim inputs with python - # Currently unused as true values and RAPPOR'd reports are generated - # running gen_assoc_reports.R, rappor_assoc_sim.py, and sum_bits_assoc.py - python -c "import json; \ - f = file('$instance_dir/assoc_inp.json', 'w'); \ - inp = dict(); \ - inp['params'] = '$case_dir/case_params.csv'; \ - inp['reports'] = '$instance_dir/reports.csv'; \ - inp['true'] = '$instance_dir/truedist.csv'; \ - inp['map'] = '$instance_dir/map'; \ - inp['num'] = $num_clients; \ - inp['extras'] = 0; \ - inp['distr'] = 'zipf2'; \ - inp['prefix'] = './'; \ - inp['vars'] = 2; \ - inp['varcandidates'] = [$num_unique_values, $num_unique_values2]; \ - json.dump(inp, f); \ - f.close();" - - # Currently unused as true values and RAPPOR'd reports are generated - # running gen_assoc_reports.R, rappor_assoc_sim.py, and sum_bits_assoc.py - # tests/assoc_sim_expt.R --inp $instance_dir/assoc_inp.json - local out_dir=${instance_dir}_report mkdir --verbose -p $out_dir From 5e665da877b2be017f3b50691205a3a9c4def289 Mon Sep 17 00:00:00 2001 From: Ananth Raghunathan Date: Tue, 4 Aug 2015 21:00:11 -0700 Subject: [PATCH 65/67] Modifications to work with basic assocations. --- analysis/R/assoc.R | 16 +++++++++++++++- analysis/R/association.R | 14 +++++++------- analysis/R/decode.R | 11 ++++++++--- 3 files changed, 30 insertions(+), 11 deletions(-) diff --git a/analysis/R/assoc.R b/analysis/R/assoc.R index a7dc63d6..db2af246 100755 --- a/analysis/R/assoc.R +++ b/analysis/R/assoc.R @@ -114,9 +114,20 @@ EMAlg <- function(inp) { stopifnot(inp$numvars == length(inp$maps)) # Correct map from ReadMapFile() for assoc analysis map <- lapply(inp$maps, function(o) - CorrectMapForAssoc(ReadMapFile(o, params = params), + CorrectMapForAssoc(LoadMapFile(o, params = params), params = params)) + # For BASIC only + m1 <- lapply(1:params$m, function(z) { + m <- sparseMatrix(c(1), c(2), dims = c(1, 2)) + colnames(m) <- c("FALSE", "TRUE") + m + }) + m2 <- sparseMatrix(1:params$m, rep(2, params$m)) + colnames(m2) <- colnames(m1[[1]]) + map[[2]]$map <- m1 + map[[2]]$rmap <- m2 + # Reports must be of the format # client name, cohort no, rappor bitstring 1, rappor bitstring 2, ... reportsObj <- read.csv(inp$reports, @@ -126,6 +137,9 @@ EMAlg <- function(inp) { # Ignore the first column reportsObj <- reportsObj[,-1] + params = list(params, params) + params[[2]]$k = 1 + # Parsing reportsObj # ComputeDistributionEM allows for different sets of cohorts # for each variable. Here, both sets of cohorts are identical diff --git a/analysis/R/association.R b/analysis/R/association.R index eb561267..56a95749 100644 --- a/analysis/R/association.R +++ b/analysis/R/association.R @@ -332,8 +332,8 @@ ComputeDistributionEM <- function(reports, report_cohorts, variable_counts <- NULL if (is.null(marginals)) { ptm2 <- proc.time() - variable_counts <- ComputeCounts(variable_report, variable_cohort, params) - marginal <- Decode(variable_counts, map$rmap, params, quick, + variable_counts <- ComputeCounts(variable_report, variable_cohort, params[[j]]) + marginal <- Decode(variable_counts, map$rmap, params[[j]], quick, quiet = TRUE)$fit PrintIfVerbose("TIME IN MARGINALS", verbose) PrintIfVerbose(proc.time() - ptm2, verbose) @@ -346,26 +346,26 @@ ComputeDistributionEM <- function(reports, report_cohorts, found_strings[[j]] <- marginal$string if (ignore_other) { - prob_other <- vector(mode = "list", length = params$m) + prob_other <- vector(mode = "list", length = params[[j]]$m) } else { if (is.null(variable_counts)) { variable_counts <- ComputeCounts(variable_report, variable_cohort, - params) + params[[j]]) } prob_other <- GetOtherProbs(variable_counts, map$map, marginal, - params) + params[[j]]) found_strings[[j]] <- c(found_strings[[j]], "Other") } GetCondProb(variable_report[[1]], candidate_strings = rownames(marginal), - params = params, map$map[[variable_cohort[1]]], prob_other[[variable_cohort[1]]]) + params = params[[j]], map$map[[variable_cohort[1]]], prob_other[[variable_cohort[1]]]) # Get the joint conditional distribution cond_report_dist <- lapply(seq(length(variable_report)), function(i) { idx <- variable_cohort[i] rep <- GetCondProb(variable_report[[i]], candidate_strings = rownames(marginal), - params = params, + params = params[[j]], map$map[[idx]], prob_other[[idx]]) rep diff --git a/analysis/R/decode.R b/analysis/R/decode.R index fe314cd9..626274e2 100644 --- a/analysis/R/decode.R +++ b/analysis/R/decode.R @@ -74,9 +74,14 @@ EstimateBloomCounts <- function(params, obs_counts) { # Transform counts from absolute values to fractional, removing bias due to # variability of reporting between cohorts. - ests <- apply(ests, 1, function(x) x / obs_counts[,1]) - stds <- apply(variances^.5, 1, function(x) x / obs_counts[,1]) - + if (ncol(obs_counts) == 2) { + ests <- apply(t(ests), 1, function(x) x / obs_counts[,1]) + stds <- apply(t(variances^.5), 1, function(x) x / obs_counts[,1]) + } else { + ests <- apply((ests), 1, function(x) x / obs_counts[,1]) + stds <- apply((variances^.5), 1, function(x) x / obs_counts[,1]) + } + # Some estimates may be set to infinity, e.g. if f=1. We want to # account for this possibility, and set the corresponding counts # to 0. From 45052b97c750196b988e648fc603c26f3f863b0d Mon Sep 17 00:00:00 2001 From: Ananth Raghunathan Date: Thu, 6 Aug 2015 10:41:45 -0700 Subject: [PATCH 66/67] Rigging old EM code to work with Basic assoc. --- analysis/R/assoc.R | 2 +- analysis/R/association.R | 12 ++++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/analysis/R/assoc.R b/analysis/R/assoc.R index db2af246..662ab1f6 100755 --- a/analysis/R/assoc.R +++ b/analysis/R/assoc.R @@ -160,7 +160,7 @@ EMAlg <- function(inp) { }) joint_dist <- ComputeDistributionEM(reports, cohorts, map, - ignore_other = TRUE, + ignore_other = FALSE, quick = TRUE, params, marginals = NULL, estimate_var = FALSE, diff --git a/analysis/R/association.R b/analysis/R/association.R index 56a95749..328d3292 100644 --- a/analysis/R/association.R +++ b/analysis/R/association.R @@ -44,7 +44,7 @@ GetOtherProbs <- function(counts, map, marginal, params) { # Counts to remove from each cohort. top_counts <- ceiling(marginal$proportion * N / params$m) sum_top <- sum(top_counts) - candidate_map <- lapply(map, function(x) x[, candidate_strings]) + candidate_map <- lapply(map, function(x) x[, candidate_strings, drop = FALSE]) # Counts set by known strings without noise considerations. if (length(marginal) > 0) { @@ -63,6 +63,10 @@ GetOtherProbs <- function(counts, map, marginal, params) { pstar <- (1 - f / 2) * p + (f / 2) * q top_counts_cohort <- (sum_top - top_counts_cohort) * pstar + top_counts_cohort * qstar + + # Adjustment for basic rappor + if(nrow(top_counts_cohort) == 1) + top_counts_cohort <- t(top_counts_cohort) top_counts_cohort <- cbind(sum_top, top_counts_cohort) # Counts set by the "other" category. @@ -72,6 +76,9 @@ GetOtherProbs <- function(counts, map, marginal, params) { props_other[props_other > 1] <- 1 props_other[is.nan(props_other)] <- 0 props_other[is.infinite(props_other)] <- 0 + # Adjustmet for basic rappor + if(is.null(nrow(props_other))) + props_other <- t(props_other) as.list(as.data.frame(props_other)) } @@ -356,9 +363,6 @@ ComputeDistributionEM <- function(reports, report_cohorts, params[[j]]) found_strings[[j]] <- c(found_strings[[j]], "Other") } - - GetCondProb(variable_report[[1]], candidate_strings = rownames(marginal), - params = params[[j]], map$map[[variable_cohort[1]]], prob_other[[variable_cohort[1]]]) # Get the joint conditional distribution cond_report_dist <- lapply(seq(length(variable_report)), function(i) { From bde82f4cf454760d087a61bd69d82bf64e44c384 Mon Sep 17 00:00:00 2001 From: Ananth Raghunathan Date: Tue, 8 Sep 2015 17:52:07 -0700 Subject: [PATCH 67/67] params causes a bug --- tests/compare_assoc.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/compare_assoc.R b/tests/compare_assoc.R index b0b3a718..a8105662 100755 --- a/tests/compare_assoc.R +++ b/tests/compare_assoc.R @@ -473,7 +473,8 @@ ExternalReportsEM <- function(inp, rev(as.numeric(strsplit(x, split = "")[[1]])) }) }) - + + params = list(params, params) joint_dist <- ComputeDistributionEM(reports, cohorts, map, ignore_other = TRUE, quick = TRUE,
Test Case + Input Params RAPPOR Params + Result Metrics
+ + d1: orig. support(var1)
+ d2: orig. support(var2)
n: num reports
e: num extras
+ d1: dimension of var1 solutions.
d2: dimension of var2 solutions.
- td_chisq: chisq test on true distr.
- ed_chisq: chisq test on est. distr.
tv: tot. var. distance
rtime: R runtime
d1d2 n e d1 d2td_chisqed_chisq tv rtime
%(mean_chisqdiff)s %(mean_l1d)s %(mean_rtime)s
%s%s%s%s%s