From dd546e9be5de0595a7e6c232f58568ab1594ae79 Mon Sep 17 00:00:00 2001
From: Ananth Raghunathan <pseudorandom@google.com>
Date: Wed, 29 Apr 2015 11:17:21 -0700
Subject: [PATCH 01/67] Adding some more testing to assoc analysis.

---
 tests/analyze_assoc.R | 19 ++++++++-
 tests/assoc_sim.R     | 89 ++++++++++++++++++++++++++++++++++---------
 tests/gen_counts.R    |  2 +-
 tests/params.csv      |  2 +-
 tests/uvals.csv       |  4 +-
 5 files changed, 94 insertions(+), 22 deletions(-)

diff --git a/tests/analyze_assoc.R b/tests/analyze_assoc.R
index 5d78806f..56b66ea0 100755
--- a/tests/analyze_assoc.R
+++ b/tests/analyze_assoc.R
@@ -114,9 +114,26 @@ main <- function(opts) {
                                       ignore_other = TRUE,
                                       params, marginals = NULL,
                                       estimate_var = FALSE)
+  
+  # Hardcoded place to lookup true distribution
+  # TODO(pseudorandom): Make this a flag
+  td <- read.csv(file = "truedist.csv")
+  ed <- joint_dist$fit
+  
+  # L1 distance = 1 - sum(min(td|x, ed|x)) where
+  # td|x / ed|x projects the distribution to the intersection x of the
+  # supports of td and ed
+  rowsi <- intersect(rownames(td), rownames(ed))
+  colsi <- intersect(colnames(td), colnames(ed))
+  print("L1 DISTANCE")
+  print(1 - sum(mapply(min,
+                  unlist(td[rowsi, colsi], use.names = FALSE),
+                  unlist(as.data.frame(ed)[rowsi, colsi], use.names = FALSE)
+                   )))
+  
   # TODO(pseudorandom): Export the results to a file for further analysis
   print("JOINT_DIST$FIT")
-  print(joint_dist$fit)
+  print(signif(ed[order(rowSums(ed)),], 4))
   print("PROC.TIME")
   print(proc.time() - ptm)
 }
diff --git a/tests/assoc_sim.R b/tests/assoc_sim.R
index 3ff1e5df..6ac2b857 100755
--- a/tests/assoc_sim.R
+++ b/tests/assoc_sim.R
@@ -41,12 +41,17 @@ if(!interactive()) {
                 help = "Filename for RAPPOR parameters"),
     make_option(c("--reports", "-r"), default = "reports.csv",
                 help = "Filename for reports"),
+    make_option(c("--true", "-t"), default = "truedist.csv",
+                help = "Filename for the true distribution"),
     make_option(c("--map", "-m"), default = "map",
                 help = "Filename *prefix* for map(s)"),
     make_option(c("--num", "-n"), default = 1e05,
                 help = "Number of reports"),
-    make_option(c("--unif", "-u"), default = FALSE,
-                help = "Run simulation with uniform distribution")
+    make_option(c("--extras", "-e"), default = TRUE,
+                help = "Does 1st map have spurious candidates?"),
+    make_option(c("--distr", "-d"), default = "zipfg",
+                help = "Type of distribution. Choose between 
+                {unif, poisson, poisson2}")
   )
   opts <- parse_args(OptionParser(option_list = option_list))
 }    
@@ -56,6 +61,7 @@ source("../analysis/R/decode.R")
 source("../analysis/R/simulation.R")
 source("../analysis/R/read_input.R")
 source("../analysis/R/association.R")
+source("../tests/gen_counts.R")
 
 # Read unique values of reports from a csv file
 # Inputs: filename. The file is expected to contain two rows of strings
@@ -83,28 +89,66 @@ GetUniqueValsFromFile <- function(filename) {
 # Inputs: N = number of reports
 #         uvals = list containing a list of unique values
 #         params = list with RAPPOR parameters
-#         unif = whether to replace poisson with uniform
+#         distr = the type of distribution to use
+#                 {unif, poisson, poisson2, zipfg}
+#         extras = whether map_1.csv has spurious candidates or not
 #         mapfile = file to write maps into (with .csv suffixes)
 #         reportsfile = file to write reports into (with .csv suffix)
-SimulateReports <- function(N, uvals, params, unif,
+SimulateReports <- function(N, uvals, params, distr, extras, truefile,
                             mapfile, reportsfile) {
   # Compute true distribution
   m <- params$m  
 
-  if (unif) {
+  if (distr == "unif") {
     # Draw uniformly from 1 to 10
     v1_samples <- as.integer(runif(N, 1, 10))
-  } else {
+    
+    # Pr[var2 = N + 1 | var1 = N] = 0.5
+    # Pr[var2 = N     | var1 = N] = 0.5
+    v2_samples <- v1_samples + sample(c(0, 1), N, replace = TRUE)
+
+  } else if(distr == "poisson") {
     # Draw from a Poisson random variable
     v1_samples <- rpois(N, 1) + rep(1, N)
+
+    # Pr[var2 = N + 1 | var1 = N] = 0.5
+    # Pr[var2 = N     | var1 = N] = 0.5
+    v2_samples <- v1_samples + sample(c(0, 1), N, replace = TRUE)
+  } else if (distr == "poisson2") {
+
+    v1_samples <- rpois(N, 1) + rep(1, N)
+    # supp(var2) = {1, 2}
+    # Pr[var2 = 1 | var1 = even] = 0.75
+    # Pr[var2 = 1 | var1 = odd]  = 0.25
+    pr25 <- rbinom(N, 1, 0.25) + 1
+    pr75 <- rbinom(N, 1, 0.75) + 1
+    v2_samples <- rep(1, N)
+    v2_samples[v1_samples %% 2 == 0] <- pr25[v1_samples %% 2 == 0]
+    v2_samples[v1_samples %% 2 == 1] <- pr75[v1_samples %% 2 == 1]
+  } else if (distr == "zipfg") {
+
+    # Zipfian over 25 strings
+    partition <- RandomPartition(N, ComputePdf("zipf1.5", 25))
+    v1_samples <- rep(1:25, partition)  # expand partition
+    # Shuffle values randomly (may take a few sec for > 10^8 inputs)
+    v1_samples <- sample(v1_samples)
+
+    # supp(var2) = {1, 2, 3, 4, 6}
+    # We look at two zipfian distributions over supp(var2)
+    # D1 = zipfian distribution
+    # D2 = zipfian distr over {6, 5, 4, 3, 2, 1}
+    # (i.e., D1 in reverse)
+    # var2 ~ D1 if var1 = even
+    # var2 ~ D2 if var1 = odd
+    d1 <- sample(rep(1:6, RandomPartition(N, ComputePdf("zipf1.5", 6))))
+    d2 <- c(6, 5, 4, 3, 2, 1)[d1]
+    v2_samples <- rep(1, N)
+    v2_samples[v1_samples %% 2 == 0] <- d1[v1_samples %% 2 == 0]
+    v2_samples[v1_samples %% 2 == 1] <- d2[v1_samples %% 2 == 1] 
   }
-  
-  # Pr[var2 = N + 1 | var1 = N] = 0.5
-  # Pr[var2 = N     | var1 = N] = 0.5
-  v2_samples <- v1_samples + sample(c(0, 1), N, replace = TRUE)
-  
+
   tmp_samples <- list(v1_samples, v2_samples)
-  
+
   # Function to pad strings to uval_vec if sample_vec has
   # larger support than the number of strings in uval_vec
   # For e.g., if samples have support {1, 2, 3, 4, ...} and uvals
@@ -122,21 +166,31 @@ SimulateReports <- function(N, uvals, params, unif,
     }
     uval_vec
   }
-  
+
   # Pad and update uvals
   uvals <- lapply(1:2, function(i) PadStrings(tmp_samples[[i]],
                                               uvals[[i]]))
-
   # Replace integers in tmp_samples with actual sample strings
   samples <- lapply(1:2, function(i) uvals[[i]][tmp_samples[[i]]])
 
+  print("TRUE DISTR")
+  td <- table(samples)/sum(table(samples))
+  td <- td[order(rowSums(td), decreasing = TRUE),]
+  print(td)
+  write.table(td, file = truefile, sep = ",", col.names = TRUE,
+              row.names = TRUE, quote = FALSE)
   # Randomly assign cohorts in each dimension
   cohorts <- sample(1:m, N, replace = TRUE)
   
   # Create and write map into mapfile_1.csv and mapfile_2.csv
+  if (extras == TRUE) {
+    # 1000 spurious candidates for mapfile_1.csv
+    len <- length(uvals[[1]]) + 1000
+    uvals[[1]] <- PadStrings(len, uvals[[1]])
+  }
   map <- lapply(uvals, function(u) CreateMap(u, params))
   write.table(map[[1]]$map_pos, file = paste(mapfile, "_1.csv", sep = ""),
-              sep = ",", col.names = FALSE, na = "", quote = FALSE)
+                sep = ",", col.names = FALSE, na = "", quote = FALSE)
   write.table(map[[2]]$map_pos, file = paste(mapfile, "_2.csv", sep = ""),
               sep = ",", col.names = FALSE, na = "", quote = FALSE)
   
@@ -160,8 +214,9 @@ main <- function(opts) {
   
   uvals <- GetUniqueValsFromFile(opts$uvals)
   params <- ReadParameterFile(opts$params)
-  SimulateReports(opts$num, uvals, params,  opts$unif, # inputs
-                  opts$map, opts$reports)              # outputs
+  SimulateReports(opts$num, uvals, params,  opts$distr, # inuts
+                  opts$extras,  opts$true,              # inputs
+                  opts$map, opts$reports)               # outputs
   
   print("PROC.TIME")
   print(proc.time() - ptm)
diff --git a/tests/gen_counts.R b/tests/gen_counts.R
index e947a5cf..4c8359f6 100755
--- a/tests/gen_counts.R
+++ b/tests/gen_counts.R
@@ -14,7 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-source('analysis/R/read_input.R')
+source('../analysis/R/read_input.R')
 
 RandomPartition <- function(total, weights) {
   # Outputs a random partition according to a specified distribution
diff --git a/tests/params.csv b/tests/params.csv
index a2114c90..0dd2c58c 100644
--- a/tests/params.csv
+++ b/tests/params.csv
@@ -1,2 +1,2 @@
 k, h, m, p, q, f
-16, 2, 4, 0.1, 0.9, 0.2
+16, 2, 64, 0.1, 0.9, 0.2
diff --git a/tests/uvals.csv b/tests/uvals.csv
index cebc17ec..986e994f 100644
--- a/tests/uvals.csv
+++ b/tests/uvals.csv
@@ -1,2 +1,2 @@
-google.com,intel.com,yahoo.com
-ssl,nossl
+str1
+option1,option2,option3,option4,option5,option6

From 62375c6ac4ebed6b6fff5b0484972f1200e0af92 Mon Sep 17 00:00:00 2001
From: Ananth Raghunathan <pseudorandom@google.com>
Date: Thu, 30 Apr 2015 13:59:41 -0700
Subject: [PATCH 02/67] Adding assoctest.sh.

---
 assoctest.sh       | 362 +++++++++++++++++++++++++++++++++++++++++++++
 tests/gen_counts.R |   2 +-
 2 files changed, 363 insertions(+), 1 deletion(-)
 create mode 100755 assoctest.sh

diff --git a/assoctest.sh b/assoctest.sh
new file mode 100755
index 00000000..95805201
--- /dev/null
+++ b/assoctest.sh
@@ -0,0 +1,362 @@
+#!/bin/bash
+#
+# Run end-to-end tests in parallel.
+#
+# Usage:
+#   ./regtest.sh <function name>
+
+# At the end, it will print an HTML summary.
+# 
+# Three main functions are 
+#    run [[<pattern> [<num> [<fast>]] - run tests matching <pattern> in
+#                                       parallel, each <num> times. The fast 
+#                                       mode (T/F) shortcuts generation of 
+#                                       reports.
+#    run-seq [<pattern> [<num> [<fast>]] - ditto, except that tests are run
+#                                       sequentially
+#    run-all [<num>]              - run all tests, in parallel, each <num> times
+#
+# Examples:
+# $ ./regtest.sh run-seq unif-small-typical  # Sequential run, matches 1 case
+# $ ./regtest.sh run-seq unif-small- 3 F  # Sequential, each test is run three
+#                                           times, using slow generation
+# $ ./regtest.sh run unif-  # Parallel run, matches multiple cases
+# $ ./regtest.sh run unif- 5 # Parallel run, matches multiple cases, each test 
+#                              is run 5 times
+# $ ./regtest.sh run-all     # Run all tests once
+#
+# The <pattern> argument is a regex in 'grep -E' format. (Detail: Don't
+# use $ in the pattern, since it matches the whole spec line and not just the
+# test case name.) The number of processors used in a parallel run is one less
+# than the number of CPUs on the machine.
+
+
+# Future speedups:
+# - Reuse the same input -- come up with naming scheme based on params
+# - Reuse the same maps -- ditto, rappor library can cache it
+
+set -o nounset
+set -o pipefail
+set -o errexit
+
+. util.sh
+
+readonly THIS_DIR=$(dirname $0)
+readonly REPO_ROOT=$THIS_DIR
+readonly CLIENT_DIR=$REPO_ROOT/client/python
+readonly REGTEST_DIR=_tmp/regtest
+readonly ASSOCTEST_DIR=_tmp/assoctest
+
+# All the Python tools need this
+export PYTHONPATH=$CLIENT_DIR
+
+#print-true-inputs() {
+#  local num_unique_values=$1
+#  seq 1 $num_unique_values | awk '{print "v" $1}'
+#}
+
+# Add some more candidates here.  We hope these are estimated at 0.
+# e.g. if add_start=51, and num_additional is 20, show v51-v70
+#more-candidates() {
+#  local last_true=$1
+#  local num_additional=$2
+#
+#  local begin
+#  local end
+#  begin=$(expr $last_true + 1)
+#  end=$(expr $last_true + $num_additional)
+#
+#  seq $begin $end | awk '{print "v" $1}'
+#}
+
+# Args:
+#   true_inputs: File of true inputs
+#   last_true: last true input, e.g. 50 if we generated "v1" .. "v50".
+#   num_additional: additional candidates to generate (starting at 'last_true')
+#   to_remove: Regex of true values to omit from the candidates list, or the
+#     string 'NONE' if none should be.  (Our values look like 'v1', 'v2', etc. so
+#     there isn't any ambiguity.)
+#print-candidates() {
+#  local true_inputs=$1
+#  local last_true=$2
+#  local num_additional=$3 
+#  local to_remove=$4
+#
+#  if test $to_remove = NONE; then
+#    cat $true_inputs  # include all true inputs
+#  else
+#    egrep -v $to_remove $true_inputs  # remove some true inputs
+#  fi
+#  more-candidates $last_true $num_additional
+#}
+
+# Generate a single test case, specified by a line of the test spec.
+# This is a helper function for _run-tests().
+_setup-one-case() {
+  local test_case=$1
+
+  # input params
+  local dist=$2
+  local num_unique_values=$3
+  local num_clients=$4
+  local values_per_client=$5
+
+  # RAPPOR params
+  local num_bits=$6
+  local num_hashes=$7
+  local num_cohorts=$8
+  local p=$9
+  local q=${10}  # need curly braces to get the 10th arg
+  local f=${11}
+
+  # map params
+  local num_additional=${12}
+  local to_remove=${13}
+
+  banner 'Setting up parameters and candidate files for '$test_case
+
+  local case_dir=$REGTEST_DIR/$test_case
+  mkdir --verbose -p $case_dir
+
+  # Save the "spec"
+  echo "$@" > $case_dir/spec.txt
+
+  local params_path=$case_dir/case_params.csv
+
+  echo 'k,h,m,p,q,f' > $params_path
+  echo "$num_bits,$num_hashes,$num_cohorts,$p,$q,$f" >> $params_path
+
+  print-true-inputs $num_unique_values > $case_dir/case_true_inputs.txt
+
+  local true_map_path=$case_dir/case_true_map.csv
+
+  analysis/tools/hash_candidates.py \
+    $params_path \
+    < $case_dir/case_true_inputs.txt \
+    > $true_map_path
+
+  # banner "Constructing candidates"
+
+  print-candidates \
+    $case_dir/case_true_inputs.txt $num_unique_values \
+    $num_additional "$to_remove" \
+    > $case_dir/case_candidates.txt
+
+  # banner "Hashing candidates to get 'map'"
+
+  analysis/tools/hash_candidates.py \
+    $case_dir/case_params.csv \
+    < $case_dir/case_candidates.txt \
+    > $case_dir/case_map.csv
+}
+
+# Run a single test instance, specified by <test_name, instance_num>.
+# This is a helper function for _run-tests().
+_run-one-instance() {
+  local test_case=$1
+  local test_instance=$2
+  local fast_counts=$3
+
+  local case_dir=$REGTEST_DIR/$test_case
+  
+  read -r case_name distr num_unique_values num_clients \
+    values_per_client num_bits num_hashes num_cohorts p q f num_additional \
+    to_remove < $case_dir/spec.txt
+
+  local instance_dir=$REGTEST_DIR/$test_case/$test_instance
+  mkdir --verbose -p $instance_dir
+
+  if test $fast_counts = T; then
+    local params_file=$case_dir/case_params.csv
+    local true_map_file=$case_dir/case_true_map.csv
+
+    banner "Using gen_counts.R"
+
+    tests/gen_counts.R $distr $num_clients $values_per_client $params_file \
+                       $true_map_file "$instance_dir/case"
+  else
+    banner "Generating input"
+
+    tests/gen_reports.R $distr $num_unique_values $num_clients \
+                        $values_per_client $instance_dir/case.csv
+
+    banner "Running RAPPOR client"
+
+    # Writes encoded "out" file, true histogram, true inputs to $instance_dir.
+    tests/rappor_sim.py \
+      --num-bits $num_bits \
+      --num-hashes $num_hashes \
+      --num-cohorts $num_cohorts \
+      -p $p \
+      -q $q \
+      -f $f \
+      -i $instance_dir/case.csv \
+      --out-prefix "$instance_dir/case"
+
+    banner "Summing bits to get 'counts'"
+
+    analysis/tools/sum_bits.py \
+      $case_dir/case_params.csv \
+      < $instance_dir/case_out.csv \
+      > $instance_dir/case_counts.csv
+  fi
+
+  local out_dir=${instance_dir}_report
+  mkdir --verbose -p $out_dir
+
+  # Currently, the summary file shows and aggregates timing of the inference
+  # engine, which excludes R's loading time and reading of the (possibly 
+  # substantial) map file. Timing below is more inclusive.
+  TIMEFORMAT='Running analyze.R took %R seconds'
+  time {
+    # Input prefix, output dir
+    tests/analyze.R -t "Test case: $test_case (instance $test_instance)" \
+                       "$case_dir/case" "$instance_dir/case" $out_dir
+  }
+}
+
+# Like _run-one-instance, but log to a file.
+_run-one-instance-logged() {
+  local test_case_id=$1
+  local test_case_run=$2
+
+  local log_dir=$REGTEST_DIR/$test_case_id/${test_case_run}_report
+  mkdir --verbose -p $log_dir
+
+  log "Started '$test_case_id' (instance $test_case_run) -- logging to $log_dir/log.txt"
+  _run-one-instance "$@" >$log_dir/log.txt 2>&1 \
+    && log "Test case $test_case_id (instance $test_case_run) done" \
+    || log "Test case $test_case_id (instance $test_case_run) failed"
+}
+
+#make-summary() {
+#  local dir=$1
+#  local filename=${2:-results.html}
+#
+#  tests/make_summary.py $dir > $dir/rows.html
+#
+#  pushd $dir >/dev/null
+#
+#  cat ../../tests/regtest.html \
+#    | sed -e '/TABLE_ROWS/ r rows.html' \
+#    > $filename
+#
+#  popd >/dev/null
+#
+#  log "Wrote $dir/$filename"
+#  log "URL: file://$PWD/$dir/$filename"
+#}
+
+test-error() {
+  local spec_regex=${1:-}
+  log "Some test cases failed"
+  if test -n "$spec_regex"; then
+    log "(Perhaps none matched pattern '$spec_regex')"
+  fi
+  # don't quit just yet
+  # exit 1 
+}
+
+# Assuming the spec file, write a list of test case names (first column) with
+# the instance ids (second column), where instance ids run from 1 to $1.
+# Third column is fast_counts (T/F).
+#_setup-test-instances() {
+#  local instances=$1
+#  local fast_counts=$2
+#
+#  while read line; do
+#    for i in $(seq 1 $instances); do
+#      read case_name _ <<< $line  # extract the first token
+#      echo $case_name $i $fast_counts
+#    done
+#  done
+#}
+
+# Args:
+#   regexp: A pattern selecting the subset of tests to run
+#   instances: A number of times each test case is run
+#   parallel: Whether the tests are run in parallel (T/F)
+#   fast_counts: Whether counts are sampled directly (T/F)
+#
+_run-tests() {
+  local spec_regex=$1  # grep -E format on the spec
+  local instances=$2
+  local parallel=$3
+  local fast_counts=$4
+
+  rm -r -f --verbose $ASSOCTEST_DIR
+
+  mkdir --verbose -p $ASSOCTEST_DIR
+
+  echo "PARAMS"
+  echo $spec_regex
+  echo $instances
+  echo $parallel
+  echo $fast_counts
+
+  local func
+  local processors=1
+
+  if test $parallel = F; then
+    func=_run-one-instance   output to the console
+  else
+    func=_run-one-instance-logged
+    processors=$(grep -c ^processor /proc/cpuinfo || echo 4)  # POSIX-specific
+    if test $processors -gt 1; then  # leave one CPU for the OS
+      processors=$(expr $processors - 1)
+    fi
+    log "Running $processors parallel processes"
+  fi
+
+  echo "FUNC"
+  echo $func
+
+  local cases_list=$ASSOCTEST_DIR/test-cases.txt
+  tests/regtest_spec.py | grep -E $spec_regex > $cases_list
+  break
+
+  # Generate parameters for all test cases.
+  cat $cases_list \
+    | xargs -l -P $processors -- $0 _setup-one-case \
+    || test-error
+
+  log "Done generating parameters for all test cases"
+
+  local instances_list=$REGTEST_DIR/test-instances.txt
+  _setup-test-instances $instances $fast_counts < $cases_list > $instances_list 
+
+  cat $instances_list \
+    | xargs -l -P $processors -- $0 $func || test-error
+
+  log "Done running all test instances"
+
+  make-summary $REGTEST_DIR
+}
+
+# Run tests sequentially
+#run-seq() {
+#  local spec_regex=${1:-'^r-'}  # grep -E format on the spec
+#  local instances=${2:-1}
+#  local fast_counts=${3:-T}
+#
+#  _run-tests $spec_regex $instances F $fast_counts
+#}
+
+# Run tests in parallel
+#run() {
+#  local spec_regex=${1:-'^r-'}  # grep -E format on the spec
+#  local instances=${2:-1}
+#  local fast_counts=${3:-T}
+#  
+#  _run-tests $spec_regex $instances T $fast_counts 
+#}
+
+# Run tests in parallel
+run-all() {
+  local instances=${1:-1}
+
+  log "Running all tests. Can take a while."
+  _run-tests '^r-' $instances T T
+}
+
+"$@"
diff --git a/tests/gen_counts.R b/tests/gen_counts.R
index 4c8359f6..e947a5cf 100755
--- a/tests/gen_counts.R
+++ b/tests/gen_counts.R
@@ -14,7 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-source('../analysis/R/read_input.R')
+source('analysis/R/read_input.R')
 
 RandomPartition <- function(total, weights) {
   # Outputs a random partition according to a specified distribution

From 935309ee56d0f333f0670c4f05cb0abf857a1673 Mon Sep 17 00:00:00 2001
From: Ananth Raghunathan <pseudorandom@google.com>
Date: Wed, 6 May 2015 17:47:46 -0700
Subject: [PATCH 03/67] Assoctest.sh test suite.

---
 assoctest.sh          | 154 ++++++++++++++----------------------------
 tests/analyze_assoc.R |  77 ++++++++++++---------
 tests/assoc_sim.R     |  45 ++++++++----
 tests/regtest_spec.py |  27 ++++++++
 tests/uvals.csv       |   2 +-
 5 files changed, 156 insertions(+), 149 deletions(-)

diff --git a/assoctest.sh b/assoctest.sh
index 95805201..7b806ef8 100755
--- a/assoctest.sh
+++ b/assoctest.sh
@@ -6,11 +6,11 @@
 #   ./regtest.sh <function name>
 
 # At the end, it will print an HTML summary.
-# 
-# Three main functions are 
+#
+# Three main functions are
 #    run [[<pattern> [<num> [<fast>]] - run tests matching <pattern> in
-#                                       parallel, each <num> times. The fast 
-#                                       mode (T/F) shortcuts generation of 
+#                                       parallel, each <num> times. The fast
+#                                       mode (T/F) shortcuts generation of
 #                                       reports.
 #    run-seq [<pattern> [<num> [<fast>]] - ditto, except that tests are run
 #                                       sequentially
@@ -21,7 +21,7 @@
 # $ ./regtest.sh run-seq unif-small- 3 F  # Sequential, each test is run three
 #                                           times, using slow generation
 # $ ./regtest.sh run unif-  # Parallel run, matches multiple cases
-# $ ./regtest.sh run unif- 5 # Parallel run, matches multiple cases, each test 
+# $ ./regtest.sh run unif- 5 # Parallel run, matches multiple cases, each test
 #                              is run 5 times
 # $ ./regtest.sh run-all     # Run all tests once
 #
@@ -79,7 +79,7 @@ export PYTHONPATH=$CLIENT_DIR
 #print-candidates() {
 #  local true_inputs=$1
 #  local last_true=$2
-#  local num_additional=$3 
+#  local num_additional=$3
 #  local to_remove=$4
 #
 #  if test $to_remove = NONE; then
@@ -98,8 +98,9 @@ _setup-one-case() {
   # input params
   local dist=$2
   local num_unique_values=$3
-  local num_clients=$4
-  local values_per_client=$5
+  local num_unique_values2=$4
+  local num_clients=$5
+  local values_per_client=$6
 
   # RAPPOR params
   local num_bits=$6
@@ -109,13 +110,9 @@ _setup-one-case() {
   local q=${10}  # need curly braces to get the 10th arg
   local f=${11}
 
-  # map params
-  local num_additional=${12}
-  local to_remove=${13}
-
   banner 'Setting up parameters and candidate files for '$test_case
 
-  local case_dir=$REGTEST_DIR/$test_case
+  local case_dir=$ASSOCTEST_DIR/$test_case
   mkdir --verbose -p $case_dir
 
   # Save the "spec"
@@ -125,29 +122,6 @@ _setup-one-case() {
 
   echo 'k,h,m,p,q,f' > $params_path
   echo "$num_bits,$num_hashes,$num_cohorts,$p,$q,$f" >> $params_path
-
-  print-true-inputs $num_unique_values > $case_dir/case_true_inputs.txt
-
-  local true_map_path=$case_dir/case_true_map.csv
-
-  analysis/tools/hash_candidates.py \
-    $params_path \
-    < $case_dir/case_true_inputs.txt \
-    > $true_map_path
-
-  # banner "Constructing candidates"
-
-  print-candidates \
-    $case_dir/case_true_inputs.txt $num_unique_values \
-    $num_additional "$to_remove" \
-    > $case_dir/case_candidates.txt
-
-  # banner "Hashing candidates to get 'map'"
-
-  analysis/tools/hash_candidates.py \
-    $case_dir/case_params.csv \
-    < $case_dir/case_candidates.txt \
-    > $case_dir/case_map.csv
 }
 
 # Run a single test instance, specified by <test_name, instance_num>.
@@ -155,63 +129,44 @@ _setup-one-case() {
 _run-one-instance() {
   local test_case=$1
   local test_instance=$2
-  local fast_counts=$3
 
-  local case_dir=$REGTEST_DIR/$test_case
-  
-  read -r case_name distr num_unique_values num_clients \
-    values_per_client num_bits num_hashes num_cohorts p q f num_additional \
-    to_remove < $case_dir/spec.txt
+  local case_dir=$ASSOCTEST_DIR/$test_case
 
-  local instance_dir=$REGTEST_DIR/$test_case/$test_instance
-  mkdir --verbose -p $instance_dir
+  read -r case_name case_descr num_unique_values num_unique_values2 \
+    num_clients num_bits num_hashes num_cohorts p q f < $case_dir/spec.txt
 
-  if test $fast_counts = T; then
-    local params_file=$case_dir/case_params.csv
-    local true_map_file=$case_dir/case_true_map.csv
+  local instance_dir=$ASSOCTEST_DIR/$test_case/$test_instance
+  mkdir --verbose -p $instance_dir
 
-    banner "Using gen_counts.R"
+  banner "Running association input simulation"
 
-    tests/gen_counts.R $distr $num_clients $values_per_client $params_file \
-                       $true_map_file "$instance_dir/case"
-  else
-    banner "Generating input"
-
-    tests/gen_reports.R $distr $num_unique_values $num_clients \
-                        $values_per_client $instance_dir/case.csv
-
-    banner "Running RAPPOR client"
-
-    # Writes encoded "out" file, true histogram, true inputs to $instance_dir.
-    tests/rappor_sim.py \
-      --num-bits $num_bits \
-      --num-hashes $num_hashes \
-      --num-cohorts $num_cohorts \
-      -p $p \
-      -q $q \
-      -f $f \
-      -i $instance_dir/case.csv \
-      --out-prefix "$instance_dir/case"
-
-    banner "Summing bits to get 'counts'"
-
-    analysis/tools/sum_bits.py \
-      $case_dir/case_params.csv \
-      < $instance_dir/case_out.csv \
-      > $instance_dir/case_counts.csv
-  fi
+  tests/assoc_sim.R \
+    -p $case_dir/case_params.csv \
+    -r $instance_dir/reports.csv \
+    -t $instance_dir/truedist.csv \
+    -m $instance_dir/map \
+    -n $num_clients \
+    --var1_num $num_unique_values \
+    --var2_num $num_unique_values2
 
   local out_dir=${instance_dir}_report
   mkdir --verbose -p $out_dir
 
   # Currently, the summary file shows and aggregates timing of the inference
-  # engine, which excludes R's loading time and reading of the (possibly 
+  # engine, which excludes R's loading time and reading of the (possibly
   # substantial) map file. Timing below is more inclusive.
   TIMEFORMAT='Running analyze.R took %R seconds'
   time {
+    tests/analyze_assoc.R \
+      --map1 $instance_dir/map_1.csv \
+      --map2 $instance_dir/map_2.csv \
+      --reports $instance_dir/reports.csv \
+      --truefile $instance_dir/truedist.csv \
+      --outdir $out_dir \
+      --params $case_dir/case_params.csv
     # Input prefix, output dir
-    tests/analyze.R -t "Test case: $test_case (instance $test_instance)" \
-                       "$case_dir/case" "$instance_dir/case" $out_dir
+#    tests/analyze.R -t "Test case: $test_case (instance $test_instance)" \
+#                       "$case_dir/case" "$instance_dir/case" $out_dir
   }
 }
 
@@ -220,7 +175,7 @@ _run-one-instance-logged() {
   local test_case_id=$1
   local test_case_run=$2
 
-  local log_dir=$REGTEST_DIR/$test_case_id/${test_case_run}_report
+  local log_dir=$ASSOCTEST_DIR/$test_case_id/${test_case_run}_report
   mkdir --verbose -p $log_dir
 
   log "Started '$test_case_id' (instance $test_case_run) -- logging to $log_dir/log.txt"
@@ -254,23 +209,21 @@ test-error() {
     log "(Perhaps none matched pattern '$spec_regex')"
   fi
   # don't quit just yet
-  # exit 1 
+  # exit 1
 }
 
 # Assuming the spec file, write a list of test case names (first column) with
 # the instance ids (second column), where instance ids run from 1 to $1.
-# Third column is fast_counts (T/F).
-#_setup-test-instances() {
-#  local instances=$1
-#  local fast_counts=$2
-#
-#  while read line; do
-#    for i in $(seq 1 $instances); do
-#      read case_name _ <<< $line  # extract the first token
-#      echo $case_name $i $fast_counts
-#    done
-#  done
-#}
+_setup-test-instances() {
+  local instances=$1
+
+  while read line; do
+    for i in $(seq 1 $instances); do
+      read case_name _ <<< $line  # extract the first token
+      echo $case_name $i
+    done
+  done
+}
 
 # Args:
 #   regexp: A pattern selecting the subset of tests to run
@@ -298,7 +251,7 @@ _run-tests() {
   local processors=1
 
   if test $parallel = F; then
-    func=_run-one-instance   output to the console
+    func=_run-one-instance  # output to the console
   else
     func=_run-one-instance-logged
     processors=$(grep -c ^processor /proc/cpuinfo || echo 4)  # POSIX-specific
@@ -308,12 +261,8 @@ _run-tests() {
     log "Running $processors parallel processes"
   fi
 
-  echo "FUNC"
-  echo $func
-
   local cases_list=$ASSOCTEST_DIR/test-cases.txt
   tests/regtest_spec.py | grep -E $spec_regex > $cases_list
-  break
 
   # Generate parameters for all test cases.
   cat $cases_list \
@@ -322,15 +271,16 @@ _run-tests() {
 
   log "Done generating parameters for all test cases"
 
-  local instances_list=$REGTEST_DIR/test-instances.txt
-  _setup-test-instances $instances $fast_counts < $cases_list > $instances_list 
+  local instances_list=$ASSOCTEST_DIR/test-instances.txt
+  _setup-test-instances $instances $fast_counts < $cases_list > $instances_list
 
   cat $instances_list \
     | xargs -l -P $processors -- $0 $func || test-error
 
   log "Done running all test instances"
+  exit 1
 
-  make-summary $REGTEST_DIR
+  make-summary $ASSOCTEST_DIR
 }
 
 # Run tests sequentially
@@ -356,7 +306,7 @@ run-all() {
   local instances=${1:-1}
 
   log "Running all tests. Can take a while."
-  _run-tests '^r-' $instances T T
+  _run-tests '^a-' $instances T T
 }
 
 "$@"
diff --git a/tests/analyze_assoc.R b/tests/analyze_assoc.R
index 56b66ea0..50f98c33 100755
--- a/tests/analyze_assoc.R
+++ b/tests/analyze_assoc.R
@@ -1,22 +1,22 @@
 #!/usr/bin/env Rscript
 #
 # Copyright 2015 Google Inc. All rights reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# Reads map files, report files, and RAPPOR parameters to run 
+# Reads map files, report files, and RAPPOR parameters to run
 # an EM algorithm to estimate joint distribution over two or more variables
-# 
+#
 # Usage:
 #       $ ./analyze_assoc.R -map1 map_1.csv -map2 map_2.csv \
 #                                 -reports reports.csv \
@@ -25,7 +25,7 @@
 # Outputs:
 #         prints a table with estimated joint probability masses
 #         over candidate strings
-#         Ex. 
+#         Ex.
 #                 ssl   nossl
 #         intel   0.1   0.3
 #         google  0.5   0.1
@@ -43,23 +43,27 @@ if(!interactive()) {
                 help = "Hashed candidates for 2nd variable"),
     make_option(c("--reports", "-r"), default = "reports.csv",
                 help = "File with raw reports as <cohort, report1, report2>"),
+    make_option(c("--truefile", "-t"), default = "truedist.csv",
+                help = "File with true distribution generated by assoc_sim.R"),
+    make_option(c("--outdir", "-o"), default = ".",
+                help = "File where the metrics go"),
     make_option(c("--params", "-p"), default = "params.csv",
                 help = "Filename for RAPPOR parameters")
   )
   opts <- parse_args(OptionParser(option_list = option_list))
-}    
+}
 
-source("../analysis/R/encode.R")
-source("../analysis/R/decode.R")
-source("../analysis/R/simulation.R")
-source("../analysis/R/read_input.R")
-source("../analysis/R/association.R")
+source("analysis/R/encode.R")
+source("analysis/R/decode.R")
+source("analysis/R/simulation.R")
+source("analysis/R/read_input.R")
+source("analysis/R/association.R")
 
 # This function processes the maps loaded using ReadMapFile
 # Association analysis requires a map object with a map
 # field that has the map split into cohorts and an rmap field
 # that has all the cohorts combined
-# Arguments: 
+# Arguments:
 #       map = map object with cohorts as sparse matrix in
 #             object map$map
 #             This is the expected object from ReadMapFile
@@ -81,7 +85,7 @@ ProcessMap <- function(map, params) {
 
 main <- function(opts) {
   ptm <- proc.time()
-  
+
   params <- ReadParameterFile(opts$params)
   opts_map <- list(opts$map1, opts$map2)
   map <- lapply(opts_map, function(o)
@@ -89,10 +93,10 @@ main <- function(opts) {
                              params = params))
   # Reports must be of the format
   #     cohort no, rappor bitstring 1, rappor bitstring 2
-  reportsObj <- read.csv(opts$reports, 
+  reportsObj <- read.csv(opts$reports,
                          colClasses = c("integer", "character", "character"),
                          header = FALSE)
-  
+
   # Parsing reportsObj
   # ComputeDistributionEM allows for different sets of cohorts
   # for each variable. Here, both sets of cohorts are identical
@@ -100,44 +104,55 @@ main <- function(opts) {
   cohorts <- list(co, co)
   # Parse reports from reportObj cols 2 and 3
   reports <- lapply(1:2, function(x) as.list(reportsObj[x + 1]))
-  
+
   # Split strings into bit arrays (as required by assoc analysis)
   reports <- lapply(1:2, function(i) {
     # apply the following function to each of reports[[1]] and reports[[2]]
     lapply(reports[[i]][[1]], function(x) {
-      # function splits strings and converts them to numeric values  
+      # function splits strings and converts them to numeric values
       as.numeric(strsplit(x, split = "")[[1]])
     })
   })
-  
-  joint_dist <- ComputeDistributionEM(reports, cohorts, map, 
+
+  joint_dist <- ComputeDistributionEM(reports, cohorts, map,
                                       ignore_other = TRUE,
                                       params, marginals = NULL,
                                       estimate_var = FALSE)
-  
-  # Hardcoded place to lookup true distribution
-  # TODO(pseudorandom): Make this a flag
-  td <- read.csv(file = "truedist.csv")
+
+  td <- read.csv(file = opts$truefile)
   ed <- joint_dist$fit
-  
+  print("CHI-SQUARED")
+  td_chisq <- chisq.test(td)
+  ed_chisq <- chisq.test(ed)
+  print(td_chisq)
+  print(ed_chisq)
+
   # L1 distance = 1 - sum(min(td|x, ed|x)) where
   # td|x / ed|x projects the distribution to the intersection x of the
   # supports of td and ed
   rowsi <- intersect(rownames(td), rownames(ed))
   colsi <- intersect(colnames(td), colnames(ed))
   print("L1 DISTANCE")
-  print(1 - sum(mapply(min,
+  l1d <- 1 - sum(mapply(min,
                   unlist(td[rowsi, colsi], use.names = FALSE),
                   unlist(as.data.frame(ed)[rowsi, colsi], use.names = FALSE)
-                   )))
-  
-  # TODO(pseudorandom): Export the results to a file for further analysis
+                   ))
+  print(l1d)
+
   print("JOINT_DIST$FIT")
   print(signif(ed[order(rowSums(ed)),], 4))
   print("PROC.TIME")
-  print(proc.time() - ptm)
+  time_taken <- proc.time() - ptm
+  print(time_taken)
+  
+  # Write metrics to metrics.csv
+  metrics <- list(td_chisq = td_chisq[1][[1]][[1]],
+                  ed_chisq = ed_chisq[1][[1]][[1]],
+                 l1d = l1d, time = time_taken[1])
+  filename <- file.path(opts$outdir, 'metrics.csv')
+  write.csv(metrics, file = filename, row.names = FALSE)
 }
 
 if(!interactive()) {
   main(opts)
-}
\ No newline at end of file
+}
diff --git a/tests/assoc_sim.R b/tests/assoc_sim.R
index 6ac2b857..61ee822f 100755
--- a/tests/assoc_sim.R
+++ b/tests/assoc_sim.R
@@ -32,7 +32,7 @@ options(stringsAsFactors = FALSE)
 
 if(!interactive()) {
   option_list <- list(
-    make_option(c("--uvals", "-v"), default = "uvals.csv",
+    make_option(c("--uvals", "-v"),
                 help = "Filename for list of values over which
                 distributions are simulated. The file is a list of
                 comma-separated strings each line of which refers
@@ -47,6 +47,10 @@ if(!interactive()) {
                 help = "Filename *prefix* for map(s)"),
     make_option(c("--num", "-n"), default = 1e05,
                 help = "Number of reports"),
+    make_option(c("--var1_num", "-z"), default = 25,
+                help = "Number of values for var1"),
+    make_option(c("--var2_num", "-y"), default = 5,
+                help = "Number of values for var2"),
     make_option(c("--extras", "-e"), default = TRUE,
                 help = "Does 1st map have spurious candidates?"),
     make_option(c("--distr", "-d"), default = "zipfg",
@@ -56,12 +60,12 @@ if(!interactive()) {
   opts <- parse_args(OptionParser(option_list = option_list))
 }    
 
-source("../analysis/R/encode.R")
-source("../analysis/R/decode.R")
-source("../analysis/R/simulation.R")
-source("../analysis/R/read_input.R")
-source("../analysis/R/association.R")
-source("../tests/gen_counts.R")
+source("analysis/R/encode.R")
+source("analysis/R/decode.R")
+source("analysis/R/simulation.R")
+source("analysis/R/read_input.R")
+source("analysis/R/association.R")
+source("tests/gen_counts.R")
 
 # Read unique values of reports from a csv file
 # Inputs: filename. The file is expected to contain two rows of strings
@@ -92,9 +96,14 @@ GetUniqueValsFromFile <- function(filename) {
 #         distr = the type of distribution to use
 #                 {unif, poisson, poisson2, zipfg}
 #         extras = whether map_1.csv has spurious candidates or not
+#         truefile = name of the file with true distribution
+#         var1_num = number of var1 candidates
+#         var2_num = number of var2 candidates
+#         *** CURRENTLY ONLY USEFUL IF DISTR = ZIPFG ***
 #         mapfile = file to write maps into (with .csv suffixes)
 #         reportsfile = file to write reports into (with .csv suffix)
 SimulateReports <- function(N, uvals, params, distr, extras, truefile,
+                            var1_num, var2_num,
                             mapfile, reportsfile) {
   # Compute true distribution
   m <- params$m  
@@ -127,21 +136,22 @@ SimulateReports <- function(N, uvals, params, distr, extras, truefile,
     v2_samples[v1_samples %% 2 == 1] <- pr75[v1_samples %% 2 == 1]
   } else if (distr == "zipfg") {
 
-    # Zipfian over 25 strings
-    partition <- RandomPartition(N, ComputePdf("zipf1.5", 25))
-    v1_samples <- rep(1:25, partition)  # expand partition
+    # Zipfian over var1_num strings
+    partition <- RandomPartition(N, ComputePdf("zipf1.5", var1_num))
+    v1_samples <- rep(1:var1_num, partition)  # expand partition
     # Shuffle values randomly (may take a few sec for > 10^8 inputs)
     v1_samples <- sample(v1_samples)
 
-    # supp(var2) = {1, 2, 3, 4, 6}
+    # supp(var2) = {1, 2, 3, ..., var2_num}
     # We look at two zipfian distributions over supp(var2)
     # D1 = zipfian distribution
-    # D2 = zipfian distr over {6, 5, 4, 3, 2, 1}
+    # D2 = zipfian distr over {var2_num, ..., 4, 3, 2, 1}
     # (i.e., D1 in reverse)
     # var2 ~ D1 if var1 = even
     # var2 ~ D2 if var1 = odd
-    d1 <- sample(rep(1:6, RandomPartition(N, ComputePdf("zipf1.5", 6))))
-    d2 <- c(6, 5, 4, 3, 2, 1)[d1]
+    d1 <- sample(rep(1:var2_num, 
+                     RandomPartition(N, ComputePdf("zipf1.5", var2_num))))
+    d2 <- (var2_num:1)[d1]
     v2_samples <- rep(1, N)
     v2_samples[v1_samples %% 2 == 0] <- d1[v1_samples %% 2 == 0]
     v2_samples[v1_samples %% 2 == 1] <- d2[v1_samples %% 2 == 1] 
@@ -212,10 +222,15 @@ SimulateReports <- function(N, uvals, params, distr, extras, truefile,
 main <- function(opts) {
   ptm <- proc.time()
   
-  uvals <- GetUniqueValsFromFile(opts$uvals)
+  if(is.null(opts$uvals)) {
+    uvals = list(var1 = c("str1"), var2 = c("option1"))
+  } else {
+    uvals <- GetUniqueValsFromFile(opts$uvals)
+  }
   params <- ReadParameterFile(opts$params)
   SimulateReports(opts$num, uvals, params,  opts$distr, # inuts
                   opts$extras,  opts$true,              # inputs
+                  opts$var1_num,  opts$var2_num,        # inputs
                   opts$map, opts$reports)               # outputs
   
   print("PROC.TIME")
diff --git a/tests/regtest_spec.py b/tests/regtest_spec.py
index 6774e400..0c5798f0 100755
--- a/tests/regtest_spec.py
+++ b/tests/regtest_spec.py
@@ -41,6 +41,17 @@
     ('large', 10000, 100000000, 1),
 )
 
+DISTRIBUTION_PARAMS_ASSOC = (
+    # name, num unique values 1,
+    # num unique values 2, num clients, values per client
+    ('tiny', 100, 2, int(1e03), 1),  # test for insufficient data
+    ('small', 100, 10, int(1e04), 1),
+    ('medium', 1000, 10, int(1e05), 1),
+    ('large', 1000, 10, int(1e06), 1),
+    ('mediumsquared', 1000, 100, int(1e05), 1),
+    ('largesquared', int(1e04), 100, int(1e06), 1),
+)
+
 # 'k, h, m' as in params file.
 BLOOMFILTER_PARAMS = {
     '8x16': (8, 2, 16),  # 16 cohorts, 8 bits each, 2 bits set in each
@@ -102,6 +113,22 @@ def main(argv):
   for params in DEMO:
     rows.append(params)
 
+  # Association tests: one 'a-' row per (distribution, bloom, privacy) combo.
+  for (distr_params, num_values1, num_values2, num_clients,
+       num_reports_per_client) in DISTRIBUTION_PARAMS_ASSOC:
+    for bloom_params in BLOOMFILTER_PARAMS:
+      for privacy_params in PRIVACY_PARAMS:
+        test_name = 'a-{}-{}-{}'.format(distr_params, bloom_params,
+                                        privacy_params)
+        # Look up the bloom filter parameters by this loop's own key.
+        params = (BLOOMFILTER_PARAMS[bloom_params]
+                  + PRIVACY_PARAMS[privacy_params])
+        test_case = (test_name, distr_params, num_values1, num_values2,
+                     num_clients) + params
+        row_str = [str(element) for element in test_case]
+        rows.append(row_str)
+  # End of association tests
+
   for row in rows:
     print ' '.join(row)
 
diff --git a/tests/uvals.csv b/tests/uvals.csv
index 986e994f..18600571 100644
--- a/tests/uvals.csv
+++ b/tests/uvals.csv
@@ -1,2 +1,2 @@
 str1
-option1,option2,option3,option4,option5,option6
+option1

From 17d3f1fe2483ebd283844d08a5a9ecb445503c87 Mon Sep 17 00:00:00 2001
From: Ananth Raghunathan <pseudorandom@google.com>
Date: Wed, 6 May 2015 18:34:23 -0700
Subject: [PATCH 04/67] Modifying files involved with generating summary.

---
 assoctest.sh                |  34 ++--
 tests/assoctest.html        |  98 +++++++++++
 tests/make_summary_assoc.py | 343 ++++++++++++++++++++++++++++++++++++
 3 files changed, 458 insertions(+), 17 deletions(-)
 create mode 100644 tests/assoctest.html
 create mode 100755 tests/make_summary_assoc.py

diff --git a/assoctest.sh b/assoctest.sh
index 7b806ef8..5bbabc30 100755
--- a/assoctest.sh
+++ b/assoctest.sh
@@ -184,23 +184,23 @@ _run-one-instance-logged() {
     || log "Test case $test_case_id (instance $test_case_run) failed"
 }
 
-#make-summary() {
-#  local dir=$1
-#  local filename=${2:-results.html}
-#
-#  tests/make_summary.py $dir > $dir/rows.html
-#
-#  pushd $dir >/dev/null
-#
-#  cat ../../tests/regtest.html \
-#    | sed -e '/TABLE_ROWS/ r rows.html' \
-#    > $filename
-#
-#  popd >/dev/null
-#
-#  log "Wrote $dir/$filename"
-#  log "URL: file://$PWD/$dir/$filename"
-#}
+make-summary() {
+  local dir=$1
+  local filename=${2:-results.html}
+
+  tests/make_summary_assoc.py $dir > $dir/rows.html
+
+  pushd $dir >/dev/null
+
+  cat ../../tests/assoctest.html \
+    | sed -e '/TABLE_ROWS/ r rows.html' \
+    > $filename
+
+  popd >/dev/null
+
+  log "Wrote $dir/$filename"
+  log "URL: file://$PWD/$dir/$filename"
+}
 
 test-error() {
   local spec_regex=${1:-}
diff --git a/tests/assoctest.html b/tests/assoctest.html
new file mode 100644
index 00000000..91ee25be
--- /dev/null
+++ b/tests/assoctest.html
@@ -0,0 +1,98 @@
+<!DOCTYPE html>
+<html>
+<head>
+  <title>RAPPOR assoctest.sh</title>
+  <style type="text/css">
+    h2 { text-align: center }
+    p { margin: 0 auto; width: 80%; text-align: center }
+
+    table { width: 100%; border-spacing: 0 }
+    .tophead { text-align: center; font-weight: bold }
+    .explain { text-align: left; font-weight: normal }
+    .subhead { text-align: right; font-weight: bold }
+    .highlight { background-color: #eeeeee }
+    tbody td { text-align: right }
+  </style>
+</head>
+
+<body>
+  <a id="top"></a>
+
+  <h2>RAPPOR regtest.sh</h2>
+
+  <!-- These strings will be replaced by a sed script. -->
+
+  <table cellspacing="0" cellpadding="5">
+    <colgroup>
+      <col span="1" class="highlight" />
+      <col span="4" />
+      <col span="6" class="highlight" />
+      <col span="4" />
+    </colgroup>
+
+    <thead>
+      <tr class="tophead">
+        <td>
+          Test Case
+        </td>
+        <td colspan=4>
+          Input Params
+        </td>
+        <td colspan=6>
+          RAPPOR Params
+        </td>
+        <td colspan=4>
+          Result Metrics
+        </td>
+      </tr>
+
+      <tr class="explain">
+        <td></td>
+        <td colspan=4>
+          d: distribution drawn from<br/>
+          u: total unique values<br/>
+          c: clients<br/>
+          v: values per client<br/>
+        </td>
+        <td colspan=6>
+          k: report bits<br/>
+          h: hashes<br/>
+          m: cohorts<br/>
+          p, q, f: probabilities<br/>
+        </td>
+        <td colspan=4>
+          td_chisq: chisq test on true distr.<br/>
+          ed_chisq: chisq test on est. distr.<br/>
+          l1d: l1 distance<br/>
+          rtime: R runtime<br/>
+        </td>
+      </tr>
+
+      <tr class="subhead">
+        <td></td>
+
+        <td>d</td>
+        <td>u</td>
+        <td>c</td>
+        <td>v</td>
+
+        <td>k</td>
+        <td>h</td>
+        <td>m</td>
+        <td>p</td>
+        <td>q</td>
+        <td>f</td>
+
+        <td>td_chisq</td>
+        <td>ed_chisq</td>
+        <td>l1d</td>
+        <td>rtime</td>
+      </tr>
+    </thead>
+
+    <tbody>
+  <!-- TABLE_ROWS -->
+
+</body>
+
+</html>
diff --git a/tests/make_summary_assoc.py b/tests/make_summary_assoc.py
new file mode 100755
index 00000000..ac3fb160
--- /dev/null
+++ b/tests/make_summary_assoc.py
@@ -0,0 +1,343 @@
+#!/usr/bin/python
+"""Given an assoctest result tree, prints an HTML summary on stdout.
+
+See HTML skeleton in tests/assoctest.html.
+"""
+
+import os
+import re
+import sys
+
+
+SUMMARY_ROW = """\
+<tfoot style="font-weight: bold; text-align: right">
+<tr>
+  <td>
+    %(name)s
+  </td>
+
+  <!-- input params -->
+  <td></td>
+  <td></td>
+  <td></td>
+  <td></td>
+
+  <!-- RAPPOR params -->
+  <td></td>
+  <td></td>
+  <td></td>
+  <td></td>
+  <td></td>
+  <td></td>
+
+  <!-- MAP params -->
+  <td></td>
+  <td></td>
+
+  <!-- Result metrics -->
+  <td></td>
+  <td></td>
+  <td>%(mean_l1d)s</td>
+  <td>%(mean_rtime)s</td>
+  <td>%(mean_chisqdiff)s</td>
+</tr>
+</tfoot>
+"""
+
+# Navigation and links to plot.
+DETAILS = """\
+<p style="text-align: right">
+  <a href="#top">Up</a>
+</p>
+
+<a id="%(anchor)s"></a>
+
+<p style="text-align: center">
+  <img src="%(instance_dir)s/dist.png"/>
+</p>
+
+<p>
+<a href="%(instance_dir)s">%(name)s files</a>
+</p>
+"""
+
+
+def FormatFloat(x, percent):
+  """Formats a floating-point number."""
+  if percent:
+    return '{:.1f}%'.format(x * 100.0)
+  else:
+    return '{:.3f}'.format(x)
+
+
+def FormatMeanWithSem(m_std_error, percent=False):
+  """Formats an estimate with standard error."""
+  if m_std_error is None:
+    return ''
+  m, std_error = m_std_error
+  if std_error is None:
+    return FormatFloat(m, percent)
+  else:
+    return '{}&plusmn;{}'.format(
+        FormatFloat(m, percent),
+        FormatFloat(std_error, percent))
+
+
+def Mean(l):
+  """Computes the mean (average) for a list of numbers."""
+  if l:
+    return float(sum(l)) / len(l)
+  else:
+    return None
+
+
+def SampleVar(l):
+  """Computes the sample variance for a list of numbers."""
+  if len(l) > 1:
+    mean = Mean(l)
+    var = sum([(x - mean) ** 2 for x in l]) / (len(l) - 1)
+    return var
+  else:
+    return None
+
+
+def StandardErrorEstimate(l):
+  """Returns the standard error estimate for a list of numbers.
+
+  For a singleton the standard error is assumed to be 10% of its value.
+  """
+  if len(l) > 1:
+    return (SampleVar(l) / len(l)) ** .5
+  elif l:
+    return l[0] / 10.0
+  else:
+    return None
+
+
+def MeanOfMeans(dict_of_lists):
+  """Returns the average of averages with the standard error of the estimate.
+  """
+  means = [Mean(dict_of_lists[key]) for key in dict_of_lists
+           if dict_of_lists[key]]
+  if means:
+    # Compute variances of the estimate for each sublist.
+    se = [StandardErrorEstimate(dict_of_lists[key]) ** 2 for key
+          in dict_of_lists if dict_of_lists[key]]
+    return (Mean(means),  # Mean over all sublists
+            sum(se) ** .5 / len(se))  # Standard deviation of the mean
+  else:
+    return None
+
+
+def ParseSpecFile(spec_filename):
+  """Parses the spec (parameters) file.
+
+  Returns:
+    A string: every field of the first spec line except the first one,
+    each wrapped in <td>...</td> and joined with spaces.
+  """
+  with open(spec_filename) as s:
+    spec_row = s.readline().split()
+
+  spec_in_html = ' '.join('<td>%s</td>' % cell for cell in spec_row[1:])
+
+  return spec_in_html
+
+
+def ExtractTime(log_filename):
+  """Extracts the elapsed time information from the log file.
+
+  Returns:
+     Elapsed time (in seconds) or None in case of failure.
+  """
+  if os.path.isfile(log_filename):
+    with open(log_filename) as log:
+      log_str = log.read()
+    # Matching a line output by analyze.R.
+    match = re.search(r'Inference took ([0-9.]+) seconds', log_str)
+    if match:
+      return float(match.group(1))
+  return None
+
+
+def ParseMetrics(metrics_file, log_file):
+  """Processes the metrics file.
+
+  Args:
+    metrics_file: Path to metrics.csv (a header line, then one row of values).
+    log_file: Path to the log file that is scanned for the elapsed time.
+
+  Returns a pair:
+    - A dictionary mapping each metric name to a one-element list.
+    - An HTML-formatted portion of the report row.
+  """
+  with open(metrics_file) as m:
+    m.readline()
+    metrics_row = m.readline().split(',')
+
+  (td_chisq, ed_chisq, l1d, rtime) = metrics_row
+
+  td_chisq = float(td_chisq)
+  ed_chisq = float(ed_chisq)
+
+  l1d = float(l1d)
+  rtime = float(rtime)
+
+  elapsed_time = ExtractTime(log_file)
+
+  metrics_row_str = [
+      str(td_chisq),
+      str(ed_chisq),
+      str(l1d),
+      str(rtime),
+  ]
+
+  metrics_row_dict = {
+      'l1d': [l1d],
+      'rtime': [rtime],
+      'chisqdiff': [abs(td_chisq - ed_chisq)],
+  }
+
+  # return metrics formatted as HTML table entries
+  return (metrics_row_dict,
+          ' '.join('<td>%s</td>' % cell for cell in metrics_row_str))
+
+
+def FormatCell1(test_case, test_instance, metrics_file, log_file, plot_file,
+                link_to_plots):
+  """Outputs an HTML table entry for the first cell of the row.
+
+  The row is filled if the metrics file exist. The first cell contains a link
+  that for short tables points to a plot file inline, for large tables to an
+  external file.
+
+  If the metrics file is missing, the link points to the log file (if one
+  exists)
+  """
+  relpath_report = '{}/{}_report'.format(test_case, test_instance)
+  if os.path.isfile(metrics_file):
+    external_file = plot_file
+    if link_to_plots:
+      link = '#{}_{}'.format(test_case, test_instance)  # anchor
+    else:
+      link = os.path.join(relpath_report, 'dist.png')
+  else:  # no results likely due to an error, puts a link to the log file
+    external_file = log_file
+    link = os.path.join(relpath_report, 'log.txt')
+
+  if os.path.isfile(external_file):
+    return '<td><a href="{}">{}</a></td>'.format(link, test_case)
+  else:  # if no file to link to
+    return '<td>{}</td>'.format(test_case)
+
+
+def FormatSummaryRow(metrics_lists):
+  """Outputs an HTML-formatted summary row."""
+  means_with_sem = {}  # SEM - standard error of the mean
+
+  for key in metrics_lists:
+    means_with_sem[key] = MeanOfMeans(metrics_lists[key])
+    # If none of the lists is longer than one element, drop the SEM component.
+    if means_with_sem[key] and max([len(l) for l in metrics_lists[key]]) < 2:
+      means_with_sem[key] = [means_with_sem[key][0], None]
+
+  summary = {
+      'name': 'Means',
+      'mean_l1d': FormatMeanWithSem(means_with_sem['l1d'], percent=True),
+      'mean_chisqdiff': FormatMeanWithSem(means_with_sem['chisqdiff'], percent=True),
+      'mean_rtime': FormatMeanWithSem(means_with_sem['rtime']),
+  }
+  return SUMMARY_ROW % summary
+
+
+def FormatPlots(base_dir, test_instances):
+  """Returns DETAILS HTML for every instance that has a dist.png plot."""
+  result = ''
+  for instance in test_instances:
+    # First two fields are the test name and run; main() reads exactly two.
+    test_case, test_instance = instance.split(' ')[:2]
+    instance_dir = test_case + '/' + test_instance + '_report'
+    if os.path.isfile(os.path.join(base_dir, instance_dir, 'dist.png')):
+      result += DETAILS % {'anchor': test_case + '_' + test_instance,
+                           'name': '{} (instance {})'.format(test_case,
+                                                             test_instance),
+                           'instance_dir': instance_dir}
+  return result
+
+
+def main(argv):
+  base_dir = argv[1]
+
+  # This file has the test case names, in the order that they should be
+  # displayed.
+  path = os.path.join(base_dir, 'test-instances.txt')
+  with open(path) as f:
+    test_instances = [line.strip() for line in f]
+
+  # Metrics are assembled into a dictionary of dictionaries. The top-level
+  # key is the metric name ('tv', 'fpr', etc.), the second level key is
+  # the test case. These keys reference a list of floats, which can be empty.
+  metrics = {
+      'l1d': {},  # l1 distance
+      'rtime': {},  # R run time
+      'chisqdiff': {}, # abs diff in values for the chisq test between true
+                       # distr and estimated distr.
+  }
+
+  # If there are too many tests, the plots are not included in the results
+  # file. Instead, rows' names are links to the corresponding .png files.
+  include_plots = len(test_instances) < 20
+
+  for instance in test_instances:
+    # A test instance is idenfied by the test name and the test run.
+    test_case, test_instance = instance.split(' ')
+
+    spec_file = os.path.join(base_dir, test_case, 'spec.txt')
+    if not os.path.isfile(spec_file):
+      raise RuntimeError('{} is missing'.format(spec_file))
+
+    spec_html = ParseSpecFile(spec_file)
+    metrics_html = ''  # will be filled in later on, if metrics exist
+
+    report_dir = os.path.join(base_dir, test_case, test_instance + '_report')
+
+    metrics_file = os.path.join(report_dir, 'metrics.csv')
+    log_file = os.path.join(report_dir, 'log.txt')
+    plot_file = os.path.join(report_dir, 'dist.png')
+
+    cell1_html = FormatCell1(test_case, test_instance, metrics_file, log_file,
+                             plot_file, include_plots)
+
+    if os.path.isfile(metrics_file):
+      # ParseMetrics outputs an HTML table row and also updates lists
+      metrics_dict, metrics_html = ParseMetrics(metrics_file, log_file)
+
+      # Update the metrics structure. Initialize dictionaries if necessary.
+      for m in metrics:
+        if not test_case in metrics[m]:
+          metrics[m][test_case] = metrics_dict[m]
+        else:
+          metrics[m][test_case] += metrics_dict[m]
+
+    print '<tr>{}{}{}</tr>'.format(cell1_html, spec_html, metrics_html)
+
+  print FormatSummaryRow(metrics)
+
+  print '</tbody>'
+  print '</table>'
+  print '<p style="padding-bottom: 3em"></p>'  # vertical space
+
+  # Plot links.
+  if include_plots:
+    print FormatPlots(base_dir, test_instances)
+  else:
+    print ('<p>Too many tests to include plots. '
+           'Click links within rows for details.</p>')
+
+
+if __name__ == '__main__':
+  try:
+    main(sys.argv)
+  except RuntimeError, e:
+    print >>sys.stderr, 'FATAL: %s' % e
+    sys.exit(1)

From 9ab52bb68259df840cdcaa682921d238e719dcde Mon Sep 17 00:00:00 2001
From: Ananth Raghunathan <pseudorandom@google.com>
Date: Thu, 7 May 2015 09:50:40 -0700
Subject: [PATCH 05/67] Cleaning up assoctest.sh

---
 assoctest.sh          | 83 +++++++++----------------------------------
 tests/regtest_spec.py |  6 ++--
 2 files changed, 20 insertions(+), 69 deletions(-)

diff --git a/assoctest.sh b/assoctest.sh
index 5bbabc30..80d9a067 100755
--- a/assoctest.sh
+++ b/assoctest.sh
@@ -1,28 +1,26 @@
 #!/bin/bash
 #
-# Run end-to-end tests in parallel.
+# Run an end-to-end association test in parallel.
 #
 # Usage:
-#   ./regtest.sh <function name>
+#   ./assoctest.sh <function name>
 
 # At the end, it will print an HTML summary.
 #
 # Three main functions are
-#    run [[<pattern> [<num> [<fast>]] - run tests matching <pattern> in
-#                                       parallel, each <num> times. The fast
-#                                       mode (T/F) shortcuts generation of
-#                                       reports.
-#    run-seq [<pattern> [<num> [<fast>]] - ditto, except that tests are run
-#                                       sequentially
-#    run-all [<num>]              - run all tests, in parallel, each <num> times
+#    run [[<pattern> [<num>]] - run tests matching <pattern> in
+#                               parallel, each <num> times.
+#
+#    ## run-seq currently not supported!
+#    run-seq [<pattern> [<num>]] - ditto, except that tests are run sequentially
+#    ## --
+#
+#    run-all [<num>]             - run all tests, in parallel, each <num> times
 #
 # Examples:
-# $ ./regtest.sh run-seq unif-small-typical  # Sequential run, matches 1 case
-# $ ./regtest.sh run-seq unif-small- 3 F  # Sequential, each test is run three
-#                                           times, using slow generation
-# $ ./regtest.sh run unif-  # Parallel run, matches multiple cases
-# $ ./regtest.sh run unif- 5 # Parallel run, matches multiple cases, each test
-#                              is run 5 times
+# $ ./assoctest.sh run-seq tiny-8x16-  # Sequential run, matches 2 cases
+# $ ./assoctest.sh run-seq tiny-8x16- 3  # Sequential, each test is run three
+#                                           times
 # $ ./regtest.sh run-all     # Run all tests once
 #
 # The <pattern> argument is a regex in 'grep -E' format. (Detail: Don't
@@ -31,10 +29,6 @@
 # than the number of CPUs on the machine.
 
 
-# Future speedups:
-# - Reuse the same input -- come up with naming scheme based on params
-# - Reuse the same maps -- ditto, rappor library can cache it
-
 set -o nounset
 set -o pipefail
 set -o errexit
@@ -44,58 +38,17 @@ set -o errexit
 readonly THIS_DIR=$(dirname $0)
 readonly REPO_ROOT=$THIS_DIR
 readonly CLIENT_DIR=$REPO_ROOT/client/python
-readonly REGTEST_DIR=_tmp/regtest
 readonly ASSOCTEST_DIR=_tmp/assoctest
 
 # All the Python tools need this
 export PYTHONPATH=$CLIENT_DIR
 
-#print-true-inputs() {
-#  local num_unique_values=$1
-#  seq 1 $num_unique_values | awk '{print "v" $1}'
-#}
-
-# Add some more candidates here.  We hope these are estimated at 0.
-# e.g. if add_start=51, and num_additional is 20, show v51-v70
-#more-candidates() {
-#  local last_true=$1
-#  local num_additional=$2
-#
-#  local begin
-#  local end
-#  begin=$(expr $last_true + 1)
-#  end=$(expr $last_true + $num_additional)
-#
-#  seq $begin $end | awk '{print "v" $1}'
-#}
-
-# Args:
-#   true_inputs: File of true inputs
-#   last_true: last true input, e.g. 50 if we generated "v1" .. "v50".
-#   num_additional: additional candidates to generate (starting at 'last_true')
-#   to_remove: Regex of true values to omit from the candidates list, or the
-#     string 'NONE' if none should be.  (Our values look like 'v1', 'v2', etc. so
-#     there isn't any ambiguity.)
-#print-candidates() {
-#  local true_inputs=$1
-#  local last_true=$2
-#  local num_additional=$3
-#  local to_remove=$4
-#
-#  if test $to_remove = NONE; then
-#    cat $true_inputs  # include all true inputs
-#  else
-#    egrep -v $to_remove $true_inputs  # remove some true inputs
-#  fi
-#  more-candidates $last_true $num_additional
-#}
-
 # Generate a single test case, specified by a line of the test spec.
 # This is a helper function for _run_tests().
 _setup-one-case() {
   local test_case=$1
 
-  # input params
+  # Input parameters
   local dist=$2
   local num_unique_values=$3
   local num_unique_values2=$4
@@ -164,9 +117,6 @@ _run-one-instance() {
       --truefile $instance_dir/truedist.csv \
       --outdir $out_dir \
       --params $case_dir/case_params.csv
-    # Input prefix, output dir
-#    tests/analyze.R -t "Test case: $test_case (instance $test_instance)" \
-#                       "$case_dir/case" "$instance_dir/case" $out_dir
   }
 }
 
@@ -297,8 +247,8 @@ _run-tests() {
 #  local spec_regex=${1:-'^r-'}  # grep -E format on the spec
 #  local instances=${2:-1}
 #  local fast_counts=${3:-T}
-#  
-#  _run-tests $spec_regex $instances T $fast_counts 
+#
+#  _run-tests $spec_regex $instances T $fast_counts
 #}
 
 # Run tests in parallel
@@ -306,6 +256,7 @@ run-all() {
   local instances=${1:-1}
 
   log "Running all tests. Can take a while."
+  # a- for assoc tests
   _run-tests '^a-' $instances T T
 }
 
diff --git a/tests/regtest_spec.py b/tests/regtest_spec.py
index 0c5798f0..3961b39a 100755
--- a/tests/regtest_spec.py
+++ b/tests/regtest_spec.py
@@ -47,9 +47,9 @@
     ('tiny', 100, 2, int(1e03), 1),  # test for insufficient data
     ('small', 100, 10, int(1e04), 1),
     ('medium', 1000, 10, int(1e05), 1),
-    ('large', 1000, 10, int(1e06), 1),
-    ('mediumsquared', 1000, 100, int(1e05), 1),
-    ('largesquared', int(1e04), 100, int(1e06), 1),
+#    ('large', 1000, 10, int(1e06), 1),
+#    ('mediumsquared', 1000, 100, int(1e05), 1),
+#    ('largesquared', int(1e04), 100, int(1e06), 1),
 )
 
 # 'k, h, m' as in params file.

From 69fe145effe7666e1f53b5a80ea367f8ba6f3242 Mon Sep 17 00:00:00 2001
From: Ananth Raghunathan <pseudorandom@google.com>
Date: Thu, 7 May 2015 11:25:30 -0700
Subject: [PATCH 06/67] Cleaning up code and summary HTML.

---
 assoctest.sh                |  1 -
 tests/assoctest.html        |  2 +-
 tests/make_summary_assoc.py | 13 ++++---------
 3 files changed, 5 insertions(+), 11 deletions(-)

diff --git a/assoctest.sh b/assoctest.sh
index 80d9a067..e37a4f8c 100755
--- a/assoctest.sh
+++ b/assoctest.sh
@@ -228,7 +228,6 @@ _run-tests() {
     | xargs -l -P $processors -- $0 $func || test-error
 
   log "Done running all test instances"
-  exit 1
 
   make-summary $ASSOCTEST_DIR
 }
diff --git a/tests/assoctest.html b/tests/assoctest.html
index 91ee25be..80ef6515 100644
--- a/tests/assoctest.html
+++ b/tests/assoctest.html
@@ -18,7 +18,7 @@
 <body>
   <a id="top"></a>
 
-  <h2>RAPPOR regtest.sh</h2>
+  <h2>RAPPOR assoctest.sh</h2>
 
   <!-- These strings will be replaced by a sed script. -->
 
diff --git a/tests/make_summary_assoc.py b/tests/make_summary_assoc.py
index ac3fb160..0558893d 100755
--- a/tests/make_summary_assoc.py
+++ b/tests/make_summary_assoc.py
@@ -30,16 +30,11 @@
   <td></td>
   <td></td>
 
-  <!-- MAP params -->
-  <td></td>
-  <td></td>
-
   <!-- Result metrics -->
   <td></td>
-  <td></td>
+  <td>%(mean_chisqdiff)s</td>
   <td>%(mean_l1d)s</td>
   <td>%(mean_rtime)s</td>
-  <td>%(mean_chisqdiff)s</td>
 </tr>
 </tfoot>
 """
@@ -243,8 +238,8 @@ def FormatSummaryRow(metrics_lists):
 
   summary = {
       'name': 'Means',
-      'mean_l1d': FormatMeanWithSem(means_with_sem['l1d'], percent=True),
-      'mean_chisqdiff': FormatMeanWithSem(means_with_sem['chisqdiff'], percent=True),
+      'mean_l1d': FormatMeanWithSem(means_with_sem['l1d'], percent=False),
+      'mean_chisqdiff': FormatMeanWithSem(means_with_sem['chisqdiff'], percent=False),
       'mean_rtime': FormatMeanWithSem(means_with_sem['rtime']),
   }
   return SUMMARY_ROW % summary
@@ -279,9 +274,9 @@ def main(argv):
   # the test case. These keys reference a list of floats, which can be empty.
   metrics = {
       'l1d': {},  # l1 distance
-      'rtime': {},  # R run time
       'chisqdiff': {}, # abs diff in values for the chisq test between true
                        # distr and estimated distr.
+      'rtime': {},  # R run time
   }
 
   # If there are too many tests, the plots are not included in the results

From deff9a20bdf2230beb25868e149916817392fd4f Mon Sep 17 00:00:00 2001
From: Ananth Raghunathan <pseudorandom@google.com>
Date: Tue, 12 May 2015 22:19:12 +0000
Subject: [PATCH 07/67] Some small changes to help with test rig.

---
 tests/make_summary_assoc.py |  1 +
 tests/regtest_spec.py       | 61 ++++++++++++++++++++-----------------
 2 files changed, 34 insertions(+), 28 deletions(-)

diff --git a/tests/make_summary_assoc.py b/tests/make_summary_assoc.py
index 0558893d..dc16d3f1 100755
--- a/tests/make_summary_assoc.py
+++ b/tests/make_summary_assoc.py
@@ -282,6 +282,7 @@ def main(argv):
   # If there are too many tests, the plots are not included in the results
   # file. Instead, rows' names are links to the corresponding .png files.
   include_plots = len(test_instances) < 20
+  include_plots = False
 
   for instance in test_instances:
     # A test instance is idenfied by the test name and the test run.
diff --git a/tests/regtest_spec.py b/tests/regtest_spec.py
index 3961b39a..3f192fe7 100755
--- a/tests/regtest_spec.py
+++ b/tests/regtest_spec.py
@@ -44,11 +44,12 @@
 DISTRIBUTION_PARAMS_ASSOC = (
     # name, num unique values 1,
     # num unique values 2, num clients, values per client
-    ('tiny', 100, 2, int(1e03), 1),  # test for insufficient data
-    ('small', 100, 10, int(1e04), 1),
+#    ('tiny', 100, 2, int(1e03), 1),  # test for insufficient data
+#    ('small', 100, 10, int(1e04), 1),
     ('medium', 1000, 10, int(1e05), 1),
-#    ('large', 1000, 10, int(1e06), 1),
-#    ('mediumsquared', 1000, 100, int(1e05), 1),
+    ('medium2', 1000, 2, int(1e05), 1),
+#    ('large', 10000, 10, int(1e06), 1),
+#    ('large2', 10000, 2, int(1e06), 1),
 #    ('largesquared', int(1e04), 100, int(1e06), 1),
 )
 
@@ -56,14 +57,17 @@
 BLOOMFILTER_PARAMS = {
     '8x16': (8, 2, 16),  # 16 cohorts, 8 bits each, 2 bits set in each
     '8x32': (8, 2, 32),  # 32 cohorts, 8 bits each, 2 bits set in each
+    '16x32': (16, 2, 32),  # 32 cohorts, 16 bits each, 2 bits set in each
     '8x128': (8, 2, 128),  # 128 cohorts, 8 bits each, 2 bits set in each
-    '128x128': (128, 2, 128),  # 8 cohorts, 128 bits each, 2 bits set in each
+#    '128x128': (128, 2, 128),  # 8 cohorts, 128 bits each, 2 bits set in each
 }
 
 # 'p, q, f' as in params file.
 PRIVACY_PARAMS = {
-    'eps_1_1': (0.39, 0.61, 0.45),  # eps_1 = 1, eps_inf = 5:
-    'eps_1_5': (0.225, 0.775, 0.0),  # eps_1 = 5, no eps_inf
+#    'eps_1_1': (0.39, 0.61, 0.45),  # eps_1 = 1, eps_inf = 5:
+#    'eps_1_5': (0.225, 0.775, 0.0),  # eps_1 = 5, no eps_inf
+    'eps_verysmall': (0.125, 0.875, 0.125),
+    'eps_small': (0.125, 0.875, 0.5),
 }
 
 # For deriving candidates from true inputs.
@@ -92,26 +96,27 @@ def main(argv):
   rows = []
 
   test_case = []
-  for (distr_params, num_values, num_clients,
-       num_reports_per_client) in DISTRIBUTION_PARAMS:
-    for distribution in DISTRIBUTIONS:
-      for (config_name, bloom_name, privacy_params, fr_extra,
-           regex_missing) in TEST_CONFIGS:
-        test_name = 'r-{}-{}-{}'.format(distribution, distr_params,
-                                        config_name)
-
-        params = (BLOOMFILTER_PARAMS[bloom_name]
-                  + PRIVACY_PARAMS[privacy_params]
-                  + tuple([int(num_values * fr_extra)])
-                  + tuple([MAP_REGEX_MISSING[regex_missing]]))
-
-        test_case = (test_name, distribution, num_values, num_clients,
-                     num_reports_per_client) + params
-        row_str = [str(element) for element in test_case]
-        rows.append(row_str)
-
-  for params in DEMO:
-    rows.append(params)
+  if(False): 
+    for (distr_params, num_values, num_clients,
+         num_reports_per_client) in DISTRIBUTION_PARAMS:
+      for distribution in DISTRIBUTIONS:
+        for (config_name, bloom_name, privacy_params, fr_extra,
+             regex_missing) in TEST_CONFIGS:
+          test_name = 'r-{}-{}-{}'.format(distribution, distr_params,
+                                          config_name)
+
+          params = (BLOOMFILTER_PARAMS[bloom_name]
+                    + PRIVACY_PARAMS[privacy_params]
+                    + tuple([int(num_values * fr_extra)])
+                    + tuple([MAP_REGEX_MISSING[regex_missing]]))
+
+          test_case = (test_name, distribution, num_values, num_clients,
+                       num_reports_per_client) + params
+          row_str = [str(element) for element in test_case]
+          rows.append(row_str)
+
+    for params in DEMO:
+      rows.append(params)
 
   # Association tests
   for (distr_params, num_values1, num_values2, num_clients,
@@ -121,7 +126,7 @@ def main(argv):
         test_name = 'a-{}-{}-{}'.format(distr_params, bloom_params,
                                         privacy_params)
 
-        params = (BLOOMFILTER_PARAMS[bloom_name]
+        params = (BLOOMFILTER_PARAMS[bloom_params]
                   + PRIVACY_PARAMS[privacy_params])
         test_case = (test_name, distr_params, num_values1, num_values2,
                      num_clients) + params

From d9831c4e617cb0ec365369aec6806da21600f7b0 Mon Sep 17 00:00:00 2001
From: Ananth Raghunathan <pseudorandom@google.com>
Date: Tue, 12 May 2015 16:07:08 -0700
Subject: [PATCH 08/67] Moving from l1 distance to total variation (t.v. = l1/2).

---
 tests/analyze_assoc.R |  8 ++++----
 tests/assoctest.html  | 12 ++++++------
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/tests/analyze_assoc.R b/tests/analyze_assoc.R
index 50f98c33..c97602b5 100755
--- a/tests/analyze_assoc.R
+++ b/tests/analyze_assoc.R
@@ -144,15 +144,15 @@ main <- function(opts) {
   print("PROC.TIME")
   time_taken <- proc.time() - ptm
   print(time_taken)
-  
+
   # Write metrics to metrics.csv
   metrics <- list(td_chisq = td_chisq[1][[1]][[1]],
                   ed_chisq = ed_chisq[1][[1]][[1]],
-                 l1d = l1d, time = time_taken[1])
+                 tv = l1d/2, time = time_taken[2])   # report l1 distance / 2
+                                                     # to be consistent with
+                                                     # histogram analysis
   filename <- file.path(opts$outdir, 'metrics.csv')
   write.csv(metrics, file = filename, row.names = FALSE)
 }
 
 if(!interactive()) {
-  main(opts)
-}
diff --git a/tests/assoctest.html b/tests/assoctest.html
index 80ef6515..c5004882 100644
--- a/tests/assoctest.html
+++ b/tests/assoctest.html
@@ -49,10 +49,10 @@ <h2>RAPPOR assoctest.sh</h2>
       <tr class="explain">
         <td></td>
         <td colspan=4>
-          d: distribution drawn from<br/>
+          d: distribution type<br/>
           u: total unique values<br/>
-          c: clients<br/>
-          v: values per client<br/>
+          u2: total unique values 2<br/>
+          c: number of reports/clients<br/>
         </td>
         <td colspan=6>
           k: report bits<br/>
@@ -63,7 +63,7 @@ <h2>RAPPOR assoctest.sh</h2>
         <td colspan=7>
           td_chisq: chisq test on true distr.<br/>
           ed_chisq: chisq test on est. distr.<br/>
-          l1d: l1 distance<br/>
+          tv: tot. var. distance<br/>
           rtime: R runtime<br/>
         </td>
       </tr>
@@ -73,8 +73,8 @@ <h2>RAPPOR assoctest.sh</h2>
 
         <td>d</td>
         <td>u</td>
+        <td>u2</td>
         <td>c</td>
-        <td>v</td>
 
         <td>k</td>
         <td>h</td>
@@ -85,7 +85,7 @@ <h2>RAPPOR assoctest.sh</h2>
 
         <td>td_chisq</td>
         <td>ed_chisq</td>
-        <td>l1d</td>
+        <td>tv</td>
         <td>rtime</td>
       </tr>
     </thead>

From 6754f2d9fde44c6a3791d83cf13630c66cdbf26a Mon Sep 17 00:00:00 2001
From: Ananth Raghunathan <pseudorandom@google.com>
Date: Mon, 18 May 2015 10:49:15 -0700
Subject: [PATCH 09/67] Stripping trailing whitespace in assoc_sim.R.

---
 tests/assoc_sim.R | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/tests/assoc_sim.R b/tests/assoc_sim.R
index 61ee822f..4ead3273 100755
--- a/tests/assoc_sim.R
+++ b/tests/assoc_sim.R
@@ -1,13 +1,13 @@
 #!/usr/bin/env Rscript
 #
 # Copyright 2015 Google Inc. All rights reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -17,7 +17,7 @@
 # Simulates inputs on which association analysis can be run.
 # Currently assoc_sim.R only supports 2 variables but can
 # be easily extended to support more.
-# 
+#
 # Usage:
 #       $ ./assoc_sim.R -n 1000
 # Inputs: uvals, params, reports, map, num, unif
@@ -54,11 +54,11 @@ if(!interactive()) {
     make_option(c("--extras", "-e"), default = TRUE,
                 help = "Does 1st map have spurious candidates?"),
     make_option(c("--distr", "-d"), default = "zipfg",
-                help = "Type of distribution. Choose between 
+                help = "Type of distribution. Choose between
                 {unif, poisson, poisson2}")
   )
   opts <- parse_args(OptionParser(option_list = option_list))
-}    
+}
 
 source("analysis/R/encode.R")
 source("analysis/R/decode.R")
@@ -106,12 +106,12 @@ SimulateReports <- function(N, uvals, params, distr, extras, truefile,
                             var1_num, var2_num,
                             mapfile, reportsfile) {
   # Compute true distribution
-  m <- params$m  
+  m <- params$m
 
   if (distr == "unif") {
     # Draw uniformly from 1 to 10
     v1_samples <- as.integer(runif(N, 1, 10))
-    
+
     # Pr[var2 = N + 1 | var1 = N] = 0.5
     # Pr[var2 = N     | var1 = N] = 0.5
     v2_samples <- v1_samples + sample(c(0, 1), N, replace = TRUE)
@@ -149,12 +149,12 @@ SimulateReports <- function(N, uvals, params, distr, extras, truefile,
     # (i.e., D1 in reverse)
     # var2 ~ D1 if var1 = even
     # var2 ~ D2 if var1 = odd
-    d1 <- sample(rep(1:var2_num, 
+    d1 <- sample(rep(1:var2_num,
                      RandomPartition(N, ComputePdf("zipf1.5", var2_num))))
     d2 <- (var2_num:1)[d1]
     v2_samples <- rep(1, N)
     v2_samples[v1_samples %% 2 == 0] <- d1[v1_samples %% 2 == 0]
-    v2_samples[v1_samples %% 2 == 1] <- d2[v1_samples %% 2 == 1] 
+    v2_samples[v1_samples %% 2 == 1] <- d2[v1_samples %% 2 == 1]
   }
 
   tmp_samples <- list(v1_samples, v2_samples)
@@ -191,7 +191,7 @@ SimulateReports <- function(N, uvals, params, distr, extras, truefile,
               row.names = TRUE, quote = FALSE)
   # Randomly assign cohorts in each dimension
   cohorts <- sample(1:m, N, replace = TRUE)
-  
+
   # Create and write map into mapfile_1.csv and mapfile_2.csv
   if (extras == TRUE) {
     # 1000 spurious candidates for mapfile_1.csv
@@ -203,7 +203,7 @@ SimulateReports <- function(N, uvals, params, distr, extras, truefile,
                 sep = ",", col.names = FALSE, na = "", quote = FALSE)
   write.table(map[[2]]$map_pos, file = paste(mapfile, "_2.csv", sep = ""),
               sep = ",", col.names = FALSE, na = "", quote = FALSE)
-  
+
   # Write reports into a csv file
   # Format:
   #     cohort, bloom filter var1, bloom filter var2
@@ -211,7 +211,7 @@ SimulateReports <- function(N, uvals, params, distr, extras, truefile,
     EncodeAll(samples[[i]], cohorts, map[[i]]$map, params))
   # Organize cohorts and reports into format
   write_matrix <- cbind(as.matrix(cohorts),
-                        as.matrix(lapply(reports[[1]], 
+                        as.matrix(lapply(reports[[1]],
                             function(x) paste(x, collapse = ""))),
                         as.matrix(lapply(reports[[2]],
                             function(x) paste(x, collapse = ""))))
@@ -221,7 +221,7 @@ SimulateReports <- function(N, uvals, params, distr, extras, truefile,
 
 main <- function(opts) {
   ptm <- proc.time()
-  
+
   if(is.null(opts$uvals)) {
     uvals = list(var1 = c("str1"), var2 = c("option1"))
   } else {
@@ -232,7 +232,7 @@ main <- function(opts) {
                   opts$extras,  opts$true,              # inputs
                   opts$var1_num,  opts$var2_num,        # inputs
                   opts$map, opts$reports)               # outputs
-  
+
   print("PROC.TIME")
   print(proc.time() - ptm)
 }

From 45ee2f80014140a43d3a4bf974f5976825fb335b Mon Sep 17 00:00:00 2001
From: Ananth Raghunathan <pseudorandom@google.com>
Date: Mon, 18 May 2015 15:02:43 -0700
Subject: [PATCH 10/67] Fixing inconsistencies in map objects.

---
 tests/analyze_assoc.R | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/tests/analyze_assoc.R b/tests/analyze_assoc.R
index c97602b5..8948e4ac 100755
--- a/tests/analyze_assoc.R
+++ b/tests/analyze_assoc.R
@@ -71,15 +71,9 @@ source("analysis/R/association.R")
 # TODO(pseudorandom): move this functionality to ReadMapFile
 ProcessMap <- function(map, params) {
   map$rmap <- map$map
-  split_map <- function(i, map_struct) {
-    numbits <- params$k
-    indices <- which(as.matrix(
-      map_struct[((i - 1) * numbits + 1):(i * numbits),]) == TRUE,
-      arr.ind = TRUE)
-    sparseMatrix(indices[, "row"], indices[, "col"],
-                 dims = c(numbits, max(indices[, "col"])))
-  }
-  map$map <- lapply(1:params$m, function(i) split_map(i, map$rmap))
+  map$map <- lapply(1:params$m, function(i)
+                          map$rmap[seq(from = (i - 1) * params$k + 1),
+                                   length.out = params$k),])
   map
 }
 
@@ -156,3 +150,5 @@ main <- function(opts) {
 }
 
 if(!interactive()) {
+  main(opts)
+}

From c9484c57eda478bf2bc30c0ce4332d23d0237795 Mon Sep 17 00:00:00 2001
From: Ananth Raghunathan <pseudorandom@google.com>
Date: Mon, 18 May 2015 15:51:58 -0700
Subject: [PATCH 11/67] Re-factoring regtest_spec.py to suit assoc better.

---
 tests/regtest_spec.py | 94 +++++++++++++++++++++++--------------------
 1 file changed, 50 insertions(+), 44 deletions(-)

diff --git a/tests/regtest_spec.py b/tests/regtest_spec.py
index 3f192fe7..98ca1fa1 100755
--- a/tests/regtest_spec.py
+++ b/tests/regtest_spec.py
@@ -41,17 +41,17 @@
     ('large', 10000, 100000000, 1),
 )
 
-DISTRIBUTION_PARAMS_ASSOC = (
+DISTRIBUTION_PARAMS_ASSOC = {
     # name, num unique values 1,
     # num unique values 2, num clients, values per client
-#    ('tiny', 100, 2, int(1e03), 1),  # test for insufficient data
-#    ('small', 100, 10, int(1e04), 1),
-    ('medium', 1000, 10, int(1e05), 1),
-    ('medium2', 1000, 2, int(1e05), 1),
-#    ('large', 10000, 10, int(1e06), 1),
-#    ('large2', 10000, 2, int(1e06), 1),
-#    ('largesquared', int(1e04), 100, int(1e06), 1),
-)
+    'tiny': (100, 2, int(1e03), 1),   # test for insufficient data
+    'small': (100, 10, int(1e04), 1),
+    'medium': (1000, 10, int(1e05), 1),
+    'medium2': (1000, 2, int(1e05), 1),
+    'large': (10000, 10, int(1e06), 1),
+    'large2': (10000, 2, int(1e06), 1),
+    'largesquared': (int(1e04), 100, int(1e06), 1),
+}
 
 # 'k, h, m' as in params file.
 BLOOMFILTER_PARAMS = {
@@ -59,13 +59,13 @@
     '8x32': (8, 2, 32),  # 32 cohorts, 8 bits each, 2 bits set in each
     '16x32': (16, 2, 32),  # 32 cohorts, 16 bits each, 2 bits set in each
     '8x128': (8, 2, 128),  # 128 cohorts, 8 bits each, 2 bits set in each
-#    '128x128': (128, 2, 128),  # 8 cohorts, 128 bits each, 2 bits set in each
+    '128x128': (128, 2, 128),  # 8 cohorts, 128 bits each, 2 bits set in each
 }
 
 # 'p, q, f' as in params file.
 PRIVACY_PARAMS = {
-#    'eps_1_1': (0.39, 0.61, 0.45),  # eps_1 = 1, eps_inf = 5:
-#    'eps_1_5': (0.225, 0.775, 0.0),  # eps_1 = 5, no eps_inf
+    'eps_1_1': (0.39, 0.61, 0.45),  # eps_1 = 1, eps_inf = 5:
+    'eps_1_5': (0.225, 0.775, 0.0),  # eps_1 = 5, no eps_inf
     'eps_verysmall': (0.125, 0.875, 0.125),
     'eps_small': (0.125, 0.875, 0.5),
 }
@@ -87,6 +87,17 @@
     ('over_x10', '8x128', 'eps_1_1', 10.0, '10%'),  # overshoot by x10
 ]
 
+# assoc test configuration ->
+#   (distribution params set, bloomfilter params set,
+#    privacy params set)
+# The test config runs a test suite that is the cross product of all the above
+# sets
+ASSOC_TEST_CONFIG = {
+    'distr': ('small', 'medium'),
+    'blooms': ('8x16', '8x32', '16x32'),
+    'privacy': ('eps_verysmall', 'eps_small'),
+}
+
 #
 # END TEST CONFIGURATION
 #
@@ -96,40 +107,35 @@ def main(argv):
   rows = []
 
   test_case = []
-  if(False): 
-    for (distr_params, num_values, num_clients,
-         num_reports_per_client) in DISTRIBUTION_PARAMS:
-      for distribution in DISTRIBUTIONS:
-        for (config_name, bloom_name, privacy_params, fr_extra,
-             regex_missing) in TEST_CONFIGS:
-          test_name = 'r-{}-{}-{}'.format(distribution, distr_params,
-                                          config_name)
-
-          params = (BLOOMFILTER_PARAMS[bloom_name]
-                    + PRIVACY_PARAMS[privacy_params]
-                    + tuple([int(num_values * fr_extra)])
-                    + tuple([MAP_REGEX_MISSING[regex_missing]]))
-
-          test_case = (test_name, distribution, num_values, num_clients,
-                       num_reports_per_client) + params
-          row_str = [str(element) for element in test_case]
-          rows.append(row_str)
-
-    for params in DEMO:
-      rows.append(params)
+  for (distr_params, num_values, num_clients,
+       num_reports_per_client) in DISTRIBUTION_PARAMS:
+    for distribution in DISTRIBUTIONS:
+      for (config_name, bloom_name, privacy_params, fr_extra,
+           regex_missing) in TEST_CONFIGS:
+        test_name = 'r-{}-{}-{}'.format(distribution, distr_params,
+                                        config_name)
+
+        params = (BLOOMFILTER_PARAMS[bloom_name]
+                  + PRIVACY_PARAMS[privacy_params]
+                  + tuple([int(num_values * fr_extra)])
+                  + tuple([MAP_REGEX_MISSING[regex_missing]]))
+
+        test_case = (test_name, distribution, num_values, num_clients,
+                     num_reports_per_client) + params
+        row_str = [str(element) for element in test_case]
+        rows.append(row_str)
+
+  for params in DEMO:
+    rows.append(params)
 
   # Association tests
-  for (distr_params, num_values1, num_values2, num_clients,
-       num_reports_per_client) in DISTRIBUTION_PARAMS_ASSOC:
-    for bloom_params in BLOOMFILTER_PARAMS:
-      for privacy_params in PRIVACY_PARAMS:
-        test_name = 'a-{}-{}-{}'.format(distr_params, bloom_params,
-                                        privacy_params)
-
-        params = (BLOOMFILTER_PARAMS[bloom_params]
-                  + PRIVACY_PARAMS[privacy_params])
-        test_case = (test_name, distr_params, num_values1, num_values2,
-                     num_clients) + params
+  for distr in ASSOC_TEST_CONFIG['distr']:
+    for blooms in ASSOC_TEST_CONFIG['blooms']:
+      for privacy in ASSOC_TEST_CONFIG['privacy']:
+        test_name = 'a-{}-{}-{}'.format(distr, blooms, privacy)
+        params = (BLOOMFILTER_PARAMS[blooms] +
+                  PRIVACY_PARAMS[privacy])
+        test_case = (test_name,) + DISTRIBUTION_PARAMS_ASSOC[distr] + params
         row_str = [str(element) for element in test_case]
         rows.append(row_str)
   # End of association tests

From b61f251582c3fb5861b37794fc7a6909e05d5e81 Mon Sep 17 00:00:00 2001
From: Ananth Raghunathan <pseudorandom@google.com>
Date: Mon, 18 May 2015 21:09:23 -0700
Subject: [PATCH 12/67] Modifications to allow the use of the new decode.

Quick mode only requires 2 LASSO calls for estimating std dev instead of 5.
Minor changes in several places including modifications to the test suite and
assoc test params.
---
 analysis/R/association.R    |  4 ++--
 analysis/R/decode.R         | 21 +++++++++++++--------
 assoctest.sh                | 24 +++++++++++-------------
 tests/analyze_assoc.R       |  5 +++--
 tests/assoc_sim.R           | 20 ++++++++++----------
 tests/assoctest.html        | 18 ++++++++----------
 tests/make_summary_assoc.py |  1 -
 tests/regtest_spec.py       | 23 ++++++++++++-----------
 8 files changed, 59 insertions(+), 57 deletions(-)

diff --git a/analysis/R/association.R b/analysis/R/association.R
index c5b23e26..dd3080fc 100644
--- a/analysis/R/association.R
+++ b/analysis/R/association.R
@@ -283,7 +283,7 @@ UpdateJointConditional <- function(cond_report_dist, joint_conditional = NULL) {
 
 ComputeDistributionEM <- function(reports, report_cohorts,
                                   maps, ignore_other = FALSE,
-                                  params,
+                                  params, quick = FALSE,
                                   marginals = NULL,
                                   estimate_var = FALSE) {
   # Computes the distribution of num_variables variables, where
@@ -322,7 +322,7 @@ ComputeDistributionEM <- function(reports, report_cohorts,
     variable_counts <- NULL
     if (is.null(marginals)) {
       variable_counts <- ComputeCounts(variable_report, variable_cohort, params)
-      marginal <- Decode(variable_counts, map$rmap, params)$fit
+      marginal <- Decode(variable_counts, map$rmap, params, quick)$fit
       if (nrow(marginal) == 0) {
         return (NULL)
       }
diff --git a/analysis/R/decode.R b/analysis/R/decode.R
index b965ebdd..4fae9d86 100644
--- a/analysis/R/decode.R
+++ b/analysis/R/decode.R
@@ -254,7 +254,7 @@ Resample <- function(e) {
   result
 }
 
-Decode <- function(counts, map, params, alpha = 0.05,
+Decode <- function(counts, map, params, quick = FALSE, alpha = 0.05,
                    correction = c("Bonferroni"), ...) {
   k <- params$k
   p <- params$p
@@ -280,27 +280,32 @@ Decode <- function(counts, map, params, alpha = 0.05,
          stds = es$stds[filter_cohorts, , drop = FALSE])
 
   coefs_all <- vector()
-
-  for(r in 1:5)
+  if(quick) {num_reps <- 2} else {num_reps <- 5}
+  for(r in 1:num_reps)
   {
     if(r > 1)
       e <- Resample(estimates_stds_filtered)
     else
       e <- estimates_stds_filtered
-
+    
     coefs_all <- rbind(coefs_all,
-                       FitDistribution(e, map[filter_bits, , drop = FALSE]))
+                       FitDistribution(e, map[filter_bits, , drop = FALSE]))  
   }
-
   coefs_ssd <- N * apply(coefs_all, 2, sd)  # compute sample standard deviations
   coefs_ave <- N * apply(coefs_all, 2, mean)
-
+  
   # Only select coefficients more than two standard deviations from 0. May
   # inflate empirical SD of the estimates.
   reported <- which(coefs_ave > 1E-6 + 2 * coefs_ssd)
-
+  
   mod <- list(coefs = coefs_ave[reported], stds = coefs_ssd[reported])
 
+#   Old code  ... 
+#     coefs_all <- FitDistribution(estimates_stds_filtered,
+#                                         map[filter_bits, , drop = FALSE])
+#     reported <- which(coefs_all > 1E-6)
+#     mod <- list(coefs = coefs_all[reported], stds = rep(0, length(reported)))
+
   if (correction == "Bonferroni") {
     alpha <- alpha / S
   }
diff --git a/assoctest.sh b/assoctest.sh
index e37a4f8c..61eec301 100755
--- a/assoctest.sh
+++ b/assoctest.sh
@@ -49,19 +49,17 @@ _setup-one-case() {
   local test_case=$1
 
   # Input parameters
-  local dist=$2
-  local num_unique_values=$3
-  local num_unique_values2=$4
-  local num_clients=$5
-  local values_per_client=$6
+  local num_unique_values=$2
+  local num_unique_values2=$3
+  local num_clients=$4
 
   # RAPPOR params
-  local num_bits=$6
-  local num_hashes=$7
-  local num_cohorts=$8
-  local p=$9
-  local q=${10}  # need curly braces to get the 10th arg
-  local f=${11}
+  local num_bits=$5
+  local num_hashes=$6
+  local num_cohorts=$7
+  local p=$8
+  local q=$9  # need curly braces to get the 10th arg
+  local f=${10}
 
   banner 'Setting up parameters and candidate files for '$test_case
 
@@ -85,7 +83,7 @@ _run-one-instance() {
 
   local case_dir=$ASSOCTEST_DIR/$test_case
 
-  read -r case_name case_descr num_unique_values num_unique_values2 \
+  read -r case_name num_unique_values num_unique_values2 \
     num_clients num_bits num_hashes num_cohorts p q f < $case_dir/spec.txt
 
   local instance_dir=$ASSOCTEST_DIR/$test_case/$test_instance
@@ -99,7 +97,7 @@ _run-one-instance() {
     -t $instance_dir/truedist.csv \
     -m $instance_dir/map \
     -n $num_clients \
-    --var1_num $num_unique_values \
+    --extras $num_unique_values \
     --var2_num $num_unique_values2
 
   local out_dir=${instance_dir}_report
diff --git a/tests/analyze_assoc.R b/tests/analyze_assoc.R
index 8948e4ac..c58cc345 100755
--- a/tests/analyze_assoc.R
+++ b/tests/analyze_assoc.R
@@ -72,7 +72,7 @@ source("analysis/R/association.R")
 ProcessMap <- function(map, params) {
   map$rmap <- map$map
   map$map <- lapply(1:params$m, function(i)
-                          map$rmap[seq(from = (i - 1) * params$k + 1),
+                          map$rmap[seq(from = ((i - 1) * params$k + 1),
                                    length.out = params$k),])
   map
 }
@@ -110,6 +110,7 @@ main <- function(opts) {
 
   joint_dist <- ComputeDistributionEM(reports, cohorts, map,
                                       ignore_other = TRUE,
+                                      quick = TRUE,
                                       params, marginals = NULL,
                                       estimate_var = FALSE)
 
@@ -142,7 +143,7 @@ main <- function(opts) {
   # Write metrics to metrics.csv
   metrics <- list(td_chisq = td_chisq[1][[1]][[1]],
                   ed_chisq = ed_chisq[1][[1]][[1]],
-                 tv = l1d/2, time = time_taken[2])   # report l1 distance / 2
+                 tv = l1d/2, time = time_taken[1])   # report l1 distance / 2
                                                      # to be consistent with
                                                      # histogram analysis
   filename <- file.path(opts$outdir, 'metrics.csv')
diff --git a/tests/assoc_sim.R b/tests/assoc_sim.R
index 4ead3273..3b8e89c5 100755
--- a/tests/assoc_sim.R
+++ b/tests/assoc_sim.R
@@ -47,15 +47,15 @@ if(!interactive()) {
                 help = "Filename *prefix* for map(s)"),
     make_option(c("--num", "-n"), default = 1e05,
                 help = "Number of reports"),
-    make_option(c("--var1_num", "-z"), default = 25,
+    make_option(c("--var1_num", "-z"), default = 40,
                 help = "Number of values for var1"),
     make_option(c("--var2_num", "-y"), default = 5,
                 help = "Number of values for var2"),
-    make_option(c("--extras", "-e"), default = TRUE,
-                help = "Does 1st map have spurious candidates?"),
-    make_option(c("--distr", "-d"), default = "zipfg",
+    make_option(c("--extras", "-e"), default = 1000,
+                help = "How many spurious candidates does the 1st map have?"),
+    make_option(c("--distr", "-d"), default = "zipf2",
                 help = "Type of distribution. Choose between
-                {unif, poisson, poisson2}")
+                {unif, poisson, poisson2, zipf2}")
   )
   opts <- parse_args(OptionParser(option_list = option_list))
 }
@@ -99,7 +99,7 @@ GetUniqueValsFromFile <- function(filename) {
 #         truefile = name of the file with true distribution
 #         var1_num = number of var1 candidates
 #         var2_num = number of var2 candidates
-#         *** CURRENTLY ONLY USEFUL IF DISTR = ZIPFG ***
+#         *** FOR ASSOCTEST TEST SUITE, USE ONLY ZIPF2 ***
 #         mapfile = file to write maps into (with .csv suffixes)
 #         reportsfile = file to write reports into (with .csv suffix)
 SimulateReports <- function(N, uvals, params, distr, extras, truefile,
@@ -134,7 +134,7 @@ SimulateReports <- function(N, uvals, params, distr, extras, truefile,
     v2_samples <- rep(1, N)
     v2_samples[v1_samples %% 2 == 0] <- pr25[v1_samples %% 2 == 0]
     v2_samples[v1_samples %% 2 == 1] <- pr75[v1_samples %% 2 == 1]
-  } else if (distr == "zipfg") {
+  } else if (distr == "zipf2") {
 
     # Zipfian over var1_num strings
     partition <- RandomPartition(N, ComputePdf("zipf1.5", var1_num))
@@ -193,9 +193,9 @@ SimulateReports <- function(N, uvals, params, distr, extras, truefile,
   cohorts <- sample(1:m, N, replace = TRUE)
 
   # Create and write map into mapfile_1.csv and mapfile_2.csv
-  if (extras == TRUE) {
-    # 1000 spurious candidates for mapfile_1.csv
-    len <- length(uvals[[1]]) + 1000
+  if (extras > 0) {
+    # spurious candidates for mapfile_1.csv
+    len <- length(uvals[[1]]) + as.numeric(extras)
     uvals[[1]] <- PadStrings(len, uvals[[1]])
   }
   map <- lapply(uvals, function(u) CreateMap(u, params))
diff --git a/tests/assoctest.html b/tests/assoctest.html
index c5004882..7fc6aff0 100644
--- a/tests/assoctest.html
+++ b/tests/assoctest.html
@@ -25,7 +25,7 @@ <h2>RAPPOR assoctest.sh</h2>
   <table cellspacing="0" cellpadding="5">
     <colgroup>
       <col span="1" class="highlight" />
-      <col span="4" />
+      <col span="3" />
       <col span="6" class="highlight" />
       <col span="4" />
     </colgroup>
@@ -35,7 +35,7 @@ <h2>RAPPOR assoctest.sh</h2>
         <td>
           Test Case
         </td>
-        <td colspan=4>
+        <td colspan=3>
           Input Params
         </td>
         <td colspan=6>
@@ -48,11 +48,10 @@ <h2>RAPPOR assoctest.sh</h2>
 
       <tr class="explain">
         <td></td>
-        <td colspan=4>
-          d: distribution type<br/>
-          u: total unique values<br/>
-          u2: total unique values 2<br/>
-          c: number of reports/clients<br/>
+        <td colspan=3>
+          e: number of extras<br/>
+          u2: number of unique vals in var2<br/>
+          n: number of reports/clients <br/>
         </td>
         <td colspan=6>
           k: report bits<br/>
@@ -71,10 +70,9 @@ <h2>RAPPOR assoctest.sh</h2>
       <tr class="subhead">
         <td></td>
 
-        <td>d</td>
-        <td>u</td>
+        <td>e</td>
         <td>u2</td>
-        <td>c</td>
+        <td>n</td>
 
         <td>k</td>
         <td>h</td>
diff --git a/tests/make_summary_assoc.py b/tests/make_summary_assoc.py
index dc16d3f1..2c959971 100755
--- a/tests/make_summary_assoc.py
+++ b/tests/make_summary_assoc.py
@@ -20,7 +20,6 @@
   <td></td>
   <td></td>
   <td></td>
-  <td></td>
 
   <!-- RAPPOR params -->
   <td></td>
diff --git a/tests/regtest_spec.py b/tests/regtest_spec.py
index 98ca1fa1..47feb470 100755
--- a/tests/regtest_spec.py
+++ b/tests/regtest_spec.py
@@ -43,14 +43,14 @@
 
 DISTRIBUTION_PARAMS_ASSOC = {
     # name, num unique values 1,
-    # num unique values 2, num clients, values per client
-    'tiny': (100, 2, int(1e03), 1),   # test for insufficient data
-    'small': (100, 10, int(1e04), 1),
-    'medium': (1000, 10, int(1e05), 1),
-    'medium2': (1000, 2, int(1e05), 1),
-    'large': (10000, 10, int(1e06), 1),
-    'large2': (10000, 2, int(1e06), 1),
-    'largesquared': (int(1e04), 100, int(1e06), 1),
+    # num unique values 2, num clients
+    'tiny': (100, 2, int(1e03)),   # test for insufficient data
+    'small': (100, 10, int(1e04)),
+    'medium': (1000, 10, int(1e05)),
+    'medium2': (1000, 2, int(1e05)),
+    'large': (10000, 10, int(1e06)),
+    'large2': (10000, 2, int(1e06)),
+    'largesquared': (int(1e04), 100, int(1e06)),
 }
 
 # 'k, h, m' as in params file.
@@ -93,9 +93,9 @@
 # The test config runs a test suite that is the cross product of all the above
 # sets
 ASSOC_TEST_CONFIG = {
-    'distr': ('small', 'medium'),
-    'blooms': ('8x16', '8x32', '16x32'),
-    'privacy': ('eps_verysmall', 'eps_small'),
+    'distr': ('small',),# 'medium'),
+    'blooms': ('8x16',), # '8x32', '16x32'),
+    'privacy': ('eps_verysmall',), # 'eps_small'),
 }
 
 #
@@ -132,6 +132,7 @@ def main(argv):
   for distr in ASSOC_TEST_CONFIG['distr']:
     for blooms in ASSOC_TEST_CONFIG['blooms']:
       for privacy in ASSOC_TEST_CONFIG['privacy']:
+        print distr, blooms, privacy
         test_name = 'a-{}-{}-{}'.format(distr, blooms, privacy)
         params = (BLOOMFILTER_PARAMS[blooms] +
                   PRIVACY_PARAMS[privacy])

From a7e69eba11287c662ab88f76d0d5d15e084ff6f3 Mon Sep 17 00:00:00 2001
From: Ananth Raghunathan <pseudorandom@google.com>
Date: Thu, 4 Jun 2015 15:16:11 -0700
Subject: [PATCH 13/67] Updates on association test suite.

- Contains code fragments to experiment with EM
- More metrics reported (support recovered in Decode for each var)
- Timing information (experimental)
- Default params in assoc_sim are modified to match use cases more closely
- Different metrics presented in association test suite results page
---
 analysis/R/association.R    | 157 +++++++++++++++++++++++++++++++++---
 tests/analyze_assoc.R       |  43 +++++-----
 tests/assoc_sim.R           |  27 ++++---
 tests/assoctest.html        |  26 +++---
 tests/make_summary_assoc.py |  28 ++++---
 tests/regtest_spec.py       |   6 +-
 6 files changed, 221 insertions(+), 66 deletions(-)

diff --git a/analysis/R/association.R b/analysis/R/association.R
index dd3080fc..d50bd490 100644
--- a/analysis/R/association.R
+++ b/analysis/R/association.R
@@ -137,6 +137,33 @@ GetJointConditionalProb <- function(cond_x, cond_y) {
   mapply("outer", cond_x, cond_y, SIMPLIFY = FALSE)
 }
 
+UpdatePij2 <- function(pij, reports, cohorts, cand_strs,
+                       params, map) {
+
+  accum <- array(0, dim(pij))
+  # For each report
+  for (i in seq(length(reports[[1]]))) {
+    # For each var
+    for (var in seq(length(reports))) {
+      idx <- cohorts[[var]][i]
+      rep <- GetCondProb(reports[[var]][[i]],
+                         candidate_strings = cand_strs[[var]],
+                         params = params,
+                         map[[var]]$map[[idx]], NULL)
+      if(var == 1) {
+        cond_joint_distr <- rep
+      } else {
+        cond_joint_distr <- outer(cond_joint_distr, rep)
+      }
+    }
+    z <- cond_joint_distr * pij
+    z <- z / sum(z)
+    z[is.nan(z)] <- 0
+    accum <- accum + z
+  }
+  accum / length(reports[[1]])
+}
+
 UpdatePij <- function(pij, cond_prob) {
   # Update the probability matrix based on the EM algorithm.
   #
@@ -155,6 +182,23 @@ UpdatePij <- function(pij, cond_prob) {
   Reduce("+", wcp) / length(wcp)
 }
 
+UpdatePij3 <- function(pij, cond_prob) {
+  wcp <- lapply(cond_prob, function(x) {
+    for (i in seq(length(x))) {
+      if (i == 1) {
+        op <- x[[i]]
+      } else {
+        op <- outer(op, x[[i]])
+      }
+    }
+    z <- op * pij
+    z <- z / sum(z)
+    z[is.nan(z)] <- 0
+    z
+  })
+  Reduce("+", wcp) / length(wcp)
+}
+
 NLL <- function(pij, cond_prob) {
   # Update the probability matrix based on the EM algorithm.
   #
@@ -186,6 +230,62 @@ ComputeVar <- function(cond_prob, est) {
   list(var_cov = var_cov, sd = sd, inform = inform)
 }
 
+EM2 <- function(reports, cohorts, cand_strs, starting_pij = NULL,
+                params, map,
+                max_iter = 1e03, epsilon = 1e-06) {
+  
+  # State space is the product of lengths.
+  state_space <- sapply(cand_strs, "length")
+  pij <- array()
+  if(is.null(starting_pij)) {
+    pij <- array(1 / prod(state_space), state_space)
+  } else {
+    pij <- starting_pij
+  }
+
+  if (nrow(pij) > 0) {
+    # Run EM
+    for (i in 1:max_iter) {
+      pij_new <- UpdatePij2(pij, reports, cohorts, cand_strs,
+                        params, map)
+      diff <- max(abs(pij_new - pij))
+      pij <- pij_new
+      if (diff < epsilon) {
+        break
+      }
+    }
+  }
+  list(hist = pij)
+}
+
+EM3 <- function(cond_prob, starting_pij = NULL, estimate_var = FALSE,
+                max_iter = 1e03, epsilon = 1e-06, verbose = FALSE) {
+  pij <- list()
+  
+  # Compute dimensions of conditional distributions.
+  state_space <- sapply(cond_prob[[1]], length)
+  if (is.null(starting_pij)) {
+    pij <- array(1 / prod(state_space), state_space)
+  } else {
+    pij <- starting_pij
+  }
+  if (nrow(pij) > 0) {
+    # Run EM
+    for (i in 1:max_iter) {
+      if (i == 1) {
+        ptm_iter <- proc.time()
+      }
+      pij_new <- UpdatePij3(pij, cond_prob)
+      diff <- max(abs(pij_new - pij))
+      pij <- pij_new
+      if (diff < epsilon) {
+        break
+      }
+    }
+  }
+  list(est = pij, hist = pij, sd = 0)
+}
+
 EM <- function(cond_prob, starting_pij = NULL, estimate_var = FALSE,
                max_iter = 1000, epsilon = 10^-6, verbose = FALSE) {
   # Performs estimation.
@@ -213,8 +313,15 @@ EM <- function(cond_prob, starting_pij = NULL, estimate_var = FALSE,
   if (nrow(pij[[1]]) > 0) {
     # Run EM
     for (i in 1:max_iter) {
+      if (i == 1) {
+        ptm_iter <- proc.time()
+      }
       pij[[i + 1]] <- UpdatePij(pij[[i]], cond_prob)
       dif <- max(abs(pij[[i + 1]] - pij[[i]]))
+      if (i == 1) {
+        print("ONE ITERATION")
+        print(proc.time() - ptm_iter)
+      }
       if (dif < epsilon) {
         break
       }
@@ -285,7 +392,8 @@ ComputeDistributionEM <- function(reports, report_cohorts,
                                   maps, ignore_other = FALSE,
                                   params, quick = FALSE,
                                   marginals = NULL,
-                                  estimate_var = FALSE) {
+                                  estimate_var = FALSE,
+                                  new_alg = FALSE) {
   # Computes the distribution of num_variables variables, where
   #     num_variables is chosen by the client, using the EM algorithm.
   #
@@ -312,17 +420,22 @@ ComputeDistributionEM <- function(reports, report_cohorts,
   # Compute the counts for each variable and then do conditionals.
   joint_conditional = NULL
   found_strings <- list()
-
+  cd_for_reports <- list()
+  
   for (j in (1:num_variables)) {
+    ptm <- proc.time()
     variable_report <- reports[[j]]
     variable_cohort <- report_cohorts[[j]]
     map <- maps[[j]]
-
+    
     # Compute the probability of the "other" category
     variable_counts <- NULL
     if (is.null(marginals)) {
+      ptm2 <- proc.time()
       variable_counts <- ComputeCounts(variable_report, variable_cohort, params)
       marginal <- Decode(variable_counts, map$rmap, params, quick)$fit
+      print("TIME IN MARGINALS")
+      print(proc.time() - ptm2)
       if (nrow(marginal) == 0) {
         return (NULL)
       }
@@ -353,17 +466,39 @@ ComputeDistributionEM <- function(reports, report_cohorts,
                          prob_other[[idx]])
       rep
     })
-
-    # Update the joint conditional distribution of all variables
-    joint_conditional <- UpdateJointConditional(cond_report_dist,
+    
+    if(new_alg) {
+      # Report conditional distributions as lists
+      if (j == 1) {
+        # Conditional distribution for reports
+        joint_conditional <- lapply(cond_report_dist, "list")
+      } else {
+        joint_conditional <- mapply(function (x, y) c(x, list(y)),
+                                 joint_conditional, cond_report_dist,
+                                 SIMPLIFY = FALSE)
+      }
+    } else {
+      # Update the joint conditional distribution of all variables
+      joint_conditional <- UpdateJointConditional(cond_report_dist,
                                                 joint_conditional)
+    }
+    print("TIME IN COND_REPORT_DIST")
+    print(proc.time()-ptm)
   }
-
+  
+  ptm <- proc.time()
   # Run expectation maximization to find joint distribution
-  em <- EM(joint_conditional, epsilon = 10 ^ -6, verbose = FALSE,
+  if (new_alg) {
+    funct <- EM3
+  } else {
+    funct <- EM
+  }
+  em <- funct(joint_conditional, epsilon = 10 ^ -5, verbose = FALSE,
            estimate_var = estimate_var)
+  print("TIME IN EM")
+  print(proc.time() - ptm)
   dimnames(em$est) <- found_strings
+  
   # Return results in a usable format
-  list(fit = em$est, sd = em$sd, em = em)
-
-}
+  list(orig = list(fit = em$est, sd = em$sd, em = em))
+}
\ No newline at end of file
diff --git a/tests/analyze_assoc.R b/tests/analyze_assoc.R
index c58cc345..ade385c0 100755
--- a/tests/analyze_assoc.R
+++ b/tests/analyze_assoc.R
@@ -48,7 +48,9 @@ if(!interactive()) {
     make_option(c("--outdir", "-o"), default = ".",
                 help = "File where the metrics go"),
     make_option(c("--params", "-p"), default = "params.csv",
-                help = "Filename for RAPPOR parameters")
+                help = "Filename for RAPPOR parameters"),
+    make_option(c("--newalg", "-a"), default = FALSE,
+                help = "Flag to run new EM3 algorithm or not")
   )
   opts <- parse_args(OptionParser(option_list = option_list))
 }
@@ -112,28 +114,19 @@ main <- function(opts) {
                                       ignore_other = TRUE,
                                       quick = TRUE,
                                       params, marginals = NULL,
-                                      estimate_var = FALSE)
+                                      estimate_var = FALSE,
+                                      new_alg = opts$newalg)
 
   td <- read.csv(file = opts$truefile)
-  ed <- joint_dist$fit
+  ed <- joint_dist$orig$fit
   print("CHI-SQUARED")
   td_chisq <- chisq.test(td)
   ed_chisq <- chisq.test(ed)
   print(td_chisq)
   print(ed_chisq)
 
-  # L1 distance = 1 - sum(min(td|x, ed|x)) where
-  # td|x / ed|x projects the distribution to the intersection x of the
-  # supports of td and ed
-  rowsi <- intersect(rownames(td), rownames(ed))
-  colsi <- intersect(colnames(td), colnames(ed))
-  print("L1 DISTANCE")
-  l1d <- 1 - sum(mapply(min,
-                  unlist(td[rowsi, colsi], use.names = FALSE),
-                  unlist(as.data.frame(ed)[rowsi, colsi], use.names = FALSE)
-                   ))
-  print(l1d)
-
+  print(l1d(td, ed, "L1 DISTANCE"))
+  
   print("JOINT_DIST$FIT")
   print(signif(ed[order(rowSums(ed)),], 4))
   print("PROC.TIME")
@@ -141,15 +134,29 @@ main <- function(opts) {
   print(time_taken)
 
   # Write metrics to metrics.csv
+  # Report l1 distance / 2 to be consistent with histogram analysis
   metrics <- list(td_chisq = td_chisq[1][[1]][[1]],
                   ed_chisq = ed_chisq[1][[1]][[1]],
-                 tv = l1d/2, time = time_taken[1])   # report l1 distance / 2
-                                                     # to be consistent with
-                                                     # histogram analysis
+                 tv = l1d(td, ed, "L1 DISTANCE")/2,
+                 time = time_taken[1],
+                 dim1 = dim(ed)[[1]],
+                 dim2 = dim(ed)[[2]])               
   filename <- file.path(opts$outdir, 'metrics.csv')
   write.csv(metrics, file = filename, row.names = FALSE)
 }
 
+# L1 distance = 1 - sum(min(df1|x, df2|x)) where
+# df1|x / df2|x projects the distribution to the intersection x of the
+# supports of df1 and df2
+l1d <- function(df1, df2, statement = "L1 DISTANCE") {
+  rowsi <- intersect(rownames(df1), rownames(df2))
+  colsi <- intersect(colnames(df1), colnames(df2))
+  print(statement)
+  1 - sum(mapply(min, 
+                 unlist(as.data.frame(df1)[rowsi, colsi], use.names = FALSE),
+                 unlist(as.data.frame(df2)[rowsi, colsi], use.names = FALSE)))
+}
+
 if(!interactive()) {
   main(opts)
 }
diff --git a/tests/assoc_sim.R b/tests/assoc_sim.R
index 3b8e89c5..e93918e4 100755
--- a/tests/assoc_sim.R
+++ b/tests/assoc_sim.R
@@ -47,19 +47,25 @@ if(!interactive()) {
                 help = "Filename *prefix* for map(s)"),
     make_option(c("--num", "-n"), default = 1e05,
                 help = "Number of reports"),
-    make_option(c("--var1_num", "-z"), default = 40,
+    make_option(c("--var1_num", "-z"), default = 100,
                 help = "Number of values for var1"),
-    make_option(c("--var2_num", "-y"), default = 5,
+    make_option(c("--var2_num", "-y"), default = 20,
                 help = "Number of values for var2"),
-    make_option(c("--extras", "-e"), default = 1000,
+    make_option(c("--extras", "-e"), default = 1e05,
                 help = "How many spurious candidates does the 1st map have?"),
     make_option(c("--distr", "-d"), default = "zipf2",
                 help = "Type of distribution. Choose between
-                {unif, poisson, poisson2, zipf2}")
+                {unif, poisson, poisson2, zipf2}"),
+    make_option(c("--prefix", "-x"), default = "./",
+                help = "Path to prefix all default files")
   )
   opts <- parse_args(OptionParser(option_list = option_list))
 }
 
+apply_prefix <- function(path) {
+  paste(opts$prefix, path, sep = "")
+}
+
 source("analysis/R/encode.R")
 source("analysis/R/decode.R")
 source("analysis/R/simulation.R")
@@ -225,13 +231,14 @@ main <- function(opts) {
   if(is.null(opts$uvals)) {
     uvals = list(var1 = c("str1"), var2 = c("option1"))
   } else {
-    uvals <- GetUniqueValsFromFile(opts$uvals)
+    uvals <- GetUniqueValsFromFile(apply_prefix(opts$uvals))
   }
-  params <- ReadParameterFile(opts$params)
-  SimulateReports(opts$num, uvals, params,  opts$distr, # inuts
-                  opts$extras,  opts$true,              # inputs
-                  opts$var1_num,  opts$var2_num,        # inputs
-                  opts$map, opts$reports)               # outputs
+  params <- ReadParameterFile(apply_prefix(opts$params))
+  SimulateReports(opts$num, uvals, params,  opts$distr,   # inuts
+                  opts$extras,  apply_prefix(opts$true),  # inputs
+                  opts$var1_num,  opts$var2_num,          # inputs
+                  apply_prefix(opts$map),
+                  apply_prefix(opts$reports))             # outputs
 
   print("PROC.TIME")
   print(proc.time() - ptm)
diff --git a/tests/assoctest.html b/tests/assoctest.html
index 7fc6aff0..38e5abac 100644
--- a/tests/assoctest.html
+++ b/tests/assoctest.html
@@ -25,9 +25,9 @@ <h2>RAPPOR assoctest.sh</h2>
   <table cellspacing="0" cellpadding="5">
     <colgroup>
       <col span="1" class="highlight" />
-      <col span="3" />
+      <col span="1" />
       <col span="6" class="highlight" />
-      <col span="4" />
+      <col span="6" />
     </colgroup>
 
     <thead>
@@ -35,23 +35,21 @@ <h2>RAPPOR assoctest.sh</h2>
         <td>
           Test Case
         </td>
-        <td colspan=3>
+        <td colspan=1>
           Input Params
         </td>
         <td colspan=6>
           RAPPOR Params
         </td>
-        <td colspan=4>
+        <td colspan=6>
           Result Metrics
         </td>
       </tr>
 
       <tr class="explain">
         <td></td>
-        <td colspan=3>
-          e: number of extras<br/>
-          u2: number of unique vals in var2<br/>
-          n: number of reports/clients <br/>
+        <td colspan=1>
+          n: num reports<br/>
         </td>
         <td colspan=6>
           k: report bits<br/>
@@ -59,9 +57,11 @@ <h2>RAPPOR assoctest.sh</h2>
           m: cohorts<br/>
           p, q, f: probabilities<br/>
         </td>
-        <td colspan=7>
-          td_chisq: chisq test on true distr.<br/>
-          ed_chisq: chisq test on est. distr.<br/>
+        <td colspan=6>
+          d1: dimension of var1 solutions. <br />
+          d2: dimension of var2 solutions. <br />
+          td_chisq: chisq test on true distr.<br />
+          ed_chisq: chisq test on est. distr.<br />
           tv: tot. var. distance<br/>
           rtime: R runtime<br/>
         </td>
@@ -70,8 +70,6 @@ <h2>RAPPOR assoctest.sh</h2>
       <tr class="subhead">
         <td></td>
 
-        <td>e</td>
-        <td>u2</td>
         <td>n</td>
 
         <td>k</td>
@@ -81,6 +79,8 @@ <h2>RAPPOR assoctest.sh</h2>
         <td>q</td>
         <td>f</td>
 
+        <td>d1</td>
+        <td>d2</td>
         <td>td_chisq</td>
         <td>ed_chisq</td>
         <td>tv</td>
diff --git a/tests/make_summary_assoc.py b/tests/make_summary_assoc.py
index 2c959971..59a4f247 100755
--- a/tests/make_summary_assoc.py
+++ b/tests/make_summary_assoc.py
@@ -1,7 +1,7 @@
 #!/usr/bin/python
 """Given a regtest result tree, prints an HTML summary on stdout.
 
-See HTML skeleton in tests/regtest.html.
+See HTML skeleton in tests/assoctest.html.
 """
 
 import os
@@ -18,8 +18,6 @@
 
   <!-- input params -->
   <td></td>
-  <td></td>
-  <td></td>
 
   <!-- RAPPOR params -->
   <td></td>
@@ -31,6 +29,8 @@
 
   <!-- Result metrics -->
   <td></td>
+  <td></td>
+  <td></td>
   <td>%(mean_chisqdiff)s</td>
   <td>%(mean_l1d)s</td>
   <td>%(mean_rtime)s</td>
@@ -133,7 +133,7 @@ def ParseSpecFile(spec_filename):
   with open(spec_filename) as s:
     spec_row = s.readline().split()
 
-  spec_in_html = ' '.join('<td>%s</td>' % cell for cell in spec_row[1:])
+  spec_in_html = ' '.join('<td>%s</td>' % cell for cell in spec_row[3:])
 
   return spec_in_html
 
@@ -169,7 +169,7 @@ def ParseMetrics(metrics_file, log_file):
     m.readline()
     metrics_row = m.readline().split(',')
 
-  (td_chisq, ed_chisq, l1d, rtime) = metrics_row
+  (td_chisq, ed_chisq, l1d, rtime, d1, d2) = metrics_row
 
   td_chisq = float(td_chisq)
   ed_chisq = float(ed_chisq)
@@ -180,16 +180,20 @@ def ParseMetrics(metrics_file, log_file):
   elapsed_time = ExtractTime(log_file)
 
   metrics_row_str = [
-      str(td_chisq),
-      str(ed_chisq),
-      str(l1d),
-      str(rtime),
+    '%s' % d1,
+    '%s' % d2,
+    '%.3f' % td_chisq,
+    '%.3f' % ed_chisq,
+    '%.3f' % l1d,
+    str(rtime),
   ]
 
   metrics_row_dict = {
-      'l1d': [l1d],
-      'rtime': [rtime],
-      'chisqdiff': [abs(td_chisq - ed_chisq)],
+    'd1': [d1],
+    'd2': [d2],
+    'l1d': [l1d],
+    'rtime': [rtime],
+    'chisqdiff': [abs(td_chisq - ed_chisq)],
   }
 
   # return metrics formatted as HTML table entries
diff --git a/tests/regtest_spec.py b/tests/regtest_spec.py
index 47feb470..e4458b8e 100755
--- a/tests/regtest_spec.py
+++ b/tests/regtest_spec.py
@@ -46,6 +46,8 @@
     # num unique values 2, num clients
     'tiny': (100, 2, int(1e03)),   # test for insufficient data
     'small': (100, 10, int(1e04)),
+    'fizz': (100, 20, int(1e05)),
+    'fizzbool': (100, 2, int(1e05)),
     'medium': (1000, 10, int(1e05)),
     'medium2': (1000, 2, int(1e05)),
     'large': (10000, 10, int(1e06)),
@@ -93,9 +95,9 @@
 # The test config runs a test suite that is the cross product of all the above
 # sets
 ASSOC_TEST_CONFIG = {
-    'distr': ('small',),# 'medium'),
+    'distr': ('fizz', 'fizzbool'),# 'medium'),
     'blooms': ('8x16',), # '8x32', '16x32'),
-    'privacy': ('eps_verysmall',), # 'eps_small'),
+    'privacy': ('eps_small',), # 'eps_small'),
 }
 
 #

From 19d7f9318a3cb7c812744798d755e894c650df88 Mon Sep 17 00:00:00 2001
From: Ananth Raghunathan <pseudorandom@google.com>
Date: Thu, 4 Jun 2015 15:22:09 -0700
Subject: [PATCH 14/67] Adding a couple more specs to test.

---
 tests/regtest_spec.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/tests/regtest_spec.py b/tests/regtest_spec.py
index e4458b8e..0e7de91e 100755
--- a/tests/regtest_spec.py
+++ b/tests/regtest_spec.py
@@ -46,8 +46,12 @@
     # num unique values 2, num clients
     'tiny': (100, 2, int(1e03)),   # test for insufficient data
     'small': (100, 10, int(1e04)),
+    'fizz-tiny': (100, 20, int(1e03)),
+    'fizz-tiny-bool': (100, 2, int(1e03)),
+    'fizz-small': (100, 20, int(1e04)),
+    'fizz-small-bool': (100, 2, int(1e04)),
     'fizz': (100, 20, int(1e05)),
-    'fizzbool': (100, 2, int(1e05)),
+    'fizz-bool': (100, 2, int(1e05)),
     'medium': (1000, 10, int(1e05)),
     'medium2': (1000, 2, int(1e05)),
     'large': (10000, 10, int(1e06)),
@@ -95,7 +99,8 @@
 # The test config runs a test suite that is the cross product of all the above
 # sets
 ASSOC_TEST_CONFIG = {
-    'distr': ('fizz', 'fizzbool'),# 'medium'),
+    'distr': ('fizz-tiny', 'fizz-tiny-bool',
+              'fizz-small', 'fizz-small-bool',),# 'medium'),
     'blooms': ('8x16',), # '8x32', '16x32'),
     'privacy': ('eps_small',), # 'eps_small'),
 }

From d15179156b47bcfb53cba51147e433a86968f6e5 Mon Sep 17 00:00:00 2001
From: Ananth Raghunathan <pseudorandom@google.com>
Date: Mon, 8 Jun 2015 17:02:20 -0700
Subject: [PATCH 15/67] Updating association to work with 3 variables.

---
 assoctest.sh          |  3 ++-
 tests/analyze_assoc.R | 49 +++++++++++++++++++++++++++----------------
 tests/assoc_sim.R     | 26 ++++++++++++++++-------
 tests/regtest_spec.py | 12 +++++++----
 4 files changed, 60 insertions(+), 30 deletions(-)

diff --git a/assoctest.sh b/assoctest.sh
index 61eec301..947c33bd 100755
--- a/assoctest.sh
+++ b/assoctest.sh
@@ -97,7 +97,7 @@ _run-one-instance() {
     -t $instance_dir/truedist.csv \
     -m $instance_dir/map \
     -n $num_clients \
-    --extras $num_unique_values \
+    --var1_num $num_unique_values \
     --var2_num $num_unique_values2
 
   local out_dir=${instance_dir}_report
@@ -111,6 +111,7 @@ _run-one-instance() {
     tests/analyze_assoc.R \
       --map1 $instance_dir/map_1.csv \
       --map2 $instance_dir/map_2.csv \
+      --map3 $instance_dir/map_3.csv \
       --reports $instance_dir/reports.csv \
       --truefile $instance_dir/truedist.csv \
       --outdir $out_dir \
diff --git a/tests/analyze_assoc.R b/tests/analyze_assoc.R
index ade385c0..4e6af972 100755
--- a/tests/analyze_assoc.R
+++ b/tests/analyze_assoc.R
@@ -37,12 +37,14 @@ options(stringsAsFactors = FALSE)
 if(!interactive()) {
   option_list <- list(
     # Flags
-    make_option(c("--map1", "-m1"), default = "map_1.csv",
+    make_option(c("--map1"), default = "map_1.csv",
                 help = "Hashed candidates for 1st variable"),
-    make_option(c("--map2", "-m2"), default = "map_2.csv",
+    make_option(c("--map2"), default = "map_2.csv",
                 help = "Hashed candidates for 2nd variable"),
+    make_option(c("--map3"), default = "map_3.csv",
+                help = "Hashed candidates for 3rd variable"),
     make_option(c("--reports", "-r"), default = "reports.csv",
-                help = "File with raw reports as <cohort, report1, report2>"),
+                help = "File with raw reports as <cohort, report1, report2, ...>"),
     make_option(c("--truefile", "-t"), default = "truedist.csv",
                 help = "File with true distribution generated by assoc_sim.R"),
     make_option(c("--outdir", "-o"), default = ".",
@@ -83,26 +85,27 @@ main <- function(opts) {
   ptm <- proc.time()
 
   params <- ReadParameterFile(opts$params)
-  opts_map <- list(opts$map1, opts$map2)
+  opts_map <- list(opts$map1, opts$map2, opts$map3)
   map <- lapply(opts_map, function(o)
                   ProcessMap(ReadMapFile(o, params = params),
                              params = params))
   # Reports must be of the format
   #     cohort no, rappor bitstring 1, rappor bitstring 2
   reportsObj <- read.csv(opts$reports,
-                         colClasses = c("integer", "character", "character"),
+                         colClasses = c("integer", "character",
+                                        "character", "character"),
                          header = FALSE)
 
   # Parsing reportsObj
   # ComputeDistributionEM allows for different sets of cohorts
   # for each variable. Here, both sets of cohorts are identical
   co <- as.list(reportsObj[1])[[1]]
-  cohorts <- list(co, co)
-  # Parse reports from reportObj cols 2 and 3
-  reports <- lapply(1:2, function(x) as.list(reportsObj[x + 1]))
+  cohorts <- list(co, co, co)
+  # Parse reports from reportObj cols 2, 3, and 4
+  reports <- lapply(1:3, function(x) as.list(reportsObj[x + 1]))
 
   # Split strings into bit arrays (as required by assoc analysis)
-  reports <- lapply(1:2, function(i) {
+  reports <- lapply(1:3, function(i) {
     # apply the following function to each of reports[[1]] and reports[[2]]
     lapply(reports[[i]][[1]], function(x) {
       # function splits strings and converts them to numeric values
@@ -117,30 +120,40 @@ main <- function(opts) {
                                       estimate_var = FALSE,
                                       new_alg = opts$newalg)
 
+  
   td <- read.csv(file = opts$truefile)
   ed <- joint_dist$orig$fit
+  if(length(reports) == 3) {
+    ed <- as.data.frame(ed) 
+  }
+  
+  # We can see if chi-squared tests show different results
+  # for estimated vs real distribution
   print("CHI-SQUARED")
   td_chisq <- chisq.test(td)
   ed_chisq <- chisq.test(ed)
   print(td_chisq)
   print(ed_chisq)
-
   print(l1d(td, ed, "L1 DISTANCE"))
-  
+  l1d_metric <- l1d(td, ed, "")
   print("JOINT_DIST$FIT")
   print(signif(ed[order(rowSums(ed)),], 4))
+  td_metric <- td_chisq[1][[1]][[1]]
+  ed_metric <- ed_chisq[1][[1]][[1]]
+  
   print("PROC.TIME")
   time_taken <- proc.time() - ptm
   print(time_taken)
-
+  
+  metrics <- list(td_chisq = td_metric,
+                  ed_chisq = ed_metric,
+                  tv = l1d_metric/2,
+                  time = time_taken[1],
+                  dim1 = dim(ed)[[2]],
+                  dim2 = dim(ed)[[1]])
+  
   # Write metrics to metrics.csv
   # Report l1 distance / 2 to be consistent with histogram analysis
-  metrics <- list(td_chisq = td_chisq[1][[1]][[1]],
-                  ed_chisq = ed_chisq[1][[1]][[1]],
-                 tv = l1d(td, ed, "L1 DISTANCE")/2,
-                 time = time_taken[1],
-                 dim1 = dim(ed)[[1]],
-                 dim2 = dim(ed)[[2]])               
   filename <- file.path(opts$outdir, 'metrics.csv')
   write.csv(metrics, file = filename, row.names = FALSE)
 }
diff --git a/tests/assoc_sim.R b/tests/assoc_sim.R
index e93918e4..a4e82c6d 100755
--- a/tests/assoc_sim.R
+++ b/tests/assoc_sim.R
@@ -53,7 +53,7 @@ if(!interactive()) {
                 help = "Number of values for var2"),
     make_option(c("--extras", "-e"), default = 1e05,
                 help = "How many spurious candidates does the 1st map have?"),
-    make_option(c("--distr", "-d"), default = "zipf2",
+    make_option(c("--distr", "-d"), default = "zipf3",
                 help = "Type of distribution. Choose between
                 {unif, poisson, poisson2, zipf2}"),
     make_option(c("--prefix", "-x"), default = "./",
@@ -105,7 +105,7 @@ GetUniqueValsFromFile <- function(filename) {
 #         truefile = name of the file with true distribution
 #         var1_num = number of var1 candidates
 #         var2_num = number of var2 candidates
-#         *** FOR ASSOCTEST TEST SUITE, USE ONLY ZIPF2 ***
+#         *** FOR ASSOCTEST TEST SUITE, USE ONLY ZIPF2 / ZIPF3 ***
 #         mapfile = file to write maps into (with .csv suffixes)
 #         reportsfile = file to write reports into (with .csv suffix)
 SimulateReports <- function(N, uvals, params, distr, extras, truefile,
@@ -140,7 +140,7 @@ SimulateReports <- function(N, uvals, params, distr, extras, truefile,
     v2_samples <- rep(1, N)
     v2_samples[v1_samples %% 2 == 0] <- pr25[v1_samples %% 2 == 0]
     v2_samples[v1_samples %% 2 == 1] <- pr75[v1_samples %% 2 == 1]
-  } else if (distr == "zipf2") {
+  } else if (distr == "zipf2" || distr == "zipf3") {
 
     # Zipfian over var1_num strings
     partition <- RandomPartition(N, ComputePdf("zipf1.5", var1_num))
@@ -159,11 +159,18 @@ SimulateReports <- function(N, uvals, params, distr, extras, truefile,
                      RandomPartition(N, ComputePdf("zipf1.5", var2_num))))
     d2 <- (var2_num:1)[d1]
     v2_samples <- rep(1, N)
+    v3_samples <- rep(1, N)
     v2_samples[v1_samples %% 2 == 0] <- d1[v1_samples %% 2 == 0]
     v2_samples[v1_samples %% 2 == 1] <- d2[v1_samples %% 2 == 1]
+    if(distr == "zipf3") {
+      bool1 <- rbinom(N, 1, 0.25) + rep(1, N)
+      bool2 <- rbinom(N, 1, 0.75) + rep(1, N)
+      v3_samples[v1_samples %% 2 == 0] <- bool1[v1_samples %% 2 == 0]
+      v3_samples[v1_samples %% 2 == 1] <- bool2[v1_samples %% 2 == 1]
+    }
   }
 
-  tmp_samples <- list(v1_samples, v2_samples)
+  tmp_samples <- list(v1_samples, v2_samples, v3_samples)
 
   # Function to pad strings to uval_vec if sample_vec has
   # larger support than the number of strings in uval_vec
@@ -186,12 +193,13 @@ SimulateReports <- function(N, uvals, params, distr, extras, truefile,
   # Pad and update uvals
   uvals <- lapply(1:2, function(i) PadStrings(tmp_samples[[i]],
                                               uvals[[i]]))
+  uvals[[3]] <- c("true", "false")
   # Replace integers in tmp_samples with actual sample strings
-  samples <- lapply(1:2, function(i) uvals[[i]][tmp_samples[[i]]])
+  samples <- lapply(1:3, function(i) uvals[[i]][tmp_samples[[i]]])
 
   print("TRUE DISTR")
   td <- table(samples)/sum(table(samples))
-  td <- td[order(rowSums(td), decreasing = TRUE),]
+  td <- td[order(rowSums(td), decreasing = TRUE),,]
   print(td)
   write.table(td, file = truefile, sep = ",", col.names = TRUE,
               row.names = TRUE, quote = FALSE)
@@ -209,17 +217,21 @@ SimulateReports <- function(N, uvals, params, distr, extras, truefile,
                 sep = ",", col.names = FALSE, na = "", quote = FALSE)
   write.table(map[[2]]$map_pos, file = paste(mapfile, "_2.csv", sep = ""),
               sep = ",", col.names = FALSE, na = "", quote = FALSE)
+  write.table(map[[3]]$map_pos, file = paste(mapfile, "_3.csv", sep = ""),
+              sep = ",", col.names = FALSE, na = "", quote = FALSE)
 
   # Write reports into a csv file
   # Format:
   #     cohort, bloom filter var1, bloom filter var2
-  reports <- lapply(1:2, function(i)
+  reports <- lapply(1:3, function(i)
     EncodeAll(samples[[i]], cohorts, map[[i]]$map, params))
   # Organize cohorts and reports into format
   write_matrix <- cbind(as.matrix(cohorts),
                         as.matrix(lapply(reports[[1]],
                             function(x) paste(x, collapse = ""))),
                         as.matrix(lapply(reports[[2]],
+                            function(x) paste(x, collapse = ""))),
+                        as.matrix(lapply(reports[[3]],
                             function(x) paste(x, collapse = ""))))
   write.table(write_matrix, file = reportsfile, quote = FALSE,
               row.names = FALSE, col.names = FALSE, sep = ",")
diff --git a/tests/regtest_spec.py b/tests/regtest_spec.py
index 0e7de91e..973913ce 100755
--- a/tests/regtest_spec.py
+++ b/tests/regtest_spec.py
@@ -51,6 +51,8 @@
     'fizz-small': (100, 20, int(1e04)),
     'fizz-small-bool': (100, 2, int(1e04)),
     'fizz': (100, 20, int(1e05)),
+    'fizz-large': (100, 50, int(1e05)),
+    'fizz-2large': (100, 50, int(5e05)),
     'fizz-bool': (100, 2, int(1e05)),
     'medium': (1000, 10, int(1e05)),
     'medium2': (1000, 2, int(1e05)),
@@ -74,6 +76,7 @@
     'eps_1_5': (0.225, 0.775, 0.0),  # eps_1 = 5, no eps_inf
     'eps_verysmall': (0.125, 0.875, 0.125),
     'eps_small': (0.125, 0.875, 0.5),
+    'uma_rappor_type': (0.50, 0.75, 0.5),
 }
 
 # For deriving candidates from true inputs.
@@ -99,10 +102,11 @@
 # The test config runs a test suite that is the cross product of all the above
 # sets
 ASSOC_TEST_CONFIG = {
-    'distr': ('fizz-tiny', 'fizz-tiny-bool',
-              'fizz-small', 'fizz-small-bool',),# 'medium'),
-    'blooms': ('8x16',), # '8x32', '16x32'),
-    'privacy': ('eps_small',), # 'eps_small'),
+    'distr': ('fizz-tiny',
+              'fizz-small',
+              'fizz','fizz-large','fizz-2large'),# 'medium'),
+    'blooms': ('8x32',), # '8x32', '16x32'),
+    'privacy': ('eps_small','uma_rappor_type'), # 'eps_small'),
 }
 
 #

From 0ed6ab69e4c5aae18270d172ae8145d25ecffcef Mon Sep 17 00:00:00 2001
From: Ananth Raghunathan <pseudorandom@google.com>
Date: Thu, 11 Jun 2015 14:06:47 -0700
Subject: [PATCH 16/67] Can we replace EM with 2-way marginal computations?

---
 tests/analyze_assoc_expt.R | 211 +++++++++++++++++++++++++++++++
 tests/assoc_sim.R          |   8 +-
 tests/assoc_sim_expt.R     | 250 +++++++++++++++++++++++++++++++++++++
 tests/regtest_spec.py      |   8 +-
 4 files changed, 472 insertions(+), 5 deletions(-)
 create mode 100755 tests/analyze_assoc_expt.R
 create mode 100755 tests/assoc_sim_expt.R

diff --git a/tests/analyze_assoc_expt.R b/tests/analyze_assoc_expt.R
new file mode 100755
index 00000000..10c35341
--- /dev/null
+++ b/tests/analyze_assoc_expt.R
@@ -0,0 +1,211 @@
+#!/usr/bin/env Rscript
+#
+# Copyright 2015 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Reads map files, report files, and RAPPOR parameters to run
+# an EM algorithm to estimate joint distribution over two or more variables
+#
+# Usage:
+#       $ ./analyze_assoc_expt.R --inp <input JSON file>
+#
+# Input file: 
+# Outputs:
+
+library("jsonlite")
+library("optparse")
+
+options(stringsAsFactors = FALSE)
+
+if(!interactive()) {
+  option_list <- list(
+    make_option(c("--inp"), default = "analyze_inp.json",
+                help = "JSON file with inputs for analyze_assoc_expt"))
+  opts <- parse_args(OptionParser(option_list = option_list))
+}
+
+source("analysis/R/encode.R")
+source("analysis/R/decode.R")
+source("analysis/R/simulation.R")
+source("analysis/R/read_input.R")
+source("analysis/R/association.R")
+
+# This function processes the maps loaded using ReadMapFile
+# Association analysis requires a map object with a map
+# field that has the map split into cohorts and an rmap field
+# that has all the cohorts combined
+# Arguments:
+#       map = map object with cohorts as sparse matrix in
+#             object map$map
+#             This is the expected object from ReadMapFile
+#       params = data field with parameters
+# TODO(pseudorandom): move this functionality to ReadMapFile
+ProcessMap <- function(map, params) {
+  map$rmap <- map$map
+  map$map <- lapply(1:params$m, function(i)
+                          map$rmap[seq(from = ((i - 1) * params$k + 1),
+                                   length.out = params$k),])
+  map
+}
+
+# Function to combine reports
+# Currently assume 2-way marginals
+CombineReports <- function(reports1, reports2) {
+  two_bits <- list(c(0, 0, 0, 1), c(0, 0, 1, 0), c(0, 1, 0, 0), c(1, 0, 0, 0))
+  OuterProd <- function(x, y) {
+    as.vector(outer(x, y,
+                    function(z, t) z + 2 * t))
+  }
+  creports <- mapply(OuterProd, reports1, reports2,
+                     SIMPLIFY = FALSE)
+  # Collapse counts to bit vector according to two_bits
+  lapply(creports,
+         function(x) as.vector(sapply(x, function(z) two_bits[[z+1]])))
+}
+
+# Function to combine maps
+# Using map1-major order for both candidates and bits of the report
+# to be consistent with how CombineReports works
+# Currently assume 2-way marginals
+CombineMaps <- function(map1, map2) {
+  # Retrieve set indices and dimensions
+  rows1 <- which(map1, arr.ind = TRUE)[,1]
+  cols1 <- which(map1, arr.ind = TRUE)[,2]
+  length1 <- dim(map1)[[1]]
+  width1 <- dim(map1)[[2]]
+  rows2 <- which(map2, arr.ind = TRUE)[,1]
+  cols2 <- which(map2, arr.ind = TRUE)[,2]
+  length2 <- dim(map2)[[1]]
+  width2 <- dim(map2)[[2]]
+  
+  map1fn <- function(i, j) {
+    i1 <- seq(1, length2) + (i-1) * length2
+    j1 <- seq(1, width2) + (j-1) * width2
+    indices1 <- expand.grid(i1, j1)
+  }
+  map1indices <- do.call(rbind,
+                         mapply(map1fn, rows1, cols1, SIMPLIFY = FALSE))
+  map1_big <- sparseMatrix(map1indices[,"Var1"],
+                           map1indices[,"var2"],
+                           dims = c(length1 * length2,
+                                    width1 * width2))
+  colnames(map1_big) <- outer(function(x, y) paste(x, y, sep = "x"),
+                              colnames(map1),
+                              colnames(map2))
+}
+
+
+main <- function(opts) {
+  ptm <- proc.time()
+  inp <- fromJSON(opts$inp)
+  params <- ReadParameterFile(inp$params)
+  # ensure sufficient maps as required by number of vars
+  stopifnot(inp$numvars == length(inp$maps))
+  opts_map <- inp$maps
+  map <- lapply(opts_map, function(o)
+                  ProcessMap(ReadMapFile(o, params = params),
+                             params = params))
+  # Reports must be of the format
+  #     cohort no, rappor bitstring 1, rappor bitstring 2, ...
+  reportsObj <- read.csv(inp$reports,
+                         colClasses = c("integer",
+                                        rep("character", inp$numvars)),
+                         header = FALSE)
+
+  # Parsing reportsObj
+  # ComputeDistributionEM allows for different sets of cohorts
+  # for each variable. Here, both sets of cohorts are identical
+  co <- as.list(reportsObj[1])[[1]]
+  cohorts <- rep(list(co), inp$numvars)
+  # Parse reports from reportObj cols 2, 3, ...
+  reports <- lapply(1:inp$numvars, function(x) as.list(reportsObj[x + 1]))
+
+  # Split strings into bit arrays (as required by assoc analysis)
+  reports <- lapply(1:inp$numvars, function(i) {
+    # apply the following function to each of reports[[1]] and reports[[2]]
+    lapply(reports[[i]][[1]], function(x) {
+      # function splits strings and converts them to numeric values
+      as.numeric(strsplit(x, split = "")[[1]])
+    })
+  })
+
+  creports <- CombineReports(reports[[1]], reports[[2]])
+  params2 <- params
+  params2$k <- (params$k ** 2) * 4
+  CombineMaps(map[[1]]$map[[1]], map[[2]]$map[[1]])
+  cmap <- mapply(CombineMaps, map[[1]]$map, map[[2]]$map)
+  counts <- ComputeCounts(creports, cohorts[[1]], params2)
+  
+  
+  return
+  joint_dist <- ComputeDistributionEM(reports, cohorts, map,
+                                      ignore_other = TRUE,
+                                      quick = TRUE,
+                                      params, marginals = NULL,
+                                      estimate_var = FALSE,
+                                      new_alg = inp$newalg)
+
+  
+  td <- read.csv(file = inp$truefile)
+  ed <- joint_dist$orig$fit
+  if(length(reports) == 3) {
+    ed <- as.data.frame(ed) 
+  }
+  
+  # We can see if chi-squared tests show different results
+  # for estimated vs real distribution
+  print("CHI-SQUARED")
+  td_chisq <- chisq.test(td)
+  ed_chisq <- chisq.test(ed)
+  print(td_chisq)
+  print(ed_chisq)
+  print(l1d(td, ed, "L1 DISTANCE"))
+  l1d_metric <- l1d(td, ed, "")
+  print("JOINT_DIST$FIT")
+  print(signif(ed[order(rowSums(ed)),], 4))
+  td_metric <- td_chisq[1][[1]][[1]]
+  ed_metric <- ed_chisq[1][[1]][[1]]
+  
+  print("PROC.TIME")
+  time_taken <- proc.time() - ptm
+  print(time_taken)
+  
+  metrics <- list(td_chisq = td_metric,
+                  ed_chisq = ed_metric,
+                  tv = l1d_metric/2,
+                  time = time_taken[1],
+                  dim1 = dim(ed)[[2]],
+                  dim2 = dim(ed)[[1]])
+  
+  # Write metrics to metrics.csv
+  # Report l1 distance / 2 to be consistent with histogram analysis
+  filename <- file.path(inp$outdir, 'metrics.csv')
+  write.csv(metrics, file = filename, row.names = FALSE)
+}
+
+# L1 distance = 1 - sum(min(df1|x, df2|x)) where
+# df1|x / df2|x projects the distribution to the intersection x of the
+# supports of df1 and df2
+l1d <- function(df1, df2, statement = "L1 DISTANCE") {
+  rowsi <- intersect(rownames(df1), rownames(df2))
+  colsi <- intersect(colnames(df1), colnames(df2))
+  print(statement)
+  1 - sum(mapply(min, 
+                 unlist(as.data.frame(df1)[rowsi, colsi], use.names = FALSE),
+                 unlist(as.data.frame(df2)[rowsi, colsi], use.names = FALSE)))
+}
+
+if(!interactive()) {
+  main(opts)
+}
diff --git a/tests/assoc_sim.R b/tests/assoc_sim.R
index a4e82c6d..c1166bc1 100755
--- a/tests/assoc_sim.R
+++ b/tests/assoc_sim.R
@@ -170,7 +170,12 @@ SimulateReports <- function(N, uvals, params, distr, extras, truefile,
     }
   }
 
-  tmp_samples <- list(v1_samples, v2_samples, v3_samples)
+  if(distr == "zipf2") {
+    tmp_samples <- list(v1_samples, v2_samples)
+  } else if(distr == "zipf3") {
+    tmp_samples <- list(v1_samples, v2_samples, v3_samples)
+  }
+
 
   # Function to pad strings to uval_vec if sample_vec has
   # larger support than the number of strings in uval_vec
@@ -193,6 +198,7 @@ SimulateReports <- function(N, uvals, params, distr, extras, truefile,
   # Pad and update uvals
   uvals <- lapply(1:2, function(i) PadStrings(tmp_samples[[i]],
                                               uvals[[i]]))
+  
   uvals[[3]] <- c("true", "false")
   # Replace integers in tmp_samples with actual sample strings
   samples <- lapply(1:3, function(i) uvals[[i]][tmp_samples[[i]]])
diff --git a/tests/assoc_sim_expt.R b/tests/assoc_sim_expt.R
new file mode 100755
index 00000000..59ce1356
--- /dev/null
+++ b/tests/assoc_sim_expt.R
@@ -0,0 +1,250 @@
+#!/usr/bin/env Rscript
+#
+# Copyright 2015 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Simulates inputs on which association analysis can be run.
+# Currently assoc_sim.R only supports 2 variables but can
+# be easily extended to support more.
+#
+# Usage:
+#       $ ./assoc_sim_expt.R --inp sim_inp.json
+# Inputs: uvals, params, reports, map, num, unif
+#         see how options are parsed below for more information
+# Outputs:
+#         reports.csv file containing reports
+#         map_{1, 2, ...}.csv file(s) containing maps of variables
+
+library("jsonlite")
+library("optparse")
+
+options(stringsAsFactors = FALSE)
+
+if(!interactive()) {
+  option_list <- list(
+    make_option(c("--inp"), default = "assoc_inp.json",
+                help = "JSON file with inputs for assoc_sim_expt"))
+  opts <- parse_args(OptionParser(option_list = option_list))
+  inp <- fromJSON(opts$inp)
+}
+
+apply_prefix <- function(path) {
+  paste(inp$prefix, path, sep = "")
+}
+
+source("analysis/R/encode.R")
+source("analysis/R/decode.R")
+source("analysis/R/simulation.R")
+source("analysis/R/read_input.R")
+source("analysis/R/association.R")
+source("tests/gen_counts.R")
+
+# Read unique values of reports from a csv file
+# Inputs: filename. The file is expected to contain two rows of strings
+#         (one for each variable):
+#         "google.com", "apple.com", ...
+#         "ssl", "nossl", ...
+# Returns: a list containing strings
+GetUniqueValsFromFile <- function(filename) {
+  contents <- read.csv(filename, header = FALSE)
+  # Expect 2 rows of unique vals
+  if(nrow(contents) != 2) {
+    stop(paste("Unique vals file", filename, "expected to have
+               two rows of strings."))
+  }
+  # Removes superfluous "" entries if the lists of unique values
+  # differ in length
+  strip_empty <- function(vec) {
+    vec[!vec %in% c("")]
+  }
+  list(var1 = strip_empty(as.vector(t(contents[1,]))),
+       var2 = strip_empty(as.vector(t(contents[2,]))))
+}
+
+# Simulate correlated reports and write into reportsfile
+# Inputs: N = number of reports
+#         uvals = list containing a list of unique values
+#         params = list with RAPPOR parameters
+#         distr = the type of distribution to use
+#                 {unif, poisson, poisson2, zipfg}
+#         extras = whether map_1.csv has spurious candidates or not
+#         truefile = name of the file with true distribution
+#         varcandidates = list of number of candidates for each var
+#         *** FOR ASSOCTEST TEST SUITE, USE ONLY ZIPF2 / ZIPF3 ***
+#         mapfile = file to write maps into (with .csv suffixes)
+#         reportsfile = file to write reports into (with .csv suffix)
+SimulateReports <- function(N, uvals, params, distr, extras, truefile,
+                            varcandidates,
+                            mapfile, reportsfile) {
+  # Compute true distribution
+  m <- params$m
+
+  if (distr == "unif") {
+    # Draw uniformly from 1 to 10
+    v1_samples <- as.integer(runif(N, 1, 10))
+
+    # Pr[var2 = N + 1 | var1 = N] = 0.5
+    # Pr[var2 = N     | var1 = N] = 0.5
+    v2_samples <- v1_samples + sample(c(0, 1), N, replace = TRUE)
+
+  } else if(distr == "poisson") {
+    # Draw from a Poisson random variable
+    v1_samples <- rpois(N, 1) + rep(1, N)
+
+    # Pr[var2 = N + 1 | var1 = N] = 0.5
+    # Pr[var2 = N     | var1 = N] = 0.5
+    v2_samples <- v1_samples + sample(c(0, 1), N, replace = TRUE)
+  } else if (distr == "poisson2") {
+
+    v1_samples <- rpois(N, 1) + rep(1, N)
+    # supp(var2) = {1, 2}
+    # Pr[var2 = 1 | var1 = even] = 0.75
+    # Pr[var2 = 1 | var1 = odd]  = 0.25
+    pr25 <- rbinom(N, 1, 0.25) + 1
+    pr75 <- rbinom(N, 1, 0.75) + 1
+    v2_samples <- rep(1, N)
+    v2_samples[v1_samples %% 2 == 0] <- pr25[v1_samples %% 2 == 0]
+    v2_samples[v1_samples %% 2 == 1] <- pr75[v1_samples %% 2 == 1]
+  } else if (distr == "zipf2" || distr == "zipf3") {
+
+    var1_num <- varcandidates[[1]]
+    var2_num <- varcandidates[[2]]
+    
+    # Zipfian over var1_num strings
+    partition <- RandomPartition(N, ComputePdf("zipf1.5", var1_num))
+    v1_samples <- rep(1:var1_num, partition)  # expand partition
+    # Shuffle values randomly (may take a few sec for > 10^8 inputs)
+    v1_samples <- sample(v1_samples)
+
+    # supp(var2) = {1, 2, 3, ..., var2_num}
+    # We look at two zipfian distributions over supp(var2)
+    # D1 = zipfian distribution
+    # D2 = zipfian distr over {var2_num, ..., 4, 3, 2, 1}
+    # (i.e., D1 in reverse)
+    # var2 ~ D1 if var1 = even
+    # var2 ~ D2 if var1 = odd
+    d1 <- sample(rep(1:var2_num,
+                     RandomPartition(N, ComputePdf("zipf1.5", var2_num))))
+    d2 <- (var2_num:1)[d1]
+    v2_samples <- rep(1, N)
+    v3_samples <- rep(1, N)
+    v2_samples[v1_samples %% 2 == 0] <- d1[v1_samples %% 2 == 0]
+    v2_samples[v1_samples %% 2 == 1] <- d2[v1_samples %% 2 == 1]
+    if(distr == "zipf3") {
+      bool1 <- rbinom(N, 1, 0.25) + rep(1, N)
+      bool2 <- rbinom(N, 1, 0.75) + rep(1, N)
+      v3_samples[v1_samples %% 2 == 0] <- bool1[v1_samples %% 2 == 0]
+      v3_samples[v1_samples %% 2 == 1] <- bool2[v1_samples %% 2 == 1]
+    }
+  }
+
+  if(length(varcandidates) == 3) {
+    tmp_samples <- list(v1_samples, v2_samples, v3_samples)
+  } else if (length(varcandidates) == 2) {
+    tmp_samples <- list(v1_samples, v2_samples)
+  }
+
+  # Function to pad strings to uval_vec if sample_vec has
+  # larger support than the number of strings in uval_vec
+  # For e.g., if samples have support {1, 2, 3, 4, ...} and uvals
+  # only have "value1", "value2", and "value3", samples now
+  # over support {"value1", "value2", "value3", "str4", ...}
+  PadStrings <- function(sample_vec, uval_vec) {
+    if (max(sample_vec) > length(uval_vec)) {
+      # Padding uvals to required length
+      len <- length(uval_vec)
+      max_of_samples <- max(sample_vec)
+      uval_vec[(len + 1):max_of_samples] <- apply(
+        as.matrix((len + 1):max_of_samples),
+        1,
+        function(i) sprintf("str%d", i))
+    }
+    uval_vec
+  }
+
+  # Pad and update uvals
+  uvals <- lapply(1:length(varcandidates),
+                  function(i) PadStrings(tmp_samples[[i]],
+                                              uvals[[i]]))
+  # Replace integers in tmp_samples with actual sample strings
+  samples <- lapply(1:length(varcandidates),
+                    function(i) uvals[[i]][tmp_samples[[i]]])
+
+  print("TRUE DISTR")
+  td <- table(samples)/sum(table(samples))
+  if (length(varcandidates) == 2) {
+    td <- td[order(rowSums(td), decreasing = TRUE),]
+  } else {
+    td <- td[order(rowSums(td), decreasing = TRUE),,]
+  }
+  print(td)
+  write.table(td, file = truefile, sep = ",", col.names = TRUE,
+              row.names = TRUE, quote = FALSE)
+  # Randomly assign cohorts in each dimension
+  cohorts <- sample(1:m, N, replace = TRUE)
+
+  # Create and write map into mapfile_1.csv and mapfile_2.csv
+  if (extras > 0) {
+    # spurious candidates for mapfile_1.csv
+    len <- length(uvals[[1]]) + as.numeric(extras)
+    uvals[[1]] <- PadStrings(len, uvals[[1]])
+  }
+  map <- lapply(uvals, function(u) CreateMap(u, params))
+  write.table(map[[1]]$map_pos, file = paste(mapfile, "_1.csv", sep = ""),
+                sep = ",", col.names = FALSE, na = "", quote = FALSE)
+  write.table(map[[2]]$map_pos, file = paste(mapfile, "_2.csv", sep = ""),
+              sep = ",", col.names = FALSE, na = "", quote = FALSE)
+  if(length(varcandidates) == 3) {
+    write.table(map[[3]]$map_pos, file = paste(mapfile, "_3.csv", sep = ""),
+              sep = ",", col.names = FALSE, na = "", quote = FALSE)
+  }
+
+  # Write reports into a csv file
+  # Format:
+  #     cohort, bloom filter var1, bloom filter var2
+  reports <- lapply(1:length(varcandidates), function(i)
+    EncodeAll(samples[[i]], cohorts, map[[i]]$map, params))
+  # Organize cohorts and reports into format
+  write_matrix <- cbind(as.matrix(cohorts),
+                        sapply(reports,
+                               function(x) as.matrix(lapply(x,
+                                                            function(z) paste(z, collapse = "")))))
+  write.table(write_matrix, file = reportsfile, quote = FALSE,
+              row.names = FALSE, col.names = FALSE, sep = ",")
+}
+
+main <- function(inp) {
+  ptm <- proc.time()
+  
+  if(is.null(inp$uvals)) {
+    # One off case.
+    # TODO(pseudorandom): More sensible defaults.
+    uvals = list(var1 = c("str1", "str2"), var2 = c("option1", "option2", "option3"))
+  } else {
+    uvals <- GetUniqueValsFromFile(apply_prefix(inp$uvals))
+  }
+  params <- ReadParameterFile(apply_prefix(inp$params))
+  SimulateReports(inp$num, uvals, params,  inp$distr,   # inuts
+                  inp$extras,  apply_prefix(inp$true),  # inputs
+                  inp$varcandidates,          # inputs
+                  apply_prefix(inp$map),
+                  apply_prefix(inp$reports))             # outputs
+
+  print("PROC.TIME")
+  print(proc.time() - ptm)
+}
+
+if(!interactive()) {
+  main(inp)
+}
diff --git a/tests/regtest_spec.py b/tests/regtest_spec.py
index 973913ce..f21ba367 100755
--- a/tests/regtest_spec.py
+++ b/tests/regtest_spec.py
@@ -102,11 +102,11 @@
 # The test config runs a test suite that is the cross product of all the above
 # sets
 ASSOC_TEST_CONFIG = {
-    'distr': ('fizz-tiny',
-              'fizz-small',
-              'fizz','fizz-large','fizz-2large'),# 'medium'),
+    'distr': (#'fizz-tiny',
+              #'fizz-small',
+              'fizz',),#'fizz-large','fizz-2large'),# 'medium'),
     'blooms': ('8x32',), # '8x32', '16x32'),
-    'privacy': ('eps_small','uma_rappor_type'), # 'eps_small'),
+    'privacy': ('eps_small',),#'uma_rappor_type'), # 'eps_small'),
 }
 
 #

From d37dcf0eda4bdc4487577daef09b34b27f5cd18b Mon Sep 17 00:00:00 2001
From: Ananth Raghunathan <pseudorandom@google.com>
Date: Thu, 11 Jun 2015 23:21:08 -0700
Subject: [PATCH 17/67] Combining maps.

---
 tests/analyze_assoc_expt.R | 54 +++++++++++++++++++++++++++++++++-----
 1 file changed, 47 insertions(+), 7 deletions(-)

diff --git a/tests/analyze_assoc_expt.R b/tests/analyze_assoc_expt.R
index 10c35341..e06f2fa5 100755
--- a/tests/analyze_assoc_expt.R
+++ b/tests/analyze_assoc_expt.R
@@ -89,20 +89,60 @@ CombineMaps <- function(map1, map2) {
   length2 <- dim(map2)[[1]]
   width2 <- dim(map2)[[2]]
   
+  # Now process map1
   map1fn <- function(i, j) {
-    i1 <- seq(1, length2) + (i-1) * length2
-    j1 <- seq(1, width2) + (j-1) * width2
-    indices1 <- expand.grid(i1, j1)
+    i1 <- seq(1, length2) + ((i-1) * length2)
+    j1 <- seq(1, width2) + ((j-1) * width2)
+    expand.grid(i1, j1)  
   }
   map1indices <- do.call(rbind,
                          mapply(map1fn, rows1, cols1, SIMPLIFY = FALSE))
   map1_big <- sparseMatrix(map1indices[,"Var1"],
-                           map1indices[,"var2"],
+                           map1indices[,"Var2"],
                            dims = c(length1 * length2,
                                     width1 * width2))
-  colnames(map1_big) <- outer(function(x, y) paste(x, y, sep = "x"),
-                              colnames(map1),
-                              colnames(map2))
+  colnames(map1_big) <- t(outer(colnames(map1),
+                              colnames(map2),
+                              function(x, y) paste(x, y, sep = "x")))
+  
+  # Now process map2
+  map2fn <- function(i, j) {
+    i2 <- i + (seq(0, length1 - 1) * length2)
+    j2 <- j + (seq(0, width1 - 1) * width2)
+    expand.grid(i2, j2)
+  }
+  map2indices <- do.call(rbind,
+                         mapply(map2fn, rows2, cols2, SIMPLIFY = FALSE))
+  map2_big <- sparseMatrix(map2indices[,"Var1"],
+                           map2indices[,"Var2"],
+                           dims = c(length1 * length2,
+                                    width1 * width2))
+  colnames(map2_big) <- t(outer(colnames(map1),
+                              colnames(map2),
+                              function(x, y) paste(x, y, sep = "x")))
+  
+  # Now collate two maps with entries in (1000, 0100, 0010, 0001)
+  # (m1&m2, !m1 & m2, m1 & !m2, !(m1 & m2)) respectively
+  findices <- which(map1_big & map2_big, arr.ind = TRUE)
+  # 1000
+  findices[, 1] <- findices[, 1] * 4 - 3
+  # 0100
+  indices_0100 <- which((!map1_big) & map2_big, arr.ind = TRUE)
+  indices_0100[, 1] <- indices_0100[, 1] * 4 - 2
+  findices <- rbind(findices, indices_0100)
+  # 0010
+  indices_0010 <- which(map1_big & (!map2_big), arr.ind = TRUE)
+  indices_0010[, 1] <- indices_0010[, 1] * 4 - 1
+  findices <- rbind(findices, indices_0010)
+  # 0001
+  indices_0001 <- which(!(map1_big & map2_big), arr.ind = TRUE)
+  indices_0001[, 1] <- indices_0001[, 1] * 4
+  findices <- rbind(findices, indices_0001)
+  sm <- sparseMatrix(findices[, 1], findices[, 2],
+                     dims = c(4 * length1 * length2,
+                        width1 * width2))
+  colnames(sm) <- colnames(map1_big)
+  sm
 }
 
 

From 1a983db150f591825a117457db422eadc00aae9a Mon Sep 17 00:00:00 2001
From: Ananth Raghunathan <pseudorandom@google.com>
Date: Fri, 12 Jun 2015 18:16:16 -0700
Subject: [PATCH 18/67] More Decode code to support 2-way marginals.

---
 analysis/R/decode.R        | 61 ++++++++++++++++++++++++++++++++++++++
 tests/analyze_assoc_expt.R |  4 +--
 2 files changed, 63 insertions(+), 2 deletions(-)

diff --git a/analysis/R/decode.R b/analysis/R/decode.R
index 4fae9d86..e75385be 100644
--- a/analysis/R/decode.R
+++ b/analysis/R/decode.R
@@ -19,6 +19,67 @@ library(glmnet)
 
 source('analysis/R/alternative.R')
 
+Estimate2WayBloomCounts <- function(params, obs_counts) {
+  p <- params$p
+  q <- params$q
+  f <- params$f
+  m <- params$m
+  
+  stopifnot(m == nrow(obs_counts), params$k + 1 == ncol(obs_counts))
+  
+  p11 <- q * (1 - f/2) + p * f / 2  # probability of a true 1 reported as 1
+  p01 <- p * (1 - f/2) + q * f / 2  # probability of a true 0 reported as 1
+  p10 <- 1 - p11  # probability of a true 1 reported as 0
+  p00 <- 1 - p01  # probability of a true 0 reported as 0
+  p2 <- p11 - p01  # == (1 - f) * (q - p)
+  
+  ests <- apply(obs_counts, 1, function(x) {
+    N <- x[1]  # sample size of cohort
+    inds <- seq(0, m/4 - 1)
+    v <- x[-1]  # counts for individual bits
+    # 11 or (1000) estimates
+    v[inds*4 + 2] <- 
+      (v[inds*4 + 2] - (p11**2)*N) / (2*p01*p11 + p01**2 - p11**2)
+    
+    # 10 or (0100) estimates
+    v[inds*4 + 3] <-
+      (v[inds*4 + 3] - (p11*p00)*N) / (p10*p11 + p01*p10 + p01*p00 - p11*p00)
+    
+    # 01 or (0010) estimates
+    v[inds*4 + 4] <-
+      (v[inds*4 + 4] - (p11*p00)*N) / (p10*p11 + p01*p10 + p01*p00 - p11*p00)
+    
+    # 00 or (0001) estimates
+    v[inds*4 + 5] <-
+      (v[inds*4 + 5] - (p11**2)*N) / (2*p10*p00 + p10**2 - p00**2)
+    v
+  })
+  
+  if(FALSE) {
+    # TODO(pseudorandom): Compute variances
+    variances <- apply(obs_counts, 1, function(x) {
+      N <- x[1]
+      v <- x[-1]
+      p_hats <- (v - p01 * N) / (N * p2)  # expectation of a true 1
+      p_hats <- pmax(0, pmin(1, p_hats))  # clamp to [0,1]
+      r <- p_hats * p11 + (1 - p_hats) * p01  # expectation of a reported 1
+      N * r * (1 - r) / p2^2  # variance of the binomial
+    })
+  }
+  
+  # Transform counts from absolute values to fractional, removing bias due to
+  #      variability of reporting between cohorts.
+  ests <- apply(ests, 1, function(x) x / obs_counts[,1])
+  # stds <- apply(variances^.5, 1, function(x) x / obs_counts[,1])
+  
+  # Some estimates may be set to infinity, e.g. if f=1. We want to
+  #     account for this possibility, and set the corresponding counts
+  #     to 0.
+  ests[abs(ests) == Inf] <- 0
+  
+  list(estimates = ests, stds = ests)
+}
+
 EstimateBloomCounts <- function(params, obs_counts) {
   # Estimates the number of times each bit in each cohort was set in original
   # Bloom filters.
diff --git a/tests/analyze_assoc_expt.R b/tests/analyze_assoc_expt.R
index e06f2fa5..430e0b24 100755
--- a/tests/analyze_assoc_expt.R
+++ b/tests/analyze_assoc_expt.R
@@ -135,7 +135,7 @@ CombineMaps <- function(map1, map2) {
   indices_0010[, 1] <- indices_0010[, 1] * 4 - 1
   findices <- rbind(findices, indices_0010)
   # 0001
-  indices_0001 <- which(!(map1_big & map2_big), arr.ind = TRUE)
+  indices_0001 <- which((!map1_big) & (!map2_big), arr.ind = TRUE)
   indices_0001[, 1] <- indices_0001[, 1] * 4
   findices <- rbind(findices, indices_0001)
   sm <- sparseMatrix(findices[, 1], findices[, 2],
@@ -186,7 +186,7 @@ main <- function(opts) {
   CombineMaps(map[[1]]$map[[1]], map[[2]]$map[[1]])
   cmap <- mapply(CombineMaps, map[[1]]$map, map[[2]]$map)
   counts <- ComputeCounts(creports, cohorts[[1]], params2)
-  
+  ests <- Estimate2WayBloomCounts(params2, counts)
   
   return
   joint_dist <- ComputeDistributionEM(reports, cohorts, map,

From 669c500391869b72ecd083408f1ade18e5d7de48 Mon Sep 17 00:00:00 2001
From: Ananth Raghunathan <pseudorandom@google.com>
Date: Mon, 15 Jun 2015 18:29:03 -0700
Subject: [PATCH 19/67] Replacing EM with two-way marginals.

---
 analysis/R/decode.R        |  69 ++++++++++++++++--------
 tests/analyze_assoc_expt.R | 105 +++++++++++++++++++++----------------
 tests/assoc_sim_expt.R     |   2 +-
 3 files changed, 108 insertions(+), 68 deletions(-)

diff --git a/analysis/R/decode.R b/analysis/R/decode.R
index e75385be..07d3c815 100644
--- a/analysis/R/decode.R
+++ b/analysis/R/decode.R
@@ -24,6 +24,7 @@ Estimate2WayBloomCounts <- function(params, obs_counts) {
   q <- params$q
   f <- params$f
   m <- params$m
+  k <- params$k
   
   stopifnot(m == nrow(obs_counts), params$k + 1 == ncol(obs_counts))
   
@@ -31,28 +32,20 @@ Estimate2WayBloomCounts <- function(params, obs_counts) {
   p01 <- p * (1 - f/2) + q * f / 2  # probability of a true 0 reported as 1
   p10 <- 1 - p11  # probability of a true 1 reported as 0
   p00 <- 1 - p01  # probability of a true 0 reported as 0
-  p2 <- p11 - p01  # == (1 - f) * (q - p)
+  
+  NoiseMatrix <- matrix(rep(0, 16), 4)
+  NoiseMatrix[1,] <- c(p11**2, p11*p10, p10*p11, p10**2)
+  NoiseMatrix[2,] <- c(p11*p01, p11*p00, p10*p01, p10*p00)
+  NoiseMatrix[3,] <- c(p01*p11, p01*p10, p00*p11, p00*p01)
+  NoiseMatrix[4,] <- c(p01**2, p00*p01, p01*p00, p00**2)
   
   ests <- apply(obs_counts, 1, function(x) {
-    N <- x[1]  # sample size of cohort
-    inds <- seq(0, m/4 - 1)
-    v <- x[-1]  # counts for individual bits
-    # 11 or (1000) estimates
-    v[inds*4 + 2] <- 
-      (v[inds*4 + 2] - (p11**2)*N) / (2*p01*p11 + p01**2 - p11**2)
-    
-    # 10 or (0100) estimates
-    v[inds*4 + 3] <-
-      (v[inds*4 + 3] - (p11*p00)*N) / (p10*p11 + p01*p10 + p01*p00 - p11*p00)
-    
-    # 01 or (0010) estimates
-    v[inds*4 + 4] <-
-      (v[inds*4 + 4] - (p11*p00)*N) / (p10*p11 + p01*p10 + p01*p00 - p11*p00)
-    
-    # 00 or (0001) estimates
-    v[inds*4 + 5] <-
-      (v[inds*4 + 5] - (p11**2)*N) / (2*p10*p00 + p10**2 - p00**2)
-    v
+    N <- x[1]
+    inds <- seq(0, (k/4)-1)
+    v <- x[-1]
+    sapply(inds, function(i){
+      as.vector(t(Solve(NoiseMatrix)) %*% v[(i*4 + 1):((i+1)*4)])
+    })
   })
   
   if(FALSE) {
@@ -76,8 +69,9 @@ Estimate2WayBloomCounts <- function(params, obs_counts) {
   #     account for this possibility, and set the corresponding counts
   #     to 0.
   ests[abs(ests) == Inf] <- 0
-  
-  list(estimates = ests, stds = ests)
+    
+  list(estimates = ests,
+       stds = matrix(rep(1, 2 * length(ests[1,])), 2))
 }
 
 EstimateBloomCounts <- function(params, obs_counts) {
@@ -315,6 +309,37 @@ Resample <- function(e) {
   result
 }
 
+Decode2Way <- function(counts, map, params) {
+  k <- params$k
+  p <- params$p
+  q <- params$q
+  f <- params$f
+  h <- params$h
+  m <- params$m
+  
+  S <- ncol(map)  # total number of candidates
+  
+  N <- sum(counts[, 1])
+  
+  filter_cohorts <- which(counts[, 1] != 0)  # exclude cohorts with zero reports
+  
+  # stretch cohorts to bits
+  filter_bits <- as.vector(
+    t(matrix(1:nrow(map), nrow = m, byrow = TRUE)[filter_cohorts,]))
+  
+  es <- Estimate2WayBloomCounts(params, counts)
+  e <- list(estimates = es$estimates[filter_cohorts, , drop = FALSE],
+            stds = es$stds[filter_cohorts, , drop = FALSE])
+  coefs <- FitDistribution(e, map[filter_bits, , drop = FALSE])
+  mod <- list(coefs = coefs, stds = coefs)
+  inf <- PerformInference(map[filter_bits, , drop = FALSE],
+                          as.vector(t(es$estimates)),
+                          N, mod, params, alpha = (0.05/S),
+                          correction = "Bonferroni")
+  fit <- inf$fit
+  list(fit = fit)
+}
+
 Decode <- function(counts, map, params, quick = FALSE, alpha = 0.05,
                    correction = c("Bonferroni"), ...) {
   k <- params$k
diff --git a/tests/analyze_assoc_expt.R b/tests/analyze_assoc_expt.R
index 430e0b24..f50afb2a 100755
--- a/tests/analyze_assoc_expt.R
+++ b/tests/analyze_assoc_expt.R
@@ -62,12 +62,14 @@ ProcessMap <- function(map, params) {
 # Function to combine reports
 # Currently assume 2-way marginals
 CombineReports <- function(reports1, reports2) {
-  two_bits <- list(c(0, 0, 0, 1), c(0, 0, 1, 0), c(0, 1, 0, 0), c(1, 0, 0, 0))
+  # Encoding (var1, var2) \in {(0, 0), (0, 1), (1, 0), (1, 1)}
+  two_bits <- list(c(0, 0, 0, 1), c(0, 1, 0, 0), c(0, 0, 1, 0), c(1, 0, 0, 0))
   OuterProd <- function(x, y) {
     as.vector(outer(x, y,
                     function(z, t) z + 2 * t))
   }
-  creports <- mapply(OuterProd, reports1, reports2,
+  # "report1-major" order
+  creports <- mapply(OuterProd, reports2, reports1,
                      SIMPLIFY = FALSE)
   # Collapse counts to bit vector according to two_bits
   lapply(creports,
@@ -183,55 +185,68 @@ main <- function(opts) {
   creports <- CombineReports(reports[[1]], reports[[2]])
   params2 <- params
   params2$k <- (params$k ** 2) * 4
-  CombineMaps(map[[1]]$map[[1]], map[[2]]$map[[1]])
+  # CombineMaps(map[[1]]$map[[1]], map[[2]]$map[[1]])
   cmap <- mapply(CombineMaps, map[[1]]$map, map[[2]]$map)
+  # Combine cohorts into one map. Needed for Decode2Way
+  inds <- lapply(cmap, function(x) which(x, arr.ind = TRUE))
+  inds[[2]][, 1] <- inds[[2]][, 1] + length(inds[[1]][, 1])
+  inds <- rbind(inds[[1]], inds[[2]])
+  crmap <- sparseMatrix(inds[, 1], inds[, 2], dims = c(
+                                                nrow(cmap[[1]]) + nrow(cmap[[2]]),
+                                                ncol(cmap[[1]])))
+  colnames(crmap) <- colnames(cmap[[1]])
   counts <- ComputeCounts(creports, cohorts[[1]], params2)
-  ests <- Estimate2WayBloomCounts(params2, counts)
+  marginal <- Decode2Way(counts, crmap, params2)$fit
+  print(marginal)
   
-  return
-  joint_dist <- ComputeDistributionEM(reports, cohorts, map,
-                                      ignore_other = TRUE,
-                                      quick = TRUE,
-                                      params, marginals = NULL,
-                                      estimate_var = FALSE,
-                                      new_alg = inp$newalg)
-
-  
-  td <- read.csv(file = inp$truefile)
-  ed <- joint_dist$orig$fit
-  if(length(reports) == 3) {
-    ed <- as.data.frame(ed) 
+  if (FALSE) {
+    joint_dist <- ComputeDistributionEM(reports, cohorts, map,
+                                        ignore_other = TRUE,
+                                        quick = TRUE,
+                                        params, marginals = NULL,
+                                        estimate_var = FALSE,
+                                        new_alg = inp$newalg)
   }
   
-  # We can see if chi-squared tests show different results
-  # for estimated vs real distribution
-  print("CHI-SQUARED")
-  td_chisq <- chisq.test(td)
-  ed_chisq <- chisq.test(ed)
-  print(td_chisq)
-  print(ed_chisq)
-  print(l1d(td, ed, "L1 DISTANCE"))
-  l1d_metric <- l1d(td, ed, "")
-  print("JOINT_DIST$FIT")
-  print(signif(ed[order(rowSums(ed)),], 4))
-  td_metric <- td_chisq[1][[1]][[1]]
-  ed_metric <- ed_chisq[1][[1]][[1]]
-  
-  print("PROC.TIME")
-  time_taken <- proc.time() - ptm
-  print(time_taken)
-  
-  metrics <- list(td_chisq = td_metric,
-                  ed_chisq = ed_metric,
-                  tv = l1d_metric/2,
-                  time = time_taken[1],
-                  dim1 = dim(ed)[[2]],
-                  dim2 = dim(ed)[[1]])
+  td <- read.csv(file = inp$truefile)
+  print(td)
   
-  # Write metrics to metrics.csv
-  # Report l1 distance / 2 to be consistent with histogram analysis
-  filename <- file.path(inp$outdir, 'metrics.csv')
-  write.csv(metrics, file = filename, row.names = FALSE)
+  if(FALSE) {
+    ed <- joint_dist$orig$fit
+    if(length(reports) == 3) {
+      ed <- as.data.frame(ed) 
+    }
+    
+    # We can see if chi-squared tests show different results
+    # for estimated vs real distribution
+    print("CHI-SQUARED")
+    td_chisq <- chisq.test(td)
+    ed_chisq <- chisq.test(ed)
+    print(td_chisq)
+    print(ed_chisq)
+    print(l1d(td, ed, "L1 DISTANCE"))
+    l1d_metric <- l1d(td, ed, "")
+    print("JOINT_DIST$FIT")
+    print(signif(ed[order(rowSums(ed)),], 4))
+    td_metric <- td_chisq[1][[1]][[1]]
+    ed_metric <- ed_chisq[1][[1]][[1]]
+    
+    print("PROC.TIME")
+    time_taken <- proc.time() - ptm
+    print(time_taken)
+    
+    metrics <- list(td_chisq = td_metric,
+                    ed_chisq = ed_metric,
+                    tv = l1d_metric/2,
+                    time = time_taken[1],
+                    dim1 = dim(ed)[[2]],
+                    dim2 = dim(ed)[[1]])
+    
+    # Write metrics to metrics.csv
+    # Report l1 distance / 2 to be consistent with histogram analysis
+    filename <- file.path(inp$outdir, 'metrics.csv')
+    write.csv(metrics, file = filename, row.names = FALSE)
+  }
 }
 
 # L1 distance = 1 - sum(min(df1|x, df2|x)) where
diff --git a/tests/assoc_sim_expt.R b/tests/assoc_sim_expt.R
index 59ce1356..5d3438ef 100755
--- a/tests/assoc_sim_expt.R
+++ b/tests/assoc_sim_expt.R
@@ -230,7 +230,7 @@ main <- function(inp) {
   if(is.null(inp$uvals)) {
     # One off case.
     # TODO(pseudorandom): More sensible defaults.
-    uvals = list(var1 = c("str1", "str2"), var2 = c("option1", "option2", "option3"))
+    uvals = list(var1 = c("str1", "str2"), var2 = c("option1", "option2"))
   } else {
     uvals <- GetUniqueValsFromFile(apply_prefix(inp$uvals))
   }

From 598abc2d20c352c79cb54c51627c883b9e275618 Mon Sep 17 00:00:00 2001
From: Ananth Raghunathan <pseudorandom@google.com>
Date: Tue, 16 Jun 2015 10:42:01 -0700
Subject: [PATCH 20/67] Working on 2-way marginal code.

---
 analysis/R/decode.R        |  2 +-
 tests/analyze_assoc_expt.R | 80 +++++++++++++++++++-------------------
 2 files changed, 42 insertions(+), 40 deletions(-)

diff --git a/analysis/R/decode.R b/analysis/R/decode.R
index 07d3c815..b68db0e4 100644
--- a/analysis/R/decode.R
+++ b/analysis/R/decode.R
@@ -71,7 +71,7 @@ Estimate2WayBloomCounts <- function(params, obs_counts) {
   ests[abs(ests) == Inf] <- 0
     
   list(estimates = ests,
-       stds = matrix(rep(1, 2 * length(ests[1,])), 2))
+       stds = matrix(rep(5, 2 * length(ests[1,])), 2))
 }
 
 EstimateBloomCounts <- function(params, obs_counts) {
diff --git a/tests/analyze_assoc_expt.R b/tests/analyze_assoc_expt.R
index f50afb2a..1aa01f03 100755
--- a/tests/analyze_assoc_expt.R
+++ b/tests/analyze_assoc_expt.R
@@ -189,7 +189,7 @@ main <- function(opts) {
   cmap <- mapply(CombineMaps, map[[1]]$map, map[[2]]$map)
   # Combine cohorts into one map. Needed for Decode2Way
   inds <- lapply(cmap, function(x) which(x, arr.ind = TRUE))
-  inds[[2]][, 1] <- inds[[2]][, 1] + length(inds[[1]][, 1])
+  inds[[2]][, 1] <- inds[[2]][, 1] + dim(cmap[[1]])[1]
   inds <- rbind(inds[[1]], inds[[2]])
   crmap <- sparseMatrix(inds[, 1], inds[, 2], dims = c(
                                                 nrow(cmap[[1]]) + nrow(cmap[[2]]),
@@ -197,56 +197,58 @@ main <- function(opts) {
   colnames(crmap) <- colnames(cmap[[1]])
   counts <- ComputeCounts(creports, cohorts[[1]], params2)
   marginal <- Decode2Way(counts, crmap, params2)$fit
-  print(marginal)
   
-  if (FALSE) {
+  also_em = FALSE
+  ed_em <- list()
+  if(also_em == TRUE) {
     joint_dist <- ComputeDistributionEM(reports, cohorts, map,
                                         ignore_other = TRUE,
                                         quick = TRUE,
                                         params, marginals = NULL,
                                         estimate_var = FALSE,
                                         new_alg = inp$newalg)
+    ed_em <- joint_dist$orig$fit
+    if(length(reports) == 3) {
+      ed_em <- as.data.frame(ed_em)
+    }
   }
   
   td <- read.csv(file = inp$truefile)
-  print(td)
-  
-  if(FALSE) {
-    ed <- joint_dist$orig$fit
-    if(length(reports) == 3) {
-      ed <- as.data.frame(ed) 
+  ed <- td
+  for (cols in colnames(td)) {
+    for (rows in rownames(td)) {
+      ed[rows, cols] <- marginal[paste(rows, cols, sep = "x"), "Estimate"]
     }
-    
-    # We can see if chi-squared tests show different results
-    # for estimated vs real distribution
-    print("CHI-SQUARED")
-    td_chisq <- chisq.test(td)
-    ed_chisq <- chisq.test(ed)
-    print(td_chisq)
-    print(ed_chisq)
-    print(l1d(td, ed, "L1 DISTANCE"))
-    l1d_metric <- l1d(td, ed, "")
-    print("JOINT_DIST$FIT")
-    print(signif(ed[order(rowSums(ed)),], 4))
-    td_metric <- td_chisq[1][[1]][[1]]
-    ed_metric <- ed_chisq[1][[1]][[1]]
-    
-    print("PROC.TIME")
-    time_taken <- proc.time() - ptm
-    print(time_taken)
-    
-    metrics <- list(td_chisq = td_metric,
-                    ed_chisq = ed_metric,
-                    tv = l1d_metric/2,
-                    time = time_taken[1],
-                    dim1 = dim(ed)[[2]],
-                    dim2 = dim(ed)[[1]])
-    
-    # Write metrics to metrics.csv
-    # Report l1 distance / 2 to be consistent with histogram analysis
-    filename <- file.path(inp$outdir, 'metrics.csv')
-    write.csv(metrics, file = filename, row.names = FALSE)
   }
+  
+  print("PROC.TIME")
+  time_taken <- proc.time() - ptm
+  print(time_taken)
+  
+  print("2 WAY RESULTS")
+  print(signif(ed[order(rowSums(ed)), ], 4))
+  print(l1d(td, ed, "L1 DISTANCE 2 WAY"))
+  metrics <- list(
+    td_chisq = chisq.test(td)[1][[1]][[1]],
+    ed_chisq = chisq.test(ed)[1][[1]][[1]],
+    tv = l1d(td, ed, "")/2,
+    time = time_taken[1],
+    dim1 = dim(ed)[[2]],
+    dim2 = dim(ed)[[1]]
+  )
+  
+  if(also_em == TRUE) {
+    # Add EM metrics
+    metrics <- c(metrics,
+                 list(ed_em_chisq = chisq.test(ed_em)[1][[1]][[1]],
+                      tv_em = l1d(td, ed_em, "")/2))
+  }
+  
+  # Write metrics to metrics.csv
+  # Report l1 distance / 2 to be consistent with histogram analysis
+  filename <- file.path(inp$outdir, 'metrics.csv')
+  write.csv(metrics, file = filename, row.names = FALSE)
+
 }
 
 # L1 distance = 1 - sum(min(df1|x, df2|x)) where

From e293e670acee1343c057189ee274a834f98ab714 Mon Sep 17 00:00:00 2001
From: Ananth Raghunathan <pseudorandom@google.com>
Date: Tue, 16 Jun 2015 15:53:57 -0700
Subject: [PATCH 21/67] Fixing some bugs.

---
 analysis/R/decode.R        |  6 +++--
 assoctest.sh               | 51 ++++++++++++++++++++++++++------------
 tests/analyze_assoc_expt.R | 11 +++++---
 tests/regtest_spec.py      |  4 +--
 4 files changed, 49 insertions(+), 23 deletions(-)

diff --git a/analysis/R/decode.R b/analysis/R/decode.R
index b68db0e4..6e755423 100644
--- a/analysis/R/decode.R
+++ b/analysis/R/decode.R
@@ -71,7 +71,8 @@ Estimate2WayBloomCounts <- function(params, obs_counts) {
   ests[abs(ests) == Inf] <- 0
     
   list(estimates = ests,
-       stds = matrix(rep(5, 2 * length(ests[1,])), 2))
+       stds = matrix(rep(5, length(ests[,1]) * length(ests[1,])),
+                     length(ests[,1])))
 }
 
 EstimateBloomCounts <- function(params, obs_counts) {
@@ -277,7 +278,8 @@ FitDistribution <- function(estimates_stds, map) {
 
   support_coefs <- 1:S
 
-  if (S > length(estimates_stds$estimates) * .8) {
+  if (TRUE) {
+  # if (S > length(estimates_stds$estimates) * .8) {
     # the system is close to being underdetermined
     lasso <- FitLasso(map, as.vector(t(estimates_stds$estimates)))
 
diff --git a/assoctest.sh b/assoctest.sh
index 947c33bd..01de8793 100755
--- a/assoctest.sh
+++ b/assoctest.sh
@@ -91,14 +91,24 @@ _run-one-instance() {
 
   banner "Running association input simulation"
 
-  tests/assoc_sim.R \
-    -p $case_dir/case_params.csv \
-    -r $instance_dir/reports.csv \
-    -t $instance_dir/truedist.csv \
-    -m $instance_dir/map \
-    -n $num_clients \
-    --var1_num $num_unique_values \
-    --var2_num $num_unique_values2
+  # Setting up JSON file containing assoc_sim inputs with python
+  python -c "import json; \
+    f = file('$instance_dir/assoc_inp.json', 'w'); \
+    inp = dict(); \
+    inp['params'] = '$case_dir/case_params.csv'; \
+    inp['reports'] = '$instance_dir/reports.csv'; \
+    inp['true'] = '$instance_dir/truedist.csv'; \
+    inp['map'] = '$instance_dir/map'; \
+    inp['num'] = $num_clients; \
+    inp['extras'] = 0; \
+    inp['distr'] = 'zipf2'; \
+    inp['prefix'] = './'; \
+    inp['vars'] = 2; \
+    inp['varcandidates'] = [$num_unique_values, $num_unique_values2]; \
+    json.dump(inp, f); \
+    f.close();"
+
+  tests/assoc_sim_expt.R --inp $instance_dir/assoc_inp.json
 
   local out_dir=${instance_dir}_report
   mkdir --verbose -p $out_dir
@@ -107,15 +117,24 @@ _run-one-instance() {
   # engine, which excludes R's loading time and reading of the (possibly
   # substantial) map file. Timing below is more inclusive.
   TIMEFORMAT='Running analyze.R took %R seconds'
+
+  # Setting up JSON file with python
+  python -c "import json; \
+    f = file('$instance_dir/analyze_inp.json', 'w'); \
+    inp = dict(); \
+    inp['maps'] = ['$instance_dir/map_1.csv',\
+                   '$instance_dir/map_2.csv']; \
+    inp['reports'] = '$instance_dir/reports.csv'; \
+    inp['truefile'] = '$instance_dir/truedist.csv'; \
+    inp['outdir'] = '.'; \
+    inp['params'] = '$case_dir/case_params.csv'; \
+    inp['newalg'] = 'false'; \
+    inp['numvars'] = 2; \
+    json.dump(inp, f); \
+    f.close();"
+
   time {
-    tests/analyze_assoc.R \
-      --map1 $instance_dir/map_1.csv \
-      --map2 $instance_dir/map_2.csv \
-      --map3 $instance_dir/map_3.csv \
-      --reports $instance_dir/reports.csv \
-      --truefile $instance_dir/truedist.csv \
-      --outdir $out_dir \
-      --params $case_dir/case_params.csv
+    tests/analyze_assoc_expt.R --inp $instance_dir/analyze_inp.json
   }
 }
 
diff --git a/tests/analyze_assoc_expt.R b/tests/analyze_assoc_expt.R
index 1aa01f03..936ca1da 100755
--- a/tests/analyze_assoc_expt.R
+++ b/tests/analyze_assoc_expt.R
@@ -189,10 +189,15 @@ main <- function(opts) {
   cmap <- mapply(CombineMaps, map[[1]]$map, map[[2]]$map)
   # Combine cohorts into one map. Needed for Decode2Way
   inds <- lapply(cmap, function(x) which(x, arr.ind = TRUE))
-  inds[[2]][, 1] <- inds[[2]][, 1] + dim(cmap[[1]])[1]
-  inds <- rbind(inds[[1]], inds[[2]])
+  for (i in seq(1, length(inds))) {
+    inds[[i]][, 1] <- inds[[i]][, 1] + (i-1) * dim(cmap[[1]])[1]
+  }
+  inds <- do.call("rbind", inds)
+  
+  # inds[[2]][, 1] <- inds[[2]][, 1] + dim(cmap[[1]])[1]
+  # inds <- rbind(inds[[1]], inds[[2]])
   crmap <- sparseMatrix(inds[, 1], inds[, 2], dims = c(
-                                                nrow(cmap[[1]]) + nrow(cmap[[2]]),
+                                                nrow(cmap[[1]]) * length(cmap),
                                                 ncol(cmap[[1]])))
   colnames(crmap) <- colnames(cmap[[1]])
   counts <- ComputeCounts(creports, cohorts[[1]], params2)
diff --git a/tests/regtest_spec.py b/tests/regtest_spec.py
index f21ba367..93101384 100755
--- a/tests/regtest_spec.py
+++ b/tests/regtest_spec.py
@@ -105,8 +105,8 @@
     'distr': (#'fizz-tiny',
               #'fizz-small',
               'fizz',),#'fizz-large','fizz-2large'),# 'medium'),
-    'blooms': ('8x32',), # '8x32', '16x32'),
-    'privacy': ('eps_small',),#'uma_rappor_type'), # 'eps_small'),
+    'blooms': ('8x16', '8x32'), # '8x32', '16x32'),
+    'privacy': ('eps_small','uma_rappor_type'),#'uma_rappor_type'), # 'eps_small'),
 }
 
 #

From 85495da94b7d035c45c865d72b19d5ead4890057 Mon Sep 17 00:00:00 2001
From: Ananth Raghunathan <pseudorandom@google.com>
Date: Tue, 16 Jun 2015 16:38:26 -0700
Subject: [PATCH 22/67] Fixing a bug in assoctest.sh

---
 assoctest.sh               | 2 +-
 tests/analyze_assoc_expt.R | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/assoctest.sh b/assoctest.sh
index 01de8793..74fd7149 100755
--- a/assoctest.sh
+++ b/assoctest.sh
@@ -126,7 +126,7 @@ _run-one-instance() {
                    '$instance_dir/map_2.csv']; \
     inp['reports'] = '$instance_dir/reports.csv'; \
     inp['truefile'] = '$instance_dir/truedist.csv'; \
-    inp['outdir'] = '.'; \
+    inp['outdir'] = '$out_dir'; \
     inp['params'] = '$case_dir/case_params.csv'; \
     inp['newalg'] = 'false'; \
     inp['numvars'] = 2; \
diff --git a/tests/analyze_assoc_expt.R b/tests/analyze_assoc_expt.R
index 936ca1da..7eede713 100755
--- a/tests/analyze_assoc_expt.R
+++ b/tests/analyze_assoc_expt.R
@@ -236,7 +236,7 @@ main <- function(opts) {
   metrics <- list(
     td_chisq = chisq.test(td)[1][[1]][[1]],
     ed_chisq = chisq.test(ed)[1][[1]][[1]],
-    tv = l1d(td, ed, "")/2,
+    tv = l1d(td, ed, ""),
     time = time_taken[1],
     dim1 = dim(ed)[[2]],
     dim2 = dim(ed)[[1]]

From f0e82721ef8b03b28a7a75c57ecdb72308a14927 Mon Sep 17 00:00:00 2001
From: Ananth Raghunathan <pseudorandom@google.com>
Date: Wed, 17 Jun 2015 20:54:38 -0700
Subject: [PATCH 23/67] More testing with 2-way marginals.

---
 analysis/R/decode.R        |  11 +-
 tests/analyze_assoc_expt.R | 252 ++++++++++++++++++++++---------------
 2 files changed, 159 insertions(+), 104 deletions(-)

diff --git a/analysis/R/decode.R b/analysis/R/decode.R
index 6e755423..fc3c29f5 100644
--- a/analysis/R/decode.R
+++ b/analysis/R/decode.R
@@ -333,12 +333,11 @@ Decode2Way <- function(counts, map, params) {
   e <- list(estimates = es$estimates[filter_cohorts, , drop = FALSE],
             stds = es$stds[filter_cohorts, , drop = FALSE])
   coefs <- FitDistribution(e, map[filter_bits, , drop = FALSE])
-  mod <- list(coefs = coefs, stds = coefs)
-  inf <- PerformInference(map[filter_bits, , drop = FALSE],
-                          as.vector(t(es$estimates)),
-                          N, mod, params, alpha = (0.05/S),
-                          correction = "Bonferroni")
-  fit <- inf$fit
+  fit <- data.frame(String = colnames(map[filter_bits, , drop = FALSE]),
+                    Estimate = matrix(coefs, ncol = 1),
+                    SD = matrix(coefs, ncol = 1),
+                    stringsAsFactors = FALSE)
+  rownames(fit) <- fit[,"String"]
   list(fit = fit)
 }
 
diff --git a/tests/analyze_assoc_expt.R b/tests/analyze_assoc_expt.R
index 7eede713..4036411c 100755
--- a/tests/analyze_assoc_expt.R
+++ b/tests/analyze_assoc_expt.R
@@ -76,11 +76,32 @@ CombineReports <- function(reports1, reports2) {
          function(x) as.vector(sapply(x, function(z) two_bits[[z+1]])))
 }
 
+
+# Given 2 lists of maps, maps1 and maps2, the function
+# combines the maps by cohort and outputs both
+# cohort-organized maps and flattened versions
+CombineMaps <- function(maps1, maps2) {
+  # Combine maps
+  cmap <- mapply(CombineMapsInternal, maps1, maps2)
+  
+  # Flatten map
+  inds <- lapply(cmap, function(x) which(x, arr.ind = TRUE))
+  for (i in seq(1, length(inds))) {
+    inds[[i]][, 1] <- inds[[i]][, 1] + (i-1) * dim(cmap[[1]])[1]
+  }
+  inds <- do.call("rbind", inds)
+  crmap <- sparseMatrix(inds[, 1], inds[, 2], dims = c(
+    nrow(cmap[[1]]) * length(cmap),
+    ncol(cmap[[1]])))
+  colnames(crmap) <- colnames(cmap[[1]])
+  list(cmap = cmap, crmap = crmap)
+}
+
 # Function to combine maps
 # Using map1-major order for both candidates and bits of the report
 # to be consistent with how CombineReports works
 # Currently assume 2-way marginals
-CombineMaps <- function(map1, map2) {
+CombineMapsInternal <- function(map1, map2) {
   # Retrieve set indices and dimensions
   rows1 <- which(map1, arr.ind = TRUE)[,1]
   cols1 <- which(map1, arr.ind = TRUE)[,2]
@@ -150,110 +171,145 @@ CombineMaps <- function(map1, map2) {
 
 main <- function(opts) {
   ptm <- proc.time()
+  direct_simulation = TRUE
   inp <- fromJSON(opts$inp)
   params <- ReadParameterFile(inp$params)
-  # ensure sufficient maps as required by number of vars
-  stopifnot(inp$numvars == length(inp$maps))
-  opts_map <- inp$maps
-  map <- lapply(opts_map, function(o)
-                  ProcessMap(ReadMapFile(o, params = params),
-                             params = params))
-  # Reports must be of the format
-  #     cohort no, rappor bitstring 1, rappor bitstring 2, ...
-  reportsObj <- read.csv(inp$reports,
-                         colClasses = c("integer",
-                                        rep("character", inp$numvars)),
-                         header = FALSE)
-
-  # Parsing reportsObj
-  # ComputeDistributionEM allows for different sets of cohorts
-  # for each variable. Here, both sets of cohorts are identical
-  co <- as.list(reportsObj[1])[[1]]
-  cohorts <- rep(list(co), inp$numvars)
-  # Parse reports from reportObj cols 2, 3, ...
-  reports <- lapply(1:inp$numvars, function(x) as.list(reportsObj[x + 1]))
-
-  # Split strings into bit arrays (as required by assoc analysis)
-  reports <- lapply(1:inp$numvars, function(i) {
-    # apply the following function to each of reports[[1]] and reports[[2]]
-    lapply(reports[[i]][[1]], function(x) {
-      # function splits strings and converts them to numeric values
-      as.numeric(strsplit(x, split = "")[[1]])
-    })
-  })
-
-  creports <- CombineReports(reports[[1]], reports[[2]])
-  params2 <- params
-  params2$k <- (params$k ** 2) * 4
-  # CombineMaps(map[[1]]$map[[1]], map[[2]]$map[[1]])
-  cmap <- mapply(CombineMaps, map[[1]]$map, map[[2]]$map)
-  # Combine cohorts into one map. Needed for Decode2Way
-  inds <- lapply(cmap, function(x) which(x, arr.ind = TRUE))
-  for (i in seq(1, length(inds))) {
-    inds[[i]][, 1] <- inds[[i]][, 1] + (i-1) * dim(cmap[[1]])[1]
-  }
-  inds <- do.call("rbind", inds)
-  
-  # inds[[2]][, 1] <- inds[[2]][, 1] + dim(cmap[[1]])[1]
-  # inds <- rbind(inds[[1]], inds[[2]])
-  crmap <- sparseMatrix(inds[, 1], inds[, 2], dims = c(
-                                                nrow(cmap[[1]]) * length(cmap),
-                                                ncol(cmap[[1]])))
-  colnames(crmap) <- colnames(cmap[[1]])
-  counts <- ComputeCounts(creports, cohorts[[1]], params2)
-  marginal <- Decode2Way(counts, crmap, params2)$fit
-  
-  also_em = FALSE
-  ed_em <- list()
-  if(also_em == TRUE) {
-    joint_dist <- ComputeDistributionEM(reports, cohorts, map,
-                                        ignore_other = TRUE,
-                                        quick = TRUE,
-                                        params, marginals = NULL,
-                                        estimate_var = FALSE,
-                                        new_alg = inp$newalg)
-    ed_em <- joint_dist$orig$fit
-    if(length(reports) == 3) {
-      ed_em <- as.data.frame(ed_em)
+  if(direct_simulation == TRUE) {
+    # TWO WAY ASSOCIATIONS; INPUTS SIMULATED DIRECTLY
+    strconstant <- c("string", "option")
+    
+    # Construct unique vals for each variable using strconstant
+    stopifnot(length(strconstant) == inp$numvars)
+    uvals <- lapply(1:inp$numvars,
+                    function(i) {
+                      apply(as.matrix(1:inp$varcandidates[[i]]),
+                            1,
+                            function(z) sprintf("%s%d", strconstant[[i]], z))
+                    })
+    
+    # Add extras if any
+    if(inp$extras > 0) {
+      uvals[[1]] <- c(uvals[[1]], apply(as.matrix(1:inp$extras), 1,
+                          function(z) sprintf("%s%d", strconstant[[1]], z + inp$varcandidates[[1]])))
     }
-  }
-  
-  td <- read.csv(file = inp$truefile)
-  ed <- td
-  for (cols in colnames(td)) {
-    for (rows in rownames(td)) {
-      ed[rows, cols] <- marginal[paste(rows, cols, sep = "x"), "Estimate"]
+    
+    map <- lapply(uvals, function(u) CreateMap(u, params))
+    trim <- function(map) {
+      lapply(map, function(z) z[,1:inp$varcandidates[[1]]])
     }
-  }
+    # Trim maps to real # of candidates
+    # Use extras only for decoding
+    tmap <- trim(map[[1]]$map)
+    crmap_trimmed <- CombineMaps(tmap, map[[2]]$map)$crmap
+    
+    cohorts <- as.matrix(
+      apply(as.data.frame(partition), 1,
+            function(count) RandomPartition(count, rep(1, params$m))))
+    
+  } else {
+    # ensure sufficient maps as required by number of vars
+    stopifnot(inp$numvars == length(inp$maps))
+    opts_map <- inp$maps
+    map <- lapply(opts_map, function(o)
+                    ProcessMap(ReadMapFile(o, params = params),
+                               params = params))
+    # Reports must be of the format
+    #     cohort no, rappor bitstring 1, rappor bitstring 2, ...
+    reportsObj <- read.csv(inp$reports,
+                           colClasses = c("integer",
+                                          rep("character", inp$numvars)),
+                           header = FALSE)
   
-  print("PROC.TIME")
-  time_taken <- proc.time() - ptm
-  print(time_taken)
+    # Parsing reportsObj
+    # ComputeDistributionEM allows for different sets of cohorts
+    # for each variable. Here, both sets of cohorts are identical
+    co <- as.list(reportsObj[1])[[1]]
+    cohorts <- rep(list(co), inp$numvars)
+    # Parse reports from reportObj cols 2, 3, ...
+    reports <- lapply(1:inp$numvars, function(x) as.list(reportsObj[x + 1]))
   
-  print("2 WAY RESULTS")
-  print(signif(ed[order(rowSums(ed)), ], 4))
-  print(l1d(td, ed, "L1 DISTANCE 2 WAY"))
-  metrics <- list(
-    td_chisq = chisq.test(td)[1][[1]][[1]],
-    ed_chisq = chisq.test(ed)[1][[1]][[1]],
-    tv = l1d(td, ed, ""),
-    time = time_taken[1],
-    dim1 = dim(ed)[[2]],
-    dim2 = dim(ed)[[1]]
-  )
-  
-  if(also_em == TRUE) {
-    # Add EM metrics
-    metrics <- c(metrics,
-                 list(ed_em_chisq = chisq.test(ed_em)[1][[1]][[1]],
-                      tv_em = l1d(td, ed_em, "")/2))
-  }
+    # Split strings into bit arrays (as required by assoc analysis)
+    reports <- lapply(1:inp$numvars, function(i) {
+      # apply the following function to each of reports[[1]] and reports[[2]]
+      lapply(reports[[i]][[1]], function(x) {
+        # function splits strings and converts them to numeric values
+        as.numeric(strsplit(x, split = "")[[1]])
+      })
+    })
   
-  # Write metrics to metrics.csv
-  # Report l1 distance / 2 to be consistent with histogram analysis
-  filename <- file.path(inp$outdir, 'metrics.csv')
-  write.csv(metrics, file = filename, row.names = FALSE)
-
+    creports <- CombineReports(reports[[1]], reports[[2]])
+    params2 <- params
+    params2$k <- (params$k ** 2) * 4
+    # CombineMaps(map[[1]]$map[[1]], map[[2]]$map[[1]])
+    cmap <- mapply(CombineMaps, map[[1]]$map, map[[2]]$map)
+    # Combine cohorts into one map. Needed for Decode2Way
+    inds <- lapply(cmap, function(x) which(x, arr.ind = TRUE))
+    for (i in seq(1, length(inds))) {
+      inds[[i]][, 1] <- inds[[i]][, 1] + (i-1) * dim(cmap[[1]])[1]
+    }
+    inds <- do.call("rbind", inds)
+    
+    # inds[[2]][, 1] <- inds[[2]][, 1] + dim(cmap[[1]])[1]
+    # inds <- rbind(inds[[1]], inds[[2]])
+    crmap <- sparseMatrix(inds[, 1], inds[, 2], dims = c(
+                                                  nrow(cmap[[1]]) * length(cmap),
+                                                  ncol(cmap[[1]])))
+    td <- read.csv(file = inp$truefile)
+    colnames(crmap) <- colnames(cmap[[1]])
+    counts <- ComputeCounts(creports, cohorts[[1]], params2)
+    marginal <- Decode2Way(counts, crmap, params2)$fit
+    
+    also_em = FALSE
+    ed_em <- list()
+    if(also_em == TRUE) {
+      joint_dist <- ComputeDistributionEM(reports, cohorts, map,
+                                          ignore_other = TRUE,
+                                          quick = TRUE,
+                                          params, marginals = NULL,
+                                          estimate_var = FALSE,
+                                          new_alg = inp$newalg)
+      ed_em <- joint_dist$orig$fit
+      if(length(reports) == 3) {
+        ed_em <- as.data.frame(ed_em)
+      }
+    }
+    
+    ed <- td
+    for (cols in colnames(td)) {
+      for (rows in rownames(td)) {
+        ed[rows, cols] <- marginal[paste(rows, cols, sep = "x"), "Estimate"]
+      }
+    }
+    
+    time_taken <- proc.time() - ptm
+    
+    print("2 WAY RESULTS")
+    print(signif(ed[order(rowSums(ed)), ], 4))
+    print(l1d(td, ed, "L1 DISTANCE 2 WAY"))
+    print("PROC.TIME")
+    print(time_taken)
+    
+    metrics <- list(
+      td_chisq = chisq.test(td)[1][[1]][[1]],
+      ed_chisq = chisq.test(ed)[1][[1]][[1]],
+      tv = l1d(td, ed, ""),
+      time = time_taken[1],
+      dim1 = dim(ed)[[2]],
+      dim2 = dim(ed)[[1]]
+    )
+    
+    if(also_em == TRUE) {
+      # Add EM metrics
+      metrics <- c(metrics,
+                   list(ed_em_chisq = chisq.test(ed_em)[1][[1]][[1]],
+                        tv_em = l1d(td, ed_em, "")/2))
+    }
+    
+    # Write metrics to metrics.csv
+    # Report l1 distance / 2 to be consistent with histogram analysis
+    filename <- file.path(inp$outdir, 'metrics.csv')
+    write.csv(metrics, file = filename, row.names = FALSE)
+  }  
 }
 
 # L1 distance = 1 - sum(min(df1|x, df2|x)) where

From 26722821e1bee660abd8ed6a85e4237f1cc4ea17 Mon Sep 17 00:00:00 2001
From: Ananth Raghunathan <pseudorandom@google.com>
Date: Thu, 18 Jun 2015 12:59:21 -0700
Subject: [PATCH 24/67] Edits.

---
 analysis/R/decode.R        |  2 +-
 tests/analyze_assoc_expt.R | 71 ++++++++++++++++++++++++++++++++++++--
 2 files changed, 69 insertions(+), 4 deletions(-)

diff --git a/analysis/R/decode.R b/analysis/R/decode.R
index fc3c29f5..c84a23dd 100644
--- a/analysis/R/decode.R
+++ b/analysis/R/decode.R
@@ -71,7 +71,7 @@ Estimate2WayBloomCounts <- function(params, obs_counts) {
   ests[abs(ests) == Inf] <- 0
     
   list(estimates = ests,
-       stds = matrix(rep(5, length(ests[,1]) * length(ests[1,])),
+       stds = matrix(rep(100, length(ests[,1]) * length(ests[1,])),
                      length(ests[,1])))
 }
 
diff --git a/tests/analyze_assoc_expt.R b/tests/analyze_assoc_expt.R
index 4036411c..c82257f7 100755
--- a/tests/analyze_assoc_expt.R
+++ b/tests/analyze_assoc_expt.R
@@ -40,6 +40,7 @@ source("analysis/R/decode.R")
 source("analysis/R/simulation.R")
 source("analysis/R/read_input.R")
 source("analysis/R/association.R")
+source("tests/gen_counts.R")
 
 # This function processes the maps loaded using ReadMapFile
 # Association analysis requires a map object with a map
@@ -177,6 +178,9 @@ main <- function(opts) {
   if(direct_simulation == TRUE) {
     # TWO WAY ASSOCIATIONS; INPUTS SIMULATED DIRECTLY
     strconstant <- c("string", "option")
+    N <- inp$num
+    n1 <- inp$varcandidates[[1]]
+    n2 <- inp$varcandidates[[2]]
     
     # Construct unique vals for each variable using strconstant
     stopifnot(length(strconstant) == inp$numvars)
@@ -190,22 +194,83 @@ main <- function(opts) {
     # Add extras if any
     if(inp$extras > 0) {
       uvals[[1]] <- c(uvals[[1]], apply(as.matrix(1:inp$extras), 1,
-                          function(z) sprintf("%s%d", strconstant[[1]], z + inp$varcandidates[[1]])))
+                          function(z) sprintf("%s%d", strconstant[[1]], z + n1)))
     }
     
     map <- lapply(uvals, function(u) CreateMap(u, params))
     trim <- function(map) {
-      lapply(map, function(z) z[,1:inp$varcandidates[[1]]])
+      lapply(map, function(z) z[,1:n1])
     }
     # Trim maps to real # of candidates
     # Use extras only for decoding
     tmap <- trim(map[[1]]$map)
     crmap_trimmed <- CombineMaps(tmap, map[[2]]$map)$crmap
     
+    # Sample values to compute partition
+    # Zipfian over n1 strings
+    v1_part <- RandomPartition(N, ComputePdf("zipf1.5", n1))
+    # Zipfian over n2 strings for each of variable 1
+    # Distr. are correlated as in assoc_sim.R
+    final_part <- as.vector(sapply(1:n1,
+                    function(i) {
+                      v2_part <- RandomPartition(v1_part[[i]],
+                                                 ComputePdf("zipf1.5", n2))
+                      if (i %% 2 == 0) {v2_part} else {rev(v2_part)}
+                    }))
+    
+    td <- matrix(final_part/sum(final_part), nrow = n1, ncol = n2, byrow = TRUE)
+    rownames(td) <- uvals[[1]][1:n1]  # Don't take into account extras
+    colnames(td) <- uvals[[2]]
+    print(signif(td, 4))
     cohorts <- as.matrix(
-      apply(as.data.frame(partition), 1,
+      apply(as.data.frame(final_part), 1,
             function(count) RandomPartition(count, rep(1, params$m))))
+    expanded <- apply(cohorts, 2, function(vec) rep(vec, each = ((params$k)**2)*4))
+    true_ones <- apply(expanded * crmap_trimmed, 1, sum)
+    
+    p <- params$p
+    q <- params$q
+    f <- params$f
+    m <- params$m
+    k <- params$k
     
+    p11 <- q * (1 - f/2) + p * f / 2  # probability of a true 1 reported as 1
+    p01 <- p * (1 - f/2) + q * f / 2  # probability of a true 0 reported as 1
+    p10 <- 1 - p11  # probability of a true 1 reported as 0
+    p00 <- 1 - p01  # probability of a true 0 reported as 0
+    
+    NoiseMatrix <- matrix(rep(0, 16), 4)
+    NoiseMatrix[1,] <- c(p11**2, p11*p10, p10*p11, p10**2)
+    NoiseMatrix[2,] <- c(p11*p01, p11*p00, p10*p01, p10*p00)
+    NoiseMatrix[3,] <- c(p01*p11, p01*p10, p00*p11, p00*p01)
+    NoiseMatrix[4,] <- c(p01**2, p00*p01, p01*p00, p00**2)
+
+    after_noise <- as.vector(sapply(1:(length(true_ones)/4), 
+                                    function(x) 
+                                      t(NoiseMatrix) %*% true_ones[((x-1)*4+1):(x*4)]))
+    counts <- cbind(apply(cohorts, 1, sum),
+                    matrix(after_noise,
+                           nrow = m,
+                           ncol = 4 * (k**2),
+                           byrow = TRUE))
+    params2 <- params
+    params2$k <- (params$k ** 2) * 4
+    crmap <- CombineMaps(map[[1]]$map, map[[2]]$map)$crmap
+    marginal <- Decode2Way(counts, crmap, params2)$fit
+    ed <- td
+    for (cols in colnames(td)) {
+      for (rows in rownames(td)) {
+        ed[rows, cols] <- marginal[paste(rows, cols, sep = "x"), "Estimate"]
+      }
+    }
+    
+    time_taken <- proc.time() - ptm
+    
+    print("2 WAY RESULTS")
+    print(signif(ed, 4))
+    print(l1d(td, ed, "L1 DISTANCE 2 WAY"))
+    print("PROC.TIME")
+    print(time_taken)
   } else {
     # ensure sufficient maps as required by number of vars
     stopifnot(inp$numvars == length(inp$maps))

From b43aa8724026a5e493a817ae3c6e7729f035ca10 Mon Sep 17 00:00:00 2001
From: Ananth Raghunathan <pseudorandom@google.com>
Date: Fri, 19 Jun 2015 13:54:19 -0700
Subject: [PATCH 25/67] Simulating noise directly.

---
 tests/analyze_assoc_expt.R | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/tests/analyze_assoc_expt.R b/tests/analyze_assoc_expt.R
index c82257f7..b0bfc82f 100755
--- a/tests/analyze_assoc_expt.R
+++ b/tests/analyze_assoc_expt.R
@@ -244,7 +244,13 @@ main <- function(opts) {
     NoiseMatrix[2,] <- c(p11*p01, p11*p00, p10*p01, p10*p00)
     NoiseMatrix[3,] <- c(p01*p11, p01*p10, p00*p11, p00*p01)
     NoiseMatrix[4,] <- c(p01**2, p00*p01, p01*p00, p00**2)
-
+    
+    NoiseMatrix2 <- matrix(rep(0, 16), 4)
+    NoiseMatrix2[1,] <- c(1, 0, 0, 0)
+    NoiseMatrix2[2,] <- c(0, 1, 0, 0)
+    NoiseMatrix2[3,] <- c(0, 0, 1, 0)
+    NoiseMatrix2[4,] <- c(0, 0, 0, 1)
+    
     after_noise <- as.vector(sapply(1:(length(true_ones)/4), 
                                     function(x) 
                                       t(NoiseMatrix) %*% true_ones[((x-1)*4+1):(x*4)]))
@@ -253,6 +259,7 @@ main <- function(opts) {
                            nrow = m,
                            ncol = 4 * (k**2),
                            byrow = TRUE))
+    
     params2 <- params
     params2$k <- (params$k ** 2) * 4
     crmap <- CombineMaps(map[[1]]$map, map[[2]]$map)$crmap
@@ -271,6 +278,17 @@ main <- function(opts) {
     print(l1d(td, ed, "L1 DISTANCE 2 WAY"))
     print("PROC.TIME")
     print(time_taken)
+    
+    metrics <- list(
+      td_chisq = chisq.test(td)[1][[1]][[1]],
+      ed_chisq = chisq.test(ed)[1][[1]][[1]],
+      tv = l1d(td, ed, ""),
+      time = time_taken[1],
+      dim1 = dim(ed)[[2]],
+      dim2 = dim(ed)[[1]]
+    )
+    filename <- file.path(inp$outdir, 'metrics.csv')
+    write.csv(metrics, file = filename, row.names = FALSE)
   } else {
     # ensure sufficient maps as required by number of vars
     stopifnot(inp$numvars == length(inp$maps))

From ca9953eaf164b6928f4ec222ca967cc43bbfa6f7 Mon Sep 17 00:00:00 2001
From: Ananth Raghunathan <pseudorandom@google.com>
Date: Fri, 19 Jun 2015 16:59:59 -0700
Subject: [PATCH 26/67] Test suite updated to consider 2-way marginals.

---
 assoctest.sh                |  21 ++++---
 tests/analyze_assoc_expt.R  | 111 ++++++++++++++++++++++++------------
 tests/assoctest.html        |   8 ++-
 tests/make_summary_assoc.py |   1 +
 tests/regtest_spec.py       |  50 +++++++++++-----
 5 files changed, 131 insertions(+), 60 deletions(-)

diff --git a/assoctest.sh b/assoctest.sh
index 74fd7149..6516653e 100755
--- a/assoctest.sh
+++ b/assoctest.sh
@@ -52,14 +52,15 @@ _setup-one-case() {
   local num_unique_values=$2
   local num_unique_values2=$3
   local num_clients=$4
+  local num_extras=$5
 
   # RAPPOR params
-  local num_bits=$5
-  local num_hashes=$6
-  local num_cohorts=$7
-  local p=$8
-  local q=$9  # need curly braces to get the 10th arg
-  local f=${10}
+  local num_bits=$6
+  local num_hashes=$7
+  local num_cohorts=$8
+  local p=$9
+  local q=${10}  # need curly braces to get the 10th arg
+  local f=${11}
 
   banner 'Setting up parameters and candidate files for '$test_case
 
@@ -84,7 +85,8 @@ _run-one-instance() {
   local case_dir=$ASSOCTEST_DIR/$test_case
 
   read -r case_name num_unique_values num_unique_values2 \
-    num_clients num_bits num_hashes num_cohorts p q f < $case_dir/spec.txt
+    num_clients num_extras \
+    num_bits num_hashes num_cohorts p q f < $case_dir/spec.txt
 
   local instance_dir=$ASSOCTEST_DIR/$test_case/$test_instance
   mkdir --verbose -p $instance_dir
@@ -108,7 +110,7 @@ _run-one-instance() {
     json.dump(inp, f); \
     f.close();"
 
-  tests/assoc_sim_expt.R --inp $instance_dir/assoc_inp.json
+  # tests/assoc_sim_expt.R --inp $instance_dir/assoc_inp.json
 
   local out_dir=${instance_dir}_report
   mkdir --verbose -p $out_dir
@@ -130,6 +132,9 @@ _run-one-instance() {
     inp['params'] = '$case_dir/case_params.csv'; \
     inp['newalg'] = 'false'; \
     inp['numvars'] = 2; \
+    inp['num'] = $num_clients; \
+    inp['extras'] = $num_extras; \
+    inp['varcandidates'] = [$num_unique_values, $num_unique_values2]; \
     json.dump(inp, f); \
     f.close();"
 
diff --git a/tests/analyze_assoc_expt.R b/tests/analyze_assoc_expt.R
index b0bfc82f..cbd2e7ff 100755
--- a/tests/analyze_assoc_expt.R
+++ b/tests/analyze_assoc_expt.R
@@ -169,14 +169,37 @@ CombineMapsInternal <- function(map1, map2) {
   sm
 }
 
+# Builds the 4x4 noise matrix for a pair of reported bits. Rows are the true
+# bit pair and columns the reported pair, both ordered 11, 10, 01, 00, so
+# each row is a probability distribution and must sum to 1.
+GenerateNoiseMatrix <- function(params) {
+  p <- params$p
+  q <- params$q
+  f <- params$f
+
+  p11 <- q * (1 - f/2) + p * f / 2  # probability of a true 1 reported as 1
+  p01 <- p * (1 - f/2) + q * f / 2  # probability of a true 0 reported as 1
+  p10 <- 1 - p11  # probability of a true 1 reported as 0
+  p00 <- 1 - p01  # probability of a true 0 reported as 0
+
+  NoiseMatrix <- matrix(rep(0, 16), 4)
+  NoiseMatrix[1,] <- c(p11**2, p11*p10, p10*p11, p10**2)
+  NoiseMatrix[2,] <- c(p11*p01, p11*p00, p10*p01, p10*p00)
+  NoiseMatrix[3,] <- c(p01*p11, p01*p10, p00*p11, p00*p10)  # 01 -> 00: p00*p10
+  NoiseMatrix[4,] <- c(p01**2, p00*p01, p01*p00, p00**2)
+  NoiseMatrix
+}
+
 
 main <- function(opts) {
   ptm <- proc.time()
   direct_simulation = TRUE
   inp <- fromJSON(opts$inp)
   params <- ReadParameterFile(inp$params)
+  
   if(direct_simulation == TRUE) {
     # TWO WAY ASSOCIATIONS; INPUTS SIMULATED DIRECTLY
+    
     strconstant <- c("string", "option")
     N <- inp$num
     n1 <- inp$varcandidates[[1]]
@@ -197,13 +220,12 @@ main <- function(opts) {
                           function(z) sprintf("%s%d", strconstant[[1]], z + n1)))
     }
     
+    # Compute map
     map <- lapply(uvals, function(u) CreateMap(u, params))
-    trim <- function(map) {
-      lapply(map, function(z) z[,1:n1])
-    }
+    
     # Trim maps to real # of candidates
     # Use extras only for decoding
-    tmap <- trim(map[[1]]$map)
+    tmap <- lapply(map[[1]]$map, function(i) i[, 1:n1])
     crmap_trimmed <- CombineMaps(tmap, map[[2]]$map)$crmap
     
     # Sample values to compute partition
@@ -219,8 +241,23 @@ main <- function(opts) {
                     }))
     
     td <- matrix(final_part/sum(final_part), nrow = n1, ncol = n2, byrow = TRUE)
+    v2_part <- RandomPartition(N, apply(td, 2, sum))
+    ow_parts <- list(v1_part, v2_part)
+    ow_parts[[1]] <- c(ow_parts[[1]], rep(0, inp$extra))
+    
+    # --------------
+    # Generate 1-way counts
+    ow_counts <- lapply(1:2, function(i)
+                        GenerateCounts(params, map[[i]]$rmap, ow_parts[[i]], 1))
+    found_strings <- lapply(1:2, function(i)
+                            Decode(ow_counts[[i]],
+                                   map[[i]]$rmap,
+                                   params, quick = TRUE)$fit$strings)
+    # --------------
+    
     rownames(td) <- uvals[[1]][1:n1]  # Don't take into account extras
     colnames(td) <- uvals[[2]]
+    print("TRUE DISTRIBUTION")
     print(signif(td, 4))
     cohorts <- as.matrix(
       apply(as.data.frame(final_part), 1,
@@ -228,49 +265,39 @@ main <- function(opts) {
     expanded <- apply(cohorts, 2, function(vec) rep(vec, each = ((params$k)**2)*4))
     true_ones <- apply(expanded * crmap_trimmed, 1, sum)
     
-    p <- params$p
-    q <- params$q
-    f <- params$f
-    m <- params$m
-    k <- params$k
-    
-    p11 <- q * (1 - f/2) + p * f / 2  # probability of a true 1 reported as 1
-    p01 <- p * (1 - f/2) + q * f / 2  # probability of a true 0 reported as 1
-    p10 <- 1 - p11  # probability of a true 1 reported as 0
-    p00 <- 1 - p01  # probability of a true 0 reported as 0
     
-    NoiseMatrix <- matrix(rep(0, 16), 4)
-    NoiseMatrix[1,] <- c(p11**2, p11*p10, p10*p11, p10**2)
-    NoiseMatrix[2,] <- c(p11*p01, p11*p00, p10*p01, p10*p00)
-    NoiseMatrix[3,] <- c(p01*p11, p01*p10, p00*p11, p00*p01)
-    NoiseMatrix[4,] <- c(p01**2, p00*p01, p01*p00, p00**2)
     
-    NoiseMatrix2 <- matrix(rep(0, 16), 4)
-    NoiseMatrix2[1,] <- c(1, 0, 0, 0)
-    NoiseMatrix2[2,] <- c(0, 1, 0, 0)
-    NoiseMatrix2[3,] <- c(0, 0, 1, 0)
-    NoiseMatrix2[4,] <- c(0, 0, 0, 1)
     
+    NoiseMatrix <- GenerateNoiseMatrix(params)
     after_noise <- as.vector(sapply(1:(length(true_ones)/4), 
                                     function(x) 
                                       t(NoiseMatrix) %*% true_ones[((x-1)*4+1):(x*4)]))
     counts <- cbind(apply(cohorts, 1, sum),
                     matrix(after_noise,
-                           nrow = m,
-                           ncol = 4 * (k**2),
+                           nrow = params$m,
+                           ncol = 4 * (params$k**2),
                            byrow = TRUE))
     
     params2 <- params
     params2$k <- (params$k ** 2) * 4
-    crmap <- CombineMaps(map[[1]]$map, map[[2]]$map)$crmap
+    
+    # Combine maps to feed into Decode2Way
+    # Prune first to found_strings
+    pruned <- lapply(1:2, function(i)
+                     lapply(map[[i]]$map, function(z) z[,found_strings[[i]]]))
+    crmap <- CombineMaps(pruned[[1]], pruned[[2]])$crmap
     marginal <- Decode2Way(counts, crmap, params2)$fit
-    ed <- td
+    
+    # Fill in estimated results with rows and cols from td
+    ed <- matrix(0, nrow = (n1+inp$extra), ncol = n2)
+    rownames(ed) <- uvals[[1]]
+    colnames(ed) <- uvals[[2]]
     for (cols in colnames(td)) {
       for (rows in rownames(td)) {
         ed[rows, cols] <- marginal[paste(rows, cols, sep = "x"), "Estimate"]
       }
     }
-    
+    ed[is.na(ed)] <- 0
     time_taken <- proc.time() - ptm
     
     print("2 WAY RESULTS")
@@ -278,14 +305,22 @@ main <- function(opts) {
     print(l1d(td, ed, "L1 DISTANCE 2 WAY"))
     print("PROC.TIME")
     print(time_taken)
+    chisq_td <- chisq.test(td)[1][[1]][[1]]
+    chisq_ed <- chisq.test(ed)[1][[1]][[1]]
+    if(is.nan(chisq_ed)) {
+      chisq_ed <- 0
+    }
+    if(is.nan(chisq_td)) {
+      chisq_td <- 0
+    }
     
     metrics <- list(
-      td_chisq = chisq.test(td)[1][[1]][[1]],
-      ed_chisq = chisq.test(ed)[1][[1]][[1]],
+      td_chisq = chisq_td,
+      ed_chisq = chisq_ed,
       tv = l1d(td, ed, ""),
       time = time_taken[1],
-      dim1 = dim(ed)[[2]],
-      dim2 = dim(ed)[[1]]
+      dim1 = length(found_strings[[1]]),
+      dim2 = length(found_strings[[2]])
     )
     filename <- file.path(inp$outdir, 'metrics.csv')
     write.csv(metrics, file = filename, row.names = FALSE)
@@ -371,14 +406,18 @@ main <- function(opts) {
     print(l1d(td, ed, "L1 DISTANCE 2 WAY"))
     print("PROC.TIME")
     print(time_taken)
+    chisq_ed <- chisq.test(ed)[1][[1]][[1]]
+    if(is.nan(chisq_ed)) {
+      chisq_ed <- 0
+    }
     
     metrics <- list(
       td_chisq = chisq.test(td)[1][[1]][[1]],
-      ed_chisq = chisq.test(ed)[1][[1]][[1]],
+      ed_chisq = chisq_ed,
       tv = l1d(td, ed, ""),
       time = time_taken[1],
-      dim1 = dim(ed)[[2]],
-      dim2 = dim(ed)[[1]]
+      dim1 = length(found_strings[[1]]),
+      dim2 = length(found_strings[[2]])
     )
     
     if(also_em == TRUE) {
diff --git a/tests/assoctest.html b/tests/assoctest.html
index 38e5abac..0c839c86 100644
--- a/tests/assoctest.html
+++ b/tests/assoctest.html
@@ -25,7 +25,7 @@ <h2>RAPPOR assoctest.sh</h2>
   <table cellspacing="0" cellpadding="5">
     <colgroup>
       <col span="1" class="highlight" />
-      <col span="1" />
+      <col span="2" />
       <col span="6" class="highlight" />
       <col span="6" />
     </colgroup>
@@ -35,7 +35,7 @@ <h2>RAPPOR assoctest.sh</h2>
         <td>
           Test Case
         </td>
-        <td colspan=1>
+        <td colspan=2>
           Input Params
         </td>
         <td colspan=6>
@@ -48,8 +48,9 @@ <h2>RAPPOR assoctest.sh</h2>
 
       <tr class="explain">
         <td></td>
-        <td colspan=1>
+        <td colspan=2>
           n: num reports<br/>
+          e: num extras<br/>
         </td>
         <td colspan=6>
           k: report bits<br/>
@@ -71,6 +72,7 @@ <h2>RAPPOR assoctest.sh</h2>
         <td></td>
 
         <td>n</td>
+        <td>e</td>
 
         <td>k</td>
         <td>h</td>
diff --git a/tests/make_summary_assoc.py b/tests/make_summary_assoc.py
index 59a4f247..67843b4d 100755
--- a/tests/make_summary_assoc.py
+++ b/tests/make_summary_assoc.py
@@ -18,6 +18,7 @@
 
   <!-- input params -->
   <td></td>
+  <td></td>
 
   <!-- RAPPOR params -->
   <td></td>
diff --git a/tests/regtest_spec.py b/tests/regtest_spec.py
index 93101384..cf8c6059 100755
--- a/tests/regtest_spec.py
+++ b/tests/regtest_spec.py
@@ -46,19 +46,41 @@
     # num unique values 2, num clients
     'tiny': (100, 2, int(1e03)),   # test for insufficient data
     'small': (100, 10, int(1e04)),
-    'fizz-tiny': (100, 20, int(1e03)),
-    'fizz-tiny-bool': (100, 2, int(1e03)),
-    'fizz-small': (100, 20, int(1e04)),
-    'fizz-small-bool': (100, 2, int(1e04)),
-    'fizz': (100, 20, int(1e05)),
-    'fizz-large': (100, 50, int(1e05)),
-    'fizz-2large': (100, 50, int(5e05)),
-    'fizz-bool': (100, 2, int(1e05)),
+#    'fizz-tiny': (100, 20, int(1e03)),
+#    'fizz-tiny-bool': (100, 2, int(1e03)),
+#    'fizz-small': (100, 20, int(1e04)),
+#    'fizz-small-bool': (100, 2, int(1e04)),
+#    'fizz': (100, 20, int(1e05)),
+#    'fizz-large': (100, 50, int(1e05)),
+#    'fizz-2large': (100, 50, int(5e05)),
+#    'fizz-bool': (100, 2, int(1e05)),
     'medium': (1000, 10, int(1e05)),
     'medium2': (1000, 2, int(1e05)),
     'large': (10000, 10, int(1e06)),
     'large2': (10000, 2, int(1e06)),
     'largesquared': (int(1e04), 100, int(1e06)),
+
+    # new test names for 2-way marginals
+    # includes testing for extras
+    'fizz-tiny': (100, 20, int(1e03), int(1e04)),
+    'fizz-tiny-bool': (100, 2, int(1e03), int(1e04)),
+    'fizz-small': (100, 20, int(1e04), int(1e04)),
+    'fizz-small-bool': (100, 2, int(1e04), int(1e04)),
+    'fizz': (100, 20, int(1e05), int(1e04)),
+    'fizz-bool': (100, 2, int(1e05), int(1e04)),
+
+    'compact-noextra-small': (40, 5, 1e04, 0),
+    'loose-noextra-small': (100, 20, 1e04, 0),
+    'compact-noextra-large': (40, 5, 1e06, 0),
+    'loose-noextra-large': (100, 20, 1e06, 0),
+    'compact-extra-small': (40, 5, int(1e04), int(1e04)),
+    'loose-extra-small': (100, 20, int(1e04), int(1e04)),
+    'compact-extra-large': (40, 5, int(1e06), int(1e04)),
+    'loose-extra-large': (100, 20, int(1e06), int(1e04)),
+    'compact-excess-small': (40, 5, int(1e04), int(1e05)),
+    'loose-excess-small': (100, 20, int(1e04), int(1e05)),
+    'compact-excess-large': (40, 5, int(1e06), int(1e05)),
+    'loose-excess-large': (100, 20, int(1e06), int(1e05)),
 }
 
 # 'k, h, m' as in params file.
@@ -76,6 +98,7 @@
     'eps_1_5': (0.225, 0.775, 0.0),  # eps_1 = 5, no eps_inf
     'eps_verysmall': (0.125, 0.875, 0.125),
     'eps_small': (0.125, 0.875, 0.5),
+    'eps_chrome': (0.25, 0.75, 0.5),
     'uma_rappor_type': (0.50, 0.75, 0.5),
 }
 
@@ -102,11 +125,12 @@
 # The test config runs a test suite that is the cross product of all the above
 # sets
 ASSOC_TEST_CONFIG = {
-    'distr': (#'fizz-tiny',
-              #'fizz-small',
-              'fizz',),#'fizz-large','fizz-2large'),# 'medium'),
-    'blooms': ('8x16', '8x32'), # '8x32', '16x32'),
-    'privacy': ('eps_small','uma_rappor_type'),#'uma_rappor_type'), # 'eps_small'),
+#    'distr': ('fizz-tiny', 'fizz-tiny-bool',
+#              'fizz-small', 'fizz-small-bool',
+#              'fizz', 'fizz-bool'),
+    'distr': ('fizz-small',),
+    'blooms': ('8x16',), # '8x32', '16x32'),
+    'privacy': ('eps_small',)
 }
 
 #

From f33b285de1671295fb8016acbd0370179705b544 Mon Sep 17 00:00:00 2001
From: Ananth Raghunathan <pseudorandom@google.com>
Date: Fri, 19 Jun 2015 17:01:28 -0700
Subject: [PATCH 27/67] Small updates to test cases.

---
 tests/regtest_spec.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/tests/regtest_spec.py b/tests/regtest_spec.py
index cf8c6059..ef3c3ea5 100755
--- a/tests/regtest_spec.py
+++ b/tests/regtest_spec.py
@@ -125,12 +125,11 @@
 # The test config runs a test suite that is the cross product of all the above
 # sets
 ASSOC_TEST_CONFIG = {
-#    'distr': ('fizz-tiny', 'fizz-tiny-bool',
-#              'fizz-small', 'fizz-small-bool',
-#              'fizz', 'fizz-bool'),
-    'distr': ('fizz-small',),
-    'blooms': ('8x16',), # '8x32', '16x32'),
-    'privacy': ('eps_small',)
+    'distr': ('fizz-tiny', 'fizz-tiny-bool',
+              'fizz-small', 'fizz-small-bool',
+              'fizz', 'fizz-bool'),
+    'blooms': ('8x16','8x32',)# '16x32'),
+    'privacy': ('eps_small','eps_chrome')
 }
 
 #

From b3cd75983b78e6414270388714c0fb874f0fbc71 Mon Sep 17 00:00:00 2001
From: Ananth Raghunathan <pseudorandom@google.com>
Date: Fri, 19 Jun 2015 17:02:13 -0700
Subject: [PATCH 28/67] Making tests run sequentially.

---
 assoctest.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/assoctest.sh b/assoctest.sh
index 6516653e..1221a59f 100755
--- a/assoctest.sh
+++ b/assoctest.sh
@@ -279,7 +279,8 @@ run-all() {
 
   log "Running all tests. Can take a while."
   # a- for assoc tests
-  _run-tests '^a-' $instances T T
+  # F for sequential
+  _run-tests '^a-' $instances F T
 }
 
 "$@"

From c1c48ccc0bd9294a37a406b26438ce058c2fff85 Mon Sep 17 00:00:00 2001
From: Ananth Raghunathan <pseudorandom@google.com>
Date: Wed, 24 Jun 2015 14:40:32 -0700
Subject: [PATCH 29/67] gen_assoc_reports.R to produce assoc. reports.

---
 assoctest.sh              | 22 ++++++++++
 tests/gen_assoc_reports.R | 84 +++++++++++++++++++++++++++++++++++++++
 tests/regtest_spec.py     | 16 +++++---
 3 files changed, 117 insertions(+), 5 deletions(-)
 create mode 100755 tests/gen_assoc_reports.R

diff --git a/assoctest.sh b/assoctest.sh
index 1221a59f..1144da1b 100755
--- a/assoctest.sh
+++ b/assoctest.sh
@@ -43,6 +43,13 @@ readonly ASSOCTEST_DIR=_tmp/assoctest
 # All the Python tools need this
 export PYTHONPATH=$CLIENT_DIR
 
+# Print "<prefix>1" .. "<prefix>N" to stdout, one per line (caller redirects)
+print-true-inputs() {
+  local num_unique_values=$1
+  local prefix=$2
+  seq 1 "$num_unique_values" | awk -v prefix="$prefix" '{print prefix $1}'
+}
+
 # Generate a single test case, specified by a line of the test spec.
 # This is a helper function for _run_tests().
 _setup-one-case() {
@@ -74,6 +81,21 @@ _setup-one-case() {
 
   echo 'k,h,m,p,q,f' > $params_path
   echo "$num_bits,$num_hashes,$num_cohorts,$p,$q,$f" >> $params_path
+
+  print-true-inputs $[num_unique_values+num_extras] \
+    "str" > $case_dir/case_true_inputs1.txt
+  print-true-inputs $num_unique_values2 "opt" > $case_dir/case_true_inputs2.txt
+
+  # Hash candidates
+  analysis/tools/hash_candidates.py \
+    $params_path \
+    < $case_dir/case_true_inputs1.txt \
+    > $case_dir/case_map1.csv
+
+  analysis/tools/hash_candidates.py \
+    $params_path \
+    < $case_dir/case_true_inputs2.txt \
+    > $case_dir/case_map2.csv
 }
 
 # Run a single test instance, specified by <test_name, instance_num>.
diff --git a/tests/gen_assoc_reports.R b/tests/gen_assoc_reports.R
new file mode 100755
index 00000000..41eb045a
--- /dev/null
+++ b/tests/gen_assoc_reports.R
@@ -0,0 +1,84 @@
+#!/usr/bin/env Rscript
+#
+# Copyright 2015 Google Inc. All rights reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+source('tests/gen_counts.R')
+
+# Usage:
+#
+# $ ./gen_assoc_reports.R 100 20 10000 foo.csv
+#
+# Inputs:
+#   size of the distribution's support for var 1
+#   size of the distribution's support for var 2
+#   number of clients
+#   name of the output file
+# Output:
+#   csv file with reports sampled according to the specified distribution. 
+
+main <- function(argv) {  # argv: support of var 1, support of var 2, N, output
+  n <- list(as.integer(argv[[1]]), as.integer(argv[[2]]))
+  N <- as.integer(argv[[3]])
+  out_file <- argv[[4]]
+
+  # Partition `size` over `support` values using a "zipf1.5" pdf; the result
+  # is reversed for odd `index`, so the shape of variable 2's distribution
+  # depends on the value of variable 1 (correlated zipf x zipf).
+  PartitionWithCorrelation <- function(size, support, index) {
+    part <- RandomPartition(size, ComputePdf("zipf1.5", support))
+    if (index %% 2 == 0) {part} else {rev(part)}
+  }
+  
+  # Zipfian over n[[1]] strings
+  part <- RandomPartition(N, ComputePdf("zipf1.5", n[[1]]))
+  # Zipfian over n[[2]] strings for each of variable 1
+  final_part <- as.vector(sapply(1:n[[1]],
+                  function(i) PartitionWithCorrelation(part[[i]], n[[2]], i)))
+  
+  final_part <- matrix(final_part, nrow = n[[1]], byrow = TRUE)
+  rownames(final_part) <- sapply(1:n[[1]], function(x) paste("str", x, sep = ""))
+  colnames(final_part) <- sapply(1:n[[2]], function(x) paste("opt", x, sep = ""))
+  distr <- final_part/sum(final_part)
+  print("DISTRIBUTION")
+  print(distr)
+
+  print('PARTITION')
+  print(final_part)
+
+  # Expand partition into one (var 1 index, var 2 index) pair per client
+  values <- list(
+    rep(1:n[[1]], rowSums(final_part)),
+    unlist(sapply(1:n[[1]], function(x) rep(1:n[[2]], final_part[x, ]))))
+  
+  stopifnot((length(values[[1]]) == N) &
+              (length(values[[2]]) == N))
+
+  # Shuffle values randomly (may take a few sec for > 10^8 inputs)
+  perm <- sample(N)
+  values <- list(values[[1]][perm], values[[2]][perm])
+
+  # Prefix values with "str" / "opt" to get reports. Even slower than shuffling.
+  reports <- list(sprintf("str%d", values[[1]]),
+                  sprintf("opt%d", values[[2]]))
+
+  reports <- cbind(1:N, reports[[1]], reports[[2]])  # row number, strI, optJ
+
+  write.table(reports, file = out_file, row.names = FALSE, col.names = FALSE, 
+              sep = ",", quote = FALSE)
+}
+
+if (length(sys.frames()) == 0) {
+  main(commandArgs(TRUE))
+}
diff --git a/tests/regtest_spec.py b/tests/regtest_spec.py
index ef3c3ea5..31f1d1ac 100755
--- a/tests/regtest_spec.py
+++ b/tests/regtest_spec.py
@@ -124,12 +124,18 @@
 #    privacy params set)
 # The test config runs a test suite that is the cross product of all the above
 # sets
+#ASSOC_TEST_CONFIG = {
+#    'distr': ('fizz-tiny', 'fizz-tiny-bool',
+#              'fizz-small', 'fizz-small-bool',
+#              'fizz', 'fizz-bool'),
+#    'blooms': ('8x16','8x32',),# '16x32'),
+#    'privacy': ('eps_small','eps_chrome')
+#}
+
 ASSOC_TEST_CONFIG = {
-    'distr': ('fizz-tiny', 'fizz-tiny-bool',
-              'fizz-small', 'fizz-small-bool',
-              'fizz', 'fizz-bool'),
-    'blooms': ('8x16','8x32',)# '16x32'),
-    'privacy': ('eps_small','eps_chrome')
+  'distr': ('fizz-small',),
+  'blooms': ('8x32',),
+  'privacy': ('eps_small',)
 }
 
 #

From dd3dd839c640dd8738f8bc57e46ff3f5a45b0e1c Mon Sep 17 00:00:00 2001
From: Ananth Raghunathan <pseudorandom@google.com>
Date: Wed, 24 Jun 2015 22:30:36 -0700
Subject: [PATCH 30/67] Changes for running experiments with 2 way marginals.

---
 assoctest.sh               |  27 +-
 client/python/rappor.py    |   6 +-
 tests/analyze_assoc_expt.R | 592 +++++++++++++++++++++++--------------
 tests/regtest_spec.py      |   8 +-
 4 files changed, 402 insertions(+), 231 deletions(-)

diff --git a/assoctest.sh b/assoctest.sh
index 1144da1b..047d09fc 100755
--- a/assoctest.sh
+++ b/assoctest.sh
@@ -113,7 +113,28 @@ _run-one-instance() {
   local instance_dir=$ASSOCTEST_DIR/$test_case/$test_instance
   mkdir --verbose -p $instance_dir
 
-  banner "Running association input simulation"
+  banner "Generating input"
+
+  tests/gen_assoc_reports.R $num_unique_values $num_unique_values2 \
+                            $num_clients $instance_dir/case.csv
+
+  banner "Running RAPPOR client"
+  tests/rappor_assoc_sim.py \
+    --num-bits $num_bits \
+    --num-hashes $num_hashes \
+    --num-cohorts $num_cohorts \
+    -p $p \
+    -q $q \
+    -f $f \
+    -i $instance_dir/case.csv \
+    --out-prefix "$instance_dir/case"
+
+  analysis/tools/sum_bits_assoc.py \
+    $case_dir/case_params.csv \
+    "$instance_dir/case" \
+    < $instance_dir/case_out.csv
+
+  return
 
   # Setting up JSON file containing assoc_sim inputs with python
   python -c "import json; \
@@ -146,8 +167,8 @@ _run-one-instance() {
   python -c "import json; \
     f = file('$instance_dir/analyze_inp.json', 'w'); \
     inp = dict(); \
-    inp['maps'] = ['$instance_dir/map_1.csv',\
-                   '$instance_dir/map_2.csv']; \
+    inp['maps'] = ['$case_dir/case_map1.csv',\
+                   '$case_dir/case_map2.csv']; \
     inp['reports'] = '$instance_dir/reports.csv'; \
     inp['truefile'] = '$instance_dir/truedist.csv'; \
     inp['outdir'] = '$out_dir'; \
diff --git a/client/python/rappor.py b/client/python/rappor.py
index 4423f8af..5481963b 100644
--- a/client/python/rappor.py
+++ b/client/python/rappor.py
@@ -216,14 +216,16 @@ def __init__(self, params, user_id, rand_funcs=None):
     self.p_gen = self.rand_funcs.p_gen
     self.q_gen = self.rand_funcs.q_gen
 
-  def encode(self, word):
+  def encode(self, word, assigned_cohort = -1):
     """Compute rappor (Instantaneous Randomized Response)."""
     params = self.params
 
     cohort, uniform, f_mask = get_rappor_masks(self.user_id, word,
                                                params,
                                                self.rand_funcs)
-
+    if (assigned_cohort != -1) and (assigned_cohort in
+                                    range(0, params.num_cohorts)):
+      cohort = assigned_cohort
     bloom_bits_array = 0
     # Compute Bloom Filter
     for hash_no in xrange(params.num_hashes):
diff --git a/tests/analyze_assoc_expt.R b/tests/analyze_assoc_expt.R
index cbd2e7ff..27208017 100755
--- a/tests/analyze_assoc_expt.R
+++ b/tests/analyze_assoc_expt.R
@@ -60,6 +60,18 @@ ProcessMap <- function(map, params) {
   map
 }
 
+# TV distance = L1 distance / 2 = 1 - sum(min(df1|x, df2|x)), where df1|x and
+# df2|x are the inputs restricted to the intersection x of their row and
+# column names. Prints `statement`, then returns the distance (1 if x is empty).
+TVDistance <- function(df1, df2, statement = "TV DISTANCE") {
+  rowsi <- intersect(rownames(df1), rownames(df2))
+  colsi <- intersect(colnames(df1), colnames(df2))
+  print(statement)
+  v1 <- unlist(as.data.frame(df1)[rowsi, colsi], use.names = FALSE)
+  v2 <- unlist(as.data.frame(df2)[rowsi, colsi], use.names = FALSE)
+  1 - sum(pmin(as.numeric(v1), as.numeric(v2)))
+}
+
 # Function to combine reports
 # Currently assume 2-way marginals
 CombineReports <- function(reports1, reports2) {
@@ -190,154 +202,273 @@ GenerateNoiseMatrix <- function(params) {
   NoiseMatrix
 }
 
-
-main <- function(opts) {
-  ptm <- proc.time()
-  direct_simulation = TRUE
-  inp <- fromJSON(opts$inp)
+# ------------------------------------------------------------------------
+##
+## Direct simulation of reports without simulated variance
+## 
+## Inputs:
+##
+## Outputs:
+#
+# ------------------------------------------------------------------------
+DirectSimulationOfReports <- function(inp) {
   params <- ReadParameterFile(inp$params)
+  # TWO WAY ASSOCIATIONS; INPUTS SIMULATED DIRECTLY
   
-  if(direct_simulation == TRUE) {
-    # TWO WAY ASSOCIATIONS; INPUTS SIMULATED DIRECTLY
-    
-    strconstant <- c("string", "option")
-    N <- inp$num
-    n1 <- inp$varcandidates[[1]]
-    n2 <- inp$varcandidates[[2]]
-    
-    # Construct unique vals for each variable using strconstant
-    stopifnot(length(strconstant) == inp$numvars)
-    uvals <- lapply(1:inp$numvars,
-                    function(i) {
-                      apply(as.matrix(1:inp$varcandidates[[i]]),
-                            1,
-                            function(z) sprintf("%s%d", strconstant[[i]], z))
-                    })
-    
-    # Add extras if any
-    if(inp$extras > 0) {
-      uvals[[1]] <- c(uvals[[1]], apply(as.matrix(1:inp$extras), 1,
-                          function(z) sprintf("%s%d", strconstant[[1]], z + n1)))
-    }
-    
-    # Compute map
-    map <- lapply(uvals, function(u) CreateMap(u, params))
-    
-    # Trim maps to real # of candidates
-    # Use extras only for decoding
-    tmap <- lapply(map[[1]]$map, function(i) i[, 1:n1])
-    crmap_trimmed <- CombineMaps(tmap, map[[2]]$map)$crmap
-    
-    # Sample values to compute partition
-    # Zipfian over n1 strings
-    v1_part <- RandomPartition(N, ComputePdf("zipf1.5", n1))
-    # Zipfian over n2 strings for each of variable 1
-    # Distr. are correlated as in assoc_sim.R
-    final_part <- as.vector(sapply(1:n1,
-                    function(i) {
-                      v2_part <- RandomPartition(v1_part[[i]],
-                                                 ComputePdf("zipf1.5", n2))
-                      if (i %% 2 == 0) {v2_part} else {rev(v2_part)}
-                    }))
-    
-    td <- matrix(final_part/sum(final_part), nrow = n1, ncol = n2, byrow = TRUE)
-    v2_part <- RandomPartition(N, apply(td, 2, sum))
-    ow_parts <- list(v1_part, v2_part)
-    ow_parts[[1]] <- c(ow_parts[[1]], rep(0, inp$extra))
-    
-    # --------------
-    # Generate 1-way counts
-    ow_counts <- lapply(1:2, function(i)
-                        GenerateCounts(params, map[[i]]$rmap, ow_parts[[i]], 1))
-    found_strings <- lapply(1:2, function(i)
-                            Decode(ow_counts[[i]],
-                                   map[[i]]$rmap,
-                                   params, quick = TRUE)$fit$strings)
-    # --------------
-    
-    rownames(td) <- uvals[[1]][1:n1]  # Don't take into account extras
-    colnames(td) <- uvals[[2]]
-    print("TRUE DISTRIBUTION")
-    print(signif(td, 4))
-    cohorts <- as.matrix(
-      apply(as.data.frame(final_part), 1,
-            function(count) RandomPartition(count, rep(1, params$m))))
-    expanded <- apply(cohorts, 2, function(vec) rep(vec, each = ((params$k)**2)*4))
-    true_ones <- apply(expanded * crmap_trimmed, 1, sum)
-    
-    
-    
-    
-    NoiseMatrix <- GenerateNoiseMatrix(params)
-    after_noise <- as.vector(sapply(1:(length(true_ones)/4), 
-                                    function(x) 
-                                      t(NoiseMatrix) %*% true_ones[((x-1)*4+1):(x*4)]))
-    counts <- cbind(apply(cohorts, 1, sum),
-                    matrix(after_noise,
-                           nrow = params$m,
-                           ncol = 4 * (params$k**2),
-                           byrow = TRUE))
-    
-    params2 <- params
-    params2$k <- (params$k ** 2) * 4
-    
-    # Combine maps to feed into Decode2Way
-    # Prune first to found_strings
-    pruned <- lapply(1:2, function(i)
-                     lapply(map[[i]]$map, function(z) z[,found_strings[[i]]]))
-    crmap <- CombineMaps(pruned[[1]], pruned[[2]])$crmap
-    marginal <- Decode2Way(counts, crmap, params2)$fit
-    
-    # Fill in estimated results with rows and cols from td
-    ed <- matrix(0, nrow = (n1+inp$extra), ncol = n2)
-    rownames(ed) <- uvals[[1]]
-    colnames(ed) <- uvals[[2]]
-    for (cols in colnames(td)) {
-      for (rows in rownames(td)) {
-        ed[rows, cols] <- marginal[paste(rows, cols, sep = "x"), "Estimate"]
-      }
+  strconstant <- c("string", "option")
+  N <- inp$num
+  n1 <- inp$varcandidates[[1]]
+  n2 <- inp$varcandidates[[2]]
+  
+  # Construct unique vals for each variable using strconstant
+  stopifnot(length(strconstant) == inp$numvars)
+  uvals <- lapply(1:inp$numvars,
+                  function(i) {
+                    apply(as.matrix(1:inp$varcandidates[[i]]),
+                          1,
+                          function(z) sprintf("%s%d", strconstant[[i]], z))
+                  })
+  
+  # Add extras if any
+  if(inp$extras > 0) {
+    uvals[[1]] <- c(uvals[[1]], apply(as.matrix(1:inp$extras), 1,
+                                      function(z) sprintf("%s%d", strconstant[[1]], z + n1)))
+  }
+  
+  # Compute map
+  map <- lapply(uvals, function(u) CreateMap(u, params))
+  
+  # Trim maps to real # of candidates
+  # Use extras only for decoding
+  tmap <- lapply(map[[1]]$map, function(i) i[, 1:n1])
+  crmap_trimmed <- CombineMaps(tmap, map[[2]]$map)$crmap
+  
+  # Sample values to compute partition
+  # Zipfian over n1 strings
+  v1_part <- RandomPartition(N, ComputePdf("zipf1.5", n1))
+  # Zipfian over n2 strings for each of variable 1
+  # Distr. are correlated as in assoc_sim.R
+  final_part <- as.vector(sapply(1:n1,
+                                 function(i) {
+                                   v2_part <- RandomPartition(v1_part[[i]],
+                                                              ComputePdf("zipf1.5", n2))
+                                   if (i %% 2 == 0) {v2_part} else {rev(v2_part)}
+                                 }))
+  
+  td <- matrix(final_part/sum(final_part), nrow = n1, ncol = n2, byrow = TRUE)
+  v2_part <- RandomPartition(N, apply(td, 2, sum))
+  ow_parts <- list(v1_part, v2_part)
+  ow_parts[[1]] <- c(ow_parts[[1]], rep(0, inp$extra))
+  
+  # --------------
+  # Generate 1-way counts
+  ow_counts <- lapply(1:2, function(i)
+    GenerateCounts(params, map[[i]]$rmap, ow_parts[[i]], 1))
+  found_strings <- lapply(1:2, function(i)
+    Decode(ow_counts[[i]],
+           map[[i]]$rmap,
+           params, quick = TRUE)$fit$strings)
+  # --------------
+  
+  rownames(td) <- uvals[[1]][1:n1]  # Don't take into account extras
+  colnames(td) <- uvals[[2]]
+  print("TRUE DISTRIBUTION")
+  print(signif(td, 4))
+  cohorts <- as.matrix(
+    apply(as.data.frame(final_part), 1,
+          function(count) RandomPartition(count, rep(1, params$m))))
+  expanded <- apply(cohorts, 2, function(vec) rep(vec, each = ((params$k)**2)*4))
+  true_ones <- apply(expanded * crmap_trimmed, 1, sum)
+  
+  NoiseMatrix <- GenerateNoiseMatrix(params)
+  after_noise <- as.vector(sapply(1:(length(true_ones)/4), 
+                                  function(x) 
+                                    t(NoiseMatrix) %*% true_ones[((x-1)*4+1):(x*4)]))
+  counts <- cbind(apply(cohorts, 1, sum),
+                  matrix(after_noise,
+                         nrow = params$m,
+                         ncol = 4 * (params$k**2),
+                         byrow = TRUE))
+  
+  params2 <- params
+  params2$k <- (params$k ** 2) * 4
+  
+  # Combine maps to feed into Decode2Way
+  # Prune first to found_strings from Decode on 1-way counts
+  pruned <- lapply(1:2, function(i)
+    lapply(map[[i]]$map, function(z) z[,found_strings[[i]]]))
+  crmap <- CombineMaps(pruned[[1]], pruned[[2]])$crmap
+  marginal <- Decode2Way(counts, crmap, params2)$fit
+  
+  # Fill in estimated results with rows and cols from td
+  ed <- matrix(0, nrow = (n1+inp$extra), ncol = n2)
+  rownames(ed) <- uvals[[1]]
+  colnames(ed) <- uvals[[2]]
+  for (cols in colnames(td)) {
+    for (rows in rownames(td)) {
+      ed[rows, cols] <- marginal[paste(rows, cols, sep = "x"), "Estimate"]
     }
-    ed[is.na(ed)] <- 0
-    time_taken <- proc.time() - ptm
-    
-    print("2 WAY RESULTS")
-    print(signif(ed, 4))
-    print(l1d(td, ed, "L1 DISTANCE 2 WAY"))
-    print("PROC.TIME")
-    print(time_taken)
-    chisq_td <- chisq.test(td)[1][[1]][[1]]
-    chisq_ed <- chisq.test(ed)[1][[1]][[1]]
-    if(is.nan(chisq_ed)) {
-      chisq_ed <- 0
+  }
+  ed[is.na(ed)] <- 0
+  time_taken <- proc.time() - ptm
+  
+  print("2 WAY RESULTS")
+  print(signif(ed, 4))
+  print(TVDistance(td, ed, "TV DISTANCE 2 WAY ALGORITHM"))
+  print("PROC.TIME")
+  print(time_taken)
+  chisq_td <- chisq.test(td)[1][[1]][[1]]
+  chisq_ed <- chisq.test(ed)[1][[1]][[1]]
+  if(is.nan(chisq_ed)) {
+    chisq_ed <- 0
+  }
+  if(is.nan(chisq_td)) {
+    chisq_td <- 0
+  }
+  
+  metrics <- list(
+    td_chisq = chisq_td,
+    ed_chisq = chisq_ed,
+    tv = TVDistance(td, ed, ""),
+    time = time_taken[1],
+    dim1 = length(found_strings[[1]]),
+    dim2 = length(found_strings[[2]])
+  )
+  filename <- file.path(inp$outdir, 'metrics.csv')
+  write.csv(metrics, file = filename, row.names = FALSE)
+}
+
+# ------------------------------------------------------------------------
+##
+## Externally provided counts (gen_assoc_counts.R and sum_assoc_reports.py)
+## 2 WAY ASSOCIATION ONLY
+## 
+## Inputs:
+##    count files (2 way counts, individual marginal counts)
+##    map files (2 variables)
+##
+## Outputs:
+#
+# ------------------------------------------------------------------------
+ExternalCounts <- function(inp) {
+  params <- ReadParameterFile(inp$params)
+  # Ensure sufficient maps as required by number of vars
+  stopifnot(inp$numvars == length(inp$maps))
+  map <- lapply(inp$maps, function(o)
+    ProcessMap(ReadMapFile(o, params = params),
+               params = params))
+
+  # (2 way counts, marginal 1 counts, marginal 2 counts)
+  counts <- lapply(1:3, function(i) ReadCountsFile(inp$counts[[i]]))
+  
+  params2 <- params
+  params2$k <- (params$k ** 2) * 4
+  
+  # Prune candidates
+  found_strings <- lapply(1:2, function(i)
+    Decode(counts[[i + 1]],
+           map[[i]]$rmap,
+           params, quick = FALSE)$fit$strings)
+  
+  cmap <- mapply(CombineMaps, map[[1]]$map, map[[2]]$map)
+  # Combine cohorts into one map. Needed for Decode2Way
+  inds <- lapply(cmap, function(x) which(x, arr.ind = TRUE))
+  for (i in seq(1, length(inds))) {
+    inds[[i]][, 1] <- inds[[i]][, 1] + (i-1) * dim(cmap[[1]])[1]
+  }
+  inds <- do.call("rbind", inds)
+  
+  # inds[[2]][, 1] <- inds[[2]][, 1] + dim(cmap[[1]])[1]
+  # inds <- rbind(inds[[1]], inds[[2]])
+  crmap <- sparseMatrix(inds[, 1], inds[, 2], dims = c(
+    nrow(cmap[[1]]) * length(cmap),
+    ncol(cmap[[1]])))
+  td <- read.csv(file = inp$truefile)
+  colnames(crmap) <- colnames(cmap[[1]])
+  counts <- ComputeCounts(creports, cohorts[[1]], params2)
+  marginal <- Decode2Way(counts, crmap, params2)$fit
+  
+  also_em = FALSE
+  ed_em <- list()
+  if(also_em == TRUE) {
+    joint_dist <- ComputeDistributionEM(reports, cohorts, map,
+                                        ignore_other = TRUE,
+                                        quick = TRUE,
+                                        params, marginals = NULL,
+                                        estimate_var = FALSE,
+                                        new_alg = inp$newalg)
+    ed_em <- joint_dist$orig$fit
+    if(length(reports) == 3) {
+      ed_em <- as.data.frame(ed_em)
     }
-    if(is.nan(chisq_td)) {
-      chisq_td <- 0
+  }
+  
+  ed <- td
+  for (cols in colnames(td)) {
+    for (rows in rownames(td)) {
+      ed[rows, cols] <- marginal[paste(rows, cols, sep = "x"), "Estimate"]
     }
-    
-    metrics <- list(
-      td_chisq = chisq_td,
-      ed_chisq = chisq_ed,
-      tv = l1d(td, ed, ""),
-      time = time_taken[1],
-      dim1 = length(found_strings[[1]]),
-      dim2 = length(found_strings[[2]])
-    )
-    filename <- file.path(inp$outdir, 'metrics.csv')
-    write.csv(metrics, file = filename, row.names = FALSE)
-  } else {
-    # ensure sufficient maps as required by number of vars
-    stopifnot(inp$numvars == length(inp$maps))
-    opts_map <- inp$maps
-    map <- lapply(opts_map, function(o)
-                    ProcessMap(ReadMapFile(o, params = params),
-                               params = params))
+  }
+  
+  time_taken <- proc.time() - ptm
+  
+  print("2 WAY RESULTS")
+  print(signif(ed[order(rowSums(ed)), ], 4))
+  print(TVDistance(td, ed, "TV DISTANCE 2 WAY"))
+  print("PROC.TIME")
+  print(time_taken)
+  chisq_ed <- chisq.test(ed)[1][[1]][[1]]
+  if(is.nan(chisq_ed)) {
+    chisq_ed <- 0
+  }
+  
+  metrics <- list(
+    td_chisq = chisq.test(td)[1][[1]][[1]],
+    ed_chisq = chisq_ed,
+    tv = TVDistance(td, ed, ""),
+    time = time_taken[1],
+    dim1 = length(found_strings[[1]]),
+    dim2 = length(found_strings[[2]])
+  )
+  
+  if(also_em == TRUE) {
+    # Add EM metrics
+    metrics <- c(metrics,
+                 list(ed_em_chisq = chisq.test(ed_em)[1][[1]][[1]],
+                      tv_em = TVDistance(td, ed_em, "")/2))
+  }
+  
+  # Write metrics to metrics.csv
+  filename <- file.path(inp$outdir, 'metrics.csv')
+  write.csv(metrics, file = filename, row.names = FALSE)
+}
+
+# ------------------------------------------------------------------------
+##
+## Externally provided reports
+## 2 OR 3 WAY ASSOCIATION
+## 
+## Inputs:
+##    
+## Outputs:
+#
+# ------------------------------------------------------------------------
+ExternalReports <- function(inp) {
+  params <- ReadParameterFile(inp$params)
+  # Ensure sufficient maps as required by number of vars
+  stopifnot(inp$numvars == length(inp$maps))
+  map <- lapply(inp$maps, function(o)
+    ProcessMap(ReadMapFile(o, params = params),
+               params = params))
+  
+  if (read_reports_flag == TRUE) {
     # Reports must be of the format
     #     cohort no, rappor bitstring 1, rappor bitstring 2, ...
     reportsObj <- read.csv(inp$reports,
                            colClasses = c("integer",
                                           rep("character", inp$numvars)),
                            header = FALSE)
-  
+    
     # Parsing reportsObj
     # ComputeDistributionEM allows for different sets of cohorts
     # for each variable. Here, both sets of cohorts are identical
@@ -345,7 +476,7 @@ main <- function(opts) {
     cohorts <- rep(list(co), inp$numvars)
     # Parse reports from reportObj cols 2, 3, ...
     reports <- lapply(1:inp$numvars, function(x) as.list(reportsObj[x + 1]))
-  
+    
     # Split strings into bit arrays (as required by assoc analysis)
     reports <- lapply(1:inp$numvars, function(i) {
       # apply the following function to each of reports[[1]] and reports[[2]]
@@ -354,96 +485,111 @@ main <- function(opts) {
         as.numeric(strsplit(x, split = "")[[1]])
       })
     })
-  
-    creports <- CombineReports(reports[[1]], reports[[2]])
-    params2 <- params
-    params2$k <- (params$k ** 2) * 4
-    # CombineMaps(map[[1]]$map[[1]], map[[2]]$map[[1]])
-    cmap <- mapply(CombineMaps, map[[1]]$map, map[[2]]$map)
-    # Combine cohorts into one map. Needed for Decode2Way
-    inds <- lapply(cmap, function(x) which(x, arr.ind = TRUE))
-    for (i in seq(1, length(inds))) {
-      inds[[i]][, 1] <- inds[[i]][, 1] + (i-1) * dim(cmap[[1]])[1]
-    }
-    inds <- do.call("rbind", inds)
-    
-    # inds[[2]][, 1] <- inds[[2]][, 1] + dim(cmap[[1]])[1]
-    # inds <- rbind(inds[[1]], inds[[2]])
-    crmap <- sparseMatrix(inds[, 1], inds[, 2], dims = c(
-                                                  nrow(cmap[[1]]) * length(cmap),
-                                                  ncol(cmap[[1]])))
-    td <- read.csv(file = inp$truefile)
-    colnames(crmap) <- colnames(cmap[[1]])
-    counts <- ComputeCounts(creports, cohorts[[1]], params2)
-    marginal <- Decode2Way(counts, crmap, params2)$fit
-    
-    also_em = FALSE
-    ed_em <- list()
-    if(also_em == TRUE) {
-      joint_dist <- ComputeDistributionEM(reports, cohorts, map,
-                                          ignore_other = TRUE,
-                                          quick = TRUE,
-                                          params, marginals = NULL,
-                                          estimate_var = FALSE,
-                                          new_alg = inp$newalg)
-      ed_em <- joint_dist$orig$fit
-      if(length(reports) == 3) {
-        ed_em <- as.data.frame(ed_em)
-      }
-    }
-    
-    ed <- td
-    for (cols in colnames(td)) {
-      for (rows in rownames(td)) {
-        ed[rows, cols] <- marginal[paste(rows, cols, sep = "x"), "Estimate"]
-      }
-    }
     
-    time_taken <- proc.time() - ptm
-    
-    print("2 WAY RESULTS")
-    print(signif(ed[order(rowSums(ed)), ], 4))
-    print(l1d(td, ed, "L1 DISTANCE 2 WAY"))
-    print("PROC.TIME")
-    print(time_taken)
-    chisq_ed <- chisq.test(ed)[1][[1]][[1]]
-    if(is.nan(chisq_ed)) {
-      chisq_ed <- 0
+    creports <- CombineReports(reports[[1]], reports[[2]])
+  }
+  
+  params2 <- params
+  params2$k <- (params$k ** 2) * 4
+  # CombineMaps(map[[1]]$map[[1]], map[[2]]$map[[1]])
+  cmap <- mapply(CombineMaps, map[[1]]$map, map[[2]]$map)
+  # Combine cohorts into one map. Needed for Decode2Way
+  inds <- lapply(cmap, function(x) which(x, arr.ind = TRUE))
+  for (i in seq(1, length(inds))) {
+    inds[[i]][, 1] <- inds[[i]][, 1] + (i-1) * dim(cmap[[1]])[1]
+  }
+  inds <- do.call("rbind", inds)
+  
+  # inds[[2]][, 1] <- inds[[2]][, 1] + dim(cmap[[1]])[1]
+  # inds <- rbind(inds[[1]], inds[[2]])
+  crmap <- sparseMatrix(inds[, 1], inds[, 2], dims = c(
+    nrow(cmap[[1]]) * length(cmap),
+    ncol(cmap[[1]])))
+  td <- read.csv(file = inp$truefile)
+  colnames(crmap) <- colnames(cmap[[1]])
+  counts <- ComputeCounts(creports, cohorts[[1]], params2)
+  marginal <- Decode2Way(counts, crmap, params2)$fit
+  
+  also_em = FALSE
+  ed_em <- list()
+  if(also_em == TRUE) {
+    joint_dist <- ComputeDistributionEM(reports, cohorts, map,
+                                        ignore_other = TRUE,
+                                        quick = TRUE,
+                                        params, marginals = NULL,
+                                        estimate_var = FALSE,
+                                        new_alg = inp$newalg)
+    ed_em <- joint_dist$orig$fit
+    if(length(reports) == 3) {
+      ed_em <- as.data.frame(ed_em)
     }
-    
-    metrics <- list(
-      td_chisq = chisq.test(td)[1][[1]][[1]],
-      ed_chisq = chisq_ed,
-      tv = l1d(td, ed, ""),
-      time = time_taken[1],
-      dim1 = length(found_strings[[1]]),
-      dim2 = length(found_strings[[2]])
-    )
-    
-    if(also_em == TRUE) {
-      # Add EM metrics
-      metrics <- c(metrics,
-                   list(ed_em_chisq = chisq.test(ed_em)[1][[1]][[1]],
-                        tv_em = l1d(td, ed_em, "")/2))
+  }
+  
+  ed <- td
+  for (cols in colnames(td)) {
+    for (rows in rownames(td)) {
+      ed[rows, cols] <- marginal[paste(rows, cols, sep = "x"), "Estimate"]
     }
-    
-    # Write metrics to metrics.csv
-    # Report l1 distance / 2 to be consistent with histogram analysis
-    filename <- file.path(inp$outdir, 'metrics.csv')
-    write.csv(metrics, file = filename, row.names = FALSE)
-  }  
+  }
+  
+  time_taken <- proc.time() - ptm
+  
+  print("2 WAY RESULTS")
+  print(signif(ed[order(rowSums(ed)), ], 4))
+  print(TVDistance(td, ed, "TV DISTANCE 2 WAY"))
+  print("PROC.TIME")
+  print(time_taken)
+  chisq_ed <- chisq.test(ed)[1][[1]][[1]]
+  if(is.nan(chisq_ed)) {
+    chisq_ed <- 0
+  }
+  
+  metrics <- list(
+    td_chisq = chisq.test(td)[1][[1]][[1]],
+    ed_chisq = chisq_ed,
+    tv = TVDistance(td, ed, ""),
+    time = time_taken[1],
+    dim1 = length(found_strings[[1]]),
+    dim2 = length(found_strings[[2]])
+  )
+  
+  if(also_em == TRUE) {
+    # Add EM metrics
+    metrics <- c(metrics,
+                 list(ed_em_chisq = chisq.test(ed_em)[1][[1]][[1]],
+                      tv_em = TVDistance(td, ed_em, "")/2))
+  }
+  
+  # Write metrics to metrics.csv
+  filename <- file.path(inp$outdir, 'metrics.csv')
+  write.csv(metrics, file = filename, row.names = FALSE)
 }
 
-# L1 distance = 1 - sum(min(df1|x, df2|x)) where
-# df1|x / df2|x projects the distribution to the intersection x of the
-# supports of df1 and df2
-l1d <- function(df1, df2, statement = "L1 DISTANCE") {
-  rowsi <- intersect(rownames(df1), rownames(df2))
-  colsi <- intersect(colnames(df1), colnames(df2))
-  print(statement)
-  1 - sum(mapply(min, 
-                 unlist(as.data.frame(df1)[rowsi, colsi], use.names = FALSE),
-                 unlist(as.data.frame(df2)[rowsi, colsi], use.names = FALSE)))
+main <- function(opts) {
+  ptm <- proc.time()
+  direct_simulation = FALSE
+  inp <- fromJSON(opts$inp)
+  
+  # Choose from a set of experiments to run
+  # direct -> direct simulation of reports (without variances)
+  # external-counts -> externally supplied counts for 2 way and marginals
+  # external-reports -> externally supplied reports 
+  if (!(inp$expt %in% c("direct", "external-counts", "external-reports"))) {
+    stop("Incorrect experiment in JSON file.")
+  }
+  
+  if(inp$expt == "direct") {
+    print("---------- RUNNING EXPERIMENT \"DIRECT\" ----------")
+    DirectSimulationOfReports(inp)
+  } 
+  if (inp$expt == "external-counts") {
+    print("---------- RUNNING EXPERIMENT \"EXT COUNTS\" ----------")
+    ExternalCounts(inp)  
+  }
+  if (inp$expt == "external-reports") {
+    print("---------- RUNNING EXPERIMENT \"EXT REPORTS\" ----------")
+    ExternalReports(inp)
+  }
 }
 
 if(!interactive()) {
diff --git a/tests/regtest_spec.py b/tests/regtest_spec.py
index 31f1d1ac..845dc93b 100755
--- a/tests/regtest_spec.py
+++ b/tests/regtest_spec.py
@@ -69,6 +69,7 @@
     'fizz': (100, 20, int(1e05), int(1e04)),
     'fizz-bool': (100, 2, int(1e05), int(1e04)),
 
+    'toy': (5, 2, 1e04, 20),  # for testing purposes only
     'compact-noextra-small': (40, 5, 1e04, 0),
     'loose-noextra-small': (100, 20, 1e04, 0),
     'compact-noextra-large': (40, 5, 1e06, 0),
@@ -94,6 +95,7 @@
 
 # 'p, q, f' as in params file.
 PRIVACY_PARAMS = {
+    'eps_zero': (0, 0.99, 0),  # testing purposes only!
     'eps_1_1': (0.39, 0.61, 0.45),  # eps_1 = 1, eps_inf = 5:
     'eps_1_5': (0.225, 0.775, 0.0),  # eps_1 = 5, no eps_inf
     'eps_verysmall': (0.125, 0.875, 0.125),
@@ -133,9 +135,9 @@
 #}
 
 ASSOC_TEST_CONFIG = {
-  'distr': ('fizz-small',),
-  'blooms': ('8x32',),
-  'privacy': ('eps_small',)
+  'distr': ('toy',),
+  'blooms': ('16x32',),
+  'privacy': ('eps_zero',)
 }
 
 #

From e296f6b5100f92225aa1ef2caadfcb0261118f48 Mon Sep 17 00:00:00 2001
From: Ananth Raghunathan <pseudorandom@google.com>
Date: Wed, 24 Jun 2015 23:55:25 -0700
Subject: [PATCH 31/67] Assoctest.sh test suite for experiments up.

---
 assoctest.sh               |  9 ++++--
 tests/analyze_assoc_expt.R | 64 +++++++++-----------------------------
 tests/regtest_spec.py      |  2 +-
 3 files changed, 22 insertions(+), 53 deletions(-)

diff --git a/assoctest.sh b/assoctest.sh
index 047d09fc..f433862f 100755
--- a/assoctest.sh
+++ b/assoctest.sh
@@ -134,7 +134,6 @@ _run-one-instance() {
     "$instance_dir/case" \
     < $instance_dir/case_out.csv
 
-  return
 
   # Setting up JSON file containing assoc_sim inputs with python
   python -c "import json; \
@@ -170,7 +169,7 @@ _run-one-instance() {
     inp['maps'] = ['$case_dir/case_map1.csv',\
                    '$case_dir/case_map2.csv']; \
     inp['reports'] = '$instance_dir/reports.csv'; \
-    inp['truefile'] = '$instance_dir/truedist.csv'; \
+    inp['truefile'] = '$instance_dir/case.csv'; \
     inp['outdir'] = '$out_dir'; \
     inp['params'] = '$case_dir/case_params.csv'; \
     inp['newalg'] = 'false'; \
@@ -178,6 +177,10 @@ _run-one-instance() {
     inp['num'] = $num_clients; \
     inp['extras'] = $num_extras; \
     inp['varcandidates'] = [$num_unique_values, $num_unique_values2]; \
+    inp['counts'] = ['$instance_dir/case_2way.csv',\
+                     '$instance_dir/case_marg1.csv',\
+                     '$instance_dir/case_marg2.csv']; \
+    inp['expt'] = 'external-counts'; \
     json.dump(inp, f); \
     f.close();"
 
@@ -267,7 +270,7 @@ _run-tests() {
   local processors=1
 
   if test $parallel = F; then
-    func=_run-one-instance  # output to the console
+    func=_run-one-instance-logged  # output to the console
   else
     func=_run-one-instance-logged
     processors=$(grep -c ^processor /proc/cpuinfo || echo 4)  # POSIX-specific
diff --git a/tests/analyze_assoc_expt.R b/tests/analyze_assoc_expt.R
index 27208017..be630207 100755
--- a/tests/analyze_assoc_expt.R
+++ b/tests/analyze_assoc_expt.R
@@ -68,8 +68,8 @@ TVDistance <- function(df1, df2, statement = "TV DISTANCE") {
   colsi <- intersect(colnames(df1), colnames(df2))
   print(statement)
   1 - sum(mapply(min, 
-                 unlist(as.data.frame(df1)[rowsi, colsi], use.names = FALSE),
-                 unlist(as.data.frame(df2)[rowsi, colsi], use.names = FALSE)))
+                 unlist(as.data.frame(df1[rowsi, colsi]), use.names = FALSE),
+                 unlist(as.data.frame(df2[rowsi, colsi]), use.names = FALSE)))
 }
 
 # Function to combine reports
@@ -351,6 +351,7 @@ DirectSimulationOfReports <- function(inp) {
 #
 # ------------------------------------------------------------------------
 ExternalCounts <- function(inp) {
+  ptm <- proc.time()
   params <- ReadParameterFile(inp$params)
   # Ensure sufficient maps as required by number of vars
   stopifnot(inp$numvars == length(inp$maps))
@@ -370,39 +371,15 @@ ExternalCounts <- function(inp) {
            map[[i]]$rmap,
            params, quick = FALSE)$fit$strings)
   
-  cmap <- mapply(CombineMaps, map[[1]]$map, map[[2]]$map)
-  # Combine cohorts into one map. Needed for Decode2Way
-  inds <- lapply(cmap, function(x) which(x, arr.ind = TRUE))
-  for (i in seq(1, length(inds))) {
-    inds[[i]][, 1] <- inds[[i]][, 1] + (i-1) * dim(cmap[[1]])[1]
-  }
-  inds <- do.call("rbind", inds)
-  
-  # inds[[2]][, 1] <- inds[[2]][, 1] + dim(cmap[[1]])[1]
-  # inds <- rbind(inds[[1]], inds[[2]])
-  crmap <- sparseMatrix(inds[, 1], inds[, 2], dims = c(
-    nrow(cmap[[1]]) * length(cmap),
-    ncol(cmap[[1]])))
-  td <- read.csv(file = inp$truefile)
-  colnames(crmap) <- colnames(cmap[[1]])
-  counts <- ComputeCounts(creports, cohorts[[1]], params2)
-  marginal <- Decode2Way(counts, crmap, params2)$fit
-  
-  also_em = FALSE
-  ed_em <- list()
-  if(also_em == TRUE) {
-    joint_dist <- ComputeDistributionEM(reports, cohorts, map,
-                                        ignore_other = TRUE,
-                                        quick = TRUE,
-                                        params, marginals = NULL,
-                                        estimate_var = FALSE,
-                                        new_alg = inp$newalg)
-    ed_em <- joint_dist$orig$fit
-    if(length(reports) == 3) {
-      ed_em <- as.data.frame(ed_em)
-    }
-  }
-  
+  # Combine maps to feed into Decode2Way
+  # Prune first to found_strings from Decode on 1-way counts
+  pruned <- lapply(1:2, function(i)
+    lapply(map[[i]]$map, function(z) z[,found_strings[[i]]]))
+  crmap <- CombineMaps(pruned[[1]], pruned[[2]])$crmap
+  marginal <- Decode2Way(counts[[1]], crmap, params2)$fit
+  td <- read.csv(file = inp$truefile, header = FALSE)
+  td <- table(td[,2:3])
+  td <- td / sum(td)
   ed <- td
   for (cols in colnames(td)) {
     for (rows in rownames(td)) {
@@ -412,8 +389,6 @@ ExternalCounts <- function(inp) {
   
   time_taken <- proc.time() - ptm
   
-  print("2 WAY RESULTS")
-  print(signif(ed[order(rowSums(ed)), ], 4))
   print(TVDistance(td, ed, "TV DISTANCE 2 WAY"))
   print("PROC.TIME")
   print(time_taken)
@@ -431,13 +406,6 @@ ExternalCounts <- function(inp) {
     dim2 = length(found_strings[[2]])
   )
   
-  if(also_em == TRUE) {
-    # Add EM metrics
-    metrics <- c(metrics,
-                 list(ed_em_chisq = chisq.test(ed_em)[1][[1]][[1]],
-                      tv_em = TVDistance(td, ed_em, "")/2))
-  }
-  
   # Write metrics to metrics.csv
   filename <- file.path(inp$outdir, 'metrics.csv')
   write.csv(metrics, file = filename, row.names = FALSE)
@@ -566,8 +534,6 @@ ExternalReports <- function(inp) {
 }
 
 main <- function(opts) {
-  ptm <- proc.time()
-  direct_simulation = FALSE
   inp <- fromJSON(opts$inp)
   
   # Choose from a set of experiments to run
@@ -579,15 +545,15 @@ main <- function(opts) {
   }
   
   if(inp$expt == "direct") {
-    print("---------- RUNNING EXPERIMENT \"DIRECT\" ----------")
+    print("---------- RUNNING EXPERIMENT DIRECT ----------")
     DirectSimulationOfReports(inp)
   } 
   if (inp$expt == "external-counts") {
-    print("---------- RUNNING EXPERIMENT \"EXT COUNTS\" ----------")
+    print("---------- RUNNING EXPERIMENT EXT COUNTS ----------")
     ExternalCounts(inp)  
   }
   if (inp$expt == "external-reports") {
-    print("---------- RUNNING EXPERIMENT \"EXT REPORTS\" ----------")
+    print("---------- RUNNING EXPERIMENT EXT REPORTS ----------")
     ExternalReports(inp)
   }
 }
diff --git a/tests/regtest_spec.py b/tests/regtest_spec.py
index 845dc93b..8a8ca60a 100755
--- a/tests/regtest_spec.py
+++ b/tests/regtest_spec.py
@@ -137,7 +137,7 @@
 ASSOC_TEST_CONFIG = {
   'distr': ('toy',),
   'blooms': ('16x32',),
-  'privacy': ('eps_zero',)
+  'privacy': ('eps_small',)
 }
 
 #

From 53233bab26a05d63ed47b4ea99178b36554593a8 Mon Sep 17 00:00:00 2001
From: Ananth Raghunathan <pseudorandom@google.com>
Date: Thu, 25 Jun 2015 12:02:17 -0700
Subject: [PATCH 32/67] The test suite now can run two experiments
 simultaneously.

---
 analysis/tools/sum_bits_assoc.py | 121 ++++++++++++++++++
 assoctest.sh                     |   8 +-
 tests/analyze_assoc_expt.R       | 157 ++++++++++-------------
 tests/make_summary_assoc.py      |  11 +-
 tests/rappor_assoc_sim.py        | 211 +++++++++++++++++++++++++++++++
 tests/regtest_spec.py            |  34 +++--
 6 files changed, 435 insertions(+), 107 deletions(-)
 create mode 100755 analysis/tools/sum_bits_assoc.py
 create mode 100755 tests/rappor_assoc_sim.py

diff --git a/analysis/tools/sum_bits_assoc.py b/analysis/tools/sum_bits_assoc.py
new file mode 100755
index 00000000..acf5ea2c
--- /dev/null
+++ b/analysis/tools/sum_bits_assoc.py
@@ -0,0 +1,121 @@
+#!/usr/bin/python
+#
+# Copyright 2014 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Read pairs of RAPPOR'd values on stdin, and sum their bits by cohort into one
+2-way (joint) and two marginal Counting Bloom filters, for analysis by R.
+"""
+
+import csv
+import sys
+
+import rappor
+
+
+def SumBits(params, stdin, f_2way, f_1, f_2):
+  """Sum pairs of IRR bit strings per cohort and write three count files.
+
+  Args:
+    params: object providing num_cohorts and num_bloombits.
+    stdin: iterable of CSV lines: a header row, then one row per report
+      holding (user_id, cohort, irr_1, irr_2), IRRs as '0'/'1' strings.
+    f_2way: output path for the joint counts; each bit pair gets four
+      counters, for (irr_1, irr_2) bit values 11, 01, 10, 00 in that order.
+    f_1, f_2: output paths for the per-variable (marginal) bit counts.
+
+  Each output row is one cohort: the number of reports, then the counters.
+
+  Raises:
+    RuntimeError: on a malformed row, an IRR of the wrong length, or an IRR
+      character other than '0' or '1'.
+  """
+  num_cohorts = params.num_cohorts
+  num_bloombits = params.num_bloombits
+
+  sums = [[0] * (4 * (num_bloombits ** 2)) for _ in xrange(num_cohorts)]
+  sums_1 = [[0] * num_bloombits for _ in xrange(num_cohorts)]
+  sums_2 = [[0] * num_bloombits for _ in xrange(num_cohorts)]
+  num_reports = [0] * num_cohorts
+  # Offset of the counter for each (irr_1 bit, irr_2 bit) combination.
+  offsets = {('1', '1'): 0, ('0', '1'): 1, ('1', '0'): 2, ('0', '0'): 3}
+
+  for row_num, row in enumerate(csv.reader(stdin)):
+    try:
+      (user_id, cohort, irr_1, irr_2) = row
+    except ValueError:
+      raise RuntimeError('Error parsing row %r' % row)
+
+    if row_num == 0:
+      continue  # skip header
+
+    cohort = int(cohort)
+    num_reports[cohort] += 1
+
+    # Both reports must be full width: any other length would silently
+    # misalign the bit positions computed below.
+    for irr in (irr_1, irr_2):
+      if len(irr) != num_bloombits:
+        raise RuntimeError(
+            "Expected %d bits, got %r" % (num_bloombits, len(irr)))
+      if irr.strip('01'):
+        raise RuntimeError('Invalid IRRs -- digits should be 0 or 1')
+
+    # Character 0 is the highest bit, e.g. char 0 = bit 15, char 15 = bit 0
+    bits_1 = [(num_bloombits - i - 1, c) for i, c in enumerate(irr_1)]
+    bits_2 = [(num_bloombits - j - 1, d) for j, d in enumerate(irr_2)]
+
+    # "Unrolled" joint encoding of both reports
+    for bit_1, c in bits_1:
+      for bit_2, d in bits_2:
+        index = 4 * (bit_1 * num_bloombits + bit_2)
+        sums[cohort][index + offsets[(c, d)]] += 1
+
+    for bit_1, c in bits_1:
+      sums_1[cohort][bit_1] += int(c == '1')
+    for bit_2, d in bits_2:
+      sums_2[cohort][bit_2] += int(d == '1')
+
+  # Open the outputs only once the input is parsed, and always close them.
+  with open(f_2way, "w") as f2, open(f_1, "w") as fa, open(f_2, "w") as fb:
+    writers = (csv.writer(f2), csv.writer(fa), csv.writer(fb))
+    for cohort in xrange(num_cohorts):
+      # First column is the total number of reports in the cohort.
+      for writer, counts in zip(writers, (sums, sums_1, sums_2)):
+        writer.writerow([num_reports[cohort]] + counts[cohort])
+
+
+def main(argv):
+  """Read reports on stdin; write <prefix>_{2way,marg1,marg2}.csv counts."""
+  try:
+    (filename, prefix) = argv[1:3]
+  except ValueError:  # fewer than two arguments were given
+    raise RuntimeError('Usage: sum_bits_assoc.py <params file> <prefix>')
+  with open(filename) as f:
+    try:
+      params = rappor.Params.from_csv(f)
+    except rappor.Error as e:
+      raise RuntimeError(e)
+
+  SumBits(params, sys.stdin, prefix + "_2way.csv",
+          prefix + "_marg1.csv", prefix + "_marg2.csv")
+
+
+if __name__ == '__main__':
+  try:
+    main(sys.argv)
+  except RuntimeError as failure:  # fatal: message to stderr, exit status 1
+    print >>sys.stderr, failure.args[0]
+    sys.exit(1)
diff --git a/assoctest.sh b/assoctest.sh
index f433862f..3cbd2f8b 100755
--- a/assoctest.sh
+++ b/assoctest.sh
@@ -274,8 +274,10 @@ _run-tests() {
   else
     func=_run-one-instance-logged
     processors=$(grep -c ^processor /proc/cpuinfo || echo 4)  # POSIX-specific
-    if test $processors -gt 1; then  # leave one CPU for the OS
-      processors=$(expr $processors - 1)
+    if test $processors -gt 3; then  # leave few CPUs for the OS
+      processors=$(expr $processors - 3)
+    else
+      processors=1
     fi
     log "Running $processors parallel processes"
   fi
@@ -326,7 +328,7 @@ run-all() {
   log "Running all tests. Can take a while."
   # a- for assoc tests
   # F for sequential
-  _run-tests '^a-' $instances F T
+  _run-tests '^a-' $instances T T
 }
 
 "$@"
diff --git a/tests/analyze_assoc_expt.R b/tests/analyze_assoc_expt.R
index be630207..77fc7df3 100755
--- a/tests/analyze_assoc_expt.R
+++ b/tests/analyze_assoc_expt.R
@@ -370,11 +370,16 @@ ExternalCounts <- function(inp) {
     Decode(counts[[i + 1]],
            map[[i]]$rmap,
            params, quick = FALSE)$fit$strings)
+  if (length(found_strings[[1]]) == 0 || length(found_strings[[2]]) == 0) {
+    print("FOUND_STRINGS")
+    print(found_strings)
+    stop("No strings found in 1-way marginal.")
+  }
   
   # Combine maps to feed into Decode2Way
   # Prune first to found_strings from Decode on 1-way counts
   pruned <- lapply(1:2, function(i)
-    lapply(map[[i]]$map, function(z) z[,found_strings[[i]]]))
+    lapply(map[[i]]$map, function(z) z[,found_strings[[i]], drop = FALSE]))
   crmap <- CombineMaps(pruned[[1]], pruned[[2]])$crmap
   marginal <- Decode2Way(counts[[1]], crmap, params2)$fit
   td <- read.csv(file = inp$truefile, header = FALSE)
@@ -386,19 +391,24 @@ ExternalCounts <- function(inp) {
       ed[rows, cols] <- marginal[paste(rows, cols, sep = "x"), "Estimate"]
     }
   }
+  ed[is.na(ed)] <- 0
   
   time_taken <- proc.time() - ptm
   
   print(TVDistance(td, ed, "TV DISTANCE 2 WAY"))
   print("PROC.TIME")
   print(time_taken)
+  chisq_td <- chisq.test(td)[1][[1]][[1]]
   chisq_ed <- chisq.test(ed)[1][[1]][[1]]
+  if(is.nan(chisq_td)) {
+    chisq_td <- 0
+  }
   if(is.nan(chisq_ed)) {
     chisq_ed <- 0
   }
   
   metrics <- list(
-    td_chisq = chisq.test(td)[1][[1]][[1]],
+    td_chisq = chisq_td,
     ed_chisq = chisq_ed,
     tv = TVDistance(td, ed, ""),
     time = time_taken[1],
@@ -414,14 +424,16 @@ ExternalCounts <- function(inp) {
 # ------------------------------------------------------------------------
 ##
 ## Externally provided reports
-## 2 OR 3 WAY ASSOCIATION
+## EM ALGORITHM
+## TODO: Also support 3 way association
 ## 
 ## Inputs:
 ##    
 ## Outputs:
 #
 # ------------------------------------------------------------------------
-ExternalReports <- function(inp) {
+ExternalReportsEM <- function(inp) {
+  ptm <- proc.time()
   params <- ReadParameterFile(inp$params)
   # Ensure sufficient maps as required by number of vars
   stopifnot(inp$numvars == length(inp$maps))
@@ -429,107 +441,68 @@ ExternalReports <- function(inp) {
     ProcessMap(ReadMapFile(o, params = params),
                params = params))
   
-  if (read_reports_flag == TRUE) {
-    # Reports must be of the format
-    #     cohort no, rappor bitstring 1, rappor bitstring 2, ...
-    reportsObj <- read.csv(inp$reports,
-                           colClasses = c("integer",
+  # Reports must have a header row and be of the format
+  #     client, cohort no, rappor bitstring 1, rappor bitstring 2, ...
+  reportsObj <- read.csv(inp$reports,
+                           colClasses = c("integer", "integer",
                                           rep("character", inp$numvars)),
-                           header = FALSE)
+                           header = TRUE)
+  # Ignore the first column
+  reportsObj <- reportsObj[,-1]
+  # Parsing reportsObj
+  # ComputeDistributionEM allows for different sets of cohorts
+  # for each variable. Here, both sets of cohorts are identical
+  co <- as.list(reportsObj[1])[[1]]
+  co <- co + 1  # 1 indexing
+  cohorts <- rep(list(co), inp$numvars)
+  # Parse reports from reportObj cols 2, 3, ...
+  reports <- lapply(1:inp$numvars, function(x) as.list(reportsObj[x + 1]))
     
-    # Parsing reportsObj
-    # ComputeDistributionEM allows for different sets of cohorts
-    # for each variable. Here, both sets of cohorts are identical
-    co <- as.list(reportsObj[1])[[1]]
-    cohorts <- rep(list(co), inp$numvars)
-    # Parse reports from reportObj cols 2, 3, ...
-    reports <- lapply(1:inp$numvars, function(x) as.list(reportsObj[x + 1]))
-    
-    # Split strings into bit arrays (as required by assoc analysis)
-    reports <- lapply(1:inp$numvars, function(i) {
-      # apply the following function to each of reports[[1]] and reports[[2]]
-      lapply(reports[[i]][[1]], function(x) {
-        # function splits strings and converts them to numeric values
-        as.numeric(strsplit(x, split = "")[[1]])
-      })
+  # Split strings into bit arrays (as required by assoc analysis)
+  reports <- lapply(1:inp$numvars, function(i) {
+    # apply the following function to each of reports[[1]] and reports[[2]]
+    lapply(reports[[i]][[1]], function(x) {
+      # function splits strings and converts them to numeric values
+      # rev needed for endianness
+      rev(as.numeric(strsplit(x, split = "")[[1]]))
     })
+  })
     
-    creports <- CombineReports(reports[[1]], reports[[2]])
-  }
-  
-  params2 <- params
-  params2$k <- (params$k ** 2) * 4
-  # CombineMaps(map[[1]]$map[[1]], map[[2]]$map[[1]])
-  cmap <- mapply(CombineMaps, map[[1]]$map, map[[2]]$map)
-  # Combine cohorts into one map. Needed for Decode2Way
-  inds <- lapply(cmap, function(x) which(x, arr.ind = TRUE))
-  for (i in seq(1, length(inds))) {
-    inds[[i]][, 1] <- inds[[i]][, 1] + (i-1) * dim(cmap[[1]])[1]
-  }
-  inds <- do.call("rbind", inds)
-  
-  # inds[[2]][, 1] <- inds[[2]][, 1] + dim(cmap[[1]])[1]
-  # inds <- rbind(inds[[1]], inds[[2]])
-  crmap <- sparseMatrix(inds[, 1], inds[, 2], dims = c(
-    nrow(cmap[[1]]) * length(cmap),
-    ncol(cmap[[1]])))
-  td <- read.csv(file = inp$truefile)
-  colnames(crmap) <- colnames(cmap[[1]])
-  counts <- ComputeCounts(creports, cohorts[[1]], params2)
-  marginal <- Decode2Way(counts, crmap, params2)$fit
-  
-  also_em = FALSE
-  ed_em <- list()
-  if(also_em == TRUE) {
-    joint_dist <- ComputeDistributionEM(reports, cohorts, map,
-                                        ignore_other = TRUE,
-                                        quick = TRUE,
-                                        params, marginals = NULL,
-                                        estimate_var = FALSE,
-                                        new_alg = inp$newalg)
-    ed_em <- joint_dist$orig$fit
-    if(length(reports) == 3) {
-      ed_em <- as.data.frame(ed_em)
-    }
-  }
-  
-  ed <- td
-  for (cols in colnames(td)) {
-    for (rows in rownames(td)) {
-      ed[rows, cols] <- marginal[paste(rows, cols, sep = "x"), "Estimate"]
-    }
-  }
-  
+  joint_dist <- ComputeDistributionEM(reports, cohorts, map,
+                                      ignore_other = TRUE,
+                                      quick = TRUE,
+                                      params, marginals = NULL,
+                                      estimate_var = FALSE,
+                                      new_alg = inp$newalg)
+  em <- joint_dist$orig$fit
+  td <- read.csv(file = inp$truefile, header = FALSE)
+  td <- table(td[,2:3])
+  td <- td / sum(td)
   time_taken <- proc.time() - ptm
   
-  print("2 WAY RESULTS")
-  print(signif(ed[order(rowSums(ed)), ], 4))
-  print(TVDistance(td, ed, "TV DISTANCE 2 WAY"))
+  print(TVDistance(td, em, "TV DISTANCE EM"))
   print("PROC.TIME")
   print(time_taken)
-  chisq_ed <- chisq.test(ed)[1][[1]][[1]]
+  chisq_td <- chisq.test(td)[1][[1]][[1]]
+  chisq_ed <- chisq.test(em)[1][[1]][[1]]
+  if(is.nan(chisq_td)) {
+    chisq_td <- 0
+  }
   if(is.nan(chisq_ed)) {
     chisq_ed <- 0
   }
   
   metrics <- list(
-    td_chisq = chisq.test(td)[1][[1]][[1]],
+    td_chisq = chisq_td,
     ed_chisq = chisq_ed,
-    tv = TVDistance(td, ed, ""),
+    tv = TVDistance(td, em, ""),
     time = time_taken[1],
-    dim1 = length(found_strings[[1]]),
-    dim2 = length(found_strings[[2]])
+    dim1 = dim(em)[[1]],
+    dim2 = dim(em)[[2]]
   )
   
-  if(also_em == TRUE) {
-    # Add EM metrics
-    metrics <- c(metrics,
-                 list(ed_em_chisq = chisq.test(ed_em)[1][[1]][[1]],
-                      tv_em = TVDistance(td, ed_em, "")/2))
-  }
-  
   # Write metrics to metrics.csv
-  filename <- file.path(inp$outdir, 'metrics.csv')
+  filename <- file.path(inp$outdir, 'metrics_2.csv')
   write.csv(metrics, file = filename, row.names = FALSE)
 }
 
@@ -540,21 +513,21 @@ main <- function(opts) {
   # direct -> direct simulation of reports (without variances)
   # external-counts -> externally supplied counts for 2 way and marginals
   # external-reports -> externally supplied reports 
-  if (!(inp$expt %in% c("direct", "external-counts", "external-reports"))) {
+  if (!all(inp$expt %in% c("direct", "external-counts", "external-reports-em"))) {
     stop("Incorrect experiment in JSON file.")
   }
   
-  if(inp$expt == "direct") {
+  if("direct" %in% inp$expt) {
     print("---------- RUNNING EXPERIMENT DIRECT ----------")
     DirectSimulationOfReports(inp)
   } 
-  if (inp$expt == "external-counts") {
+  if ("external-counts" %in% inp$expt) {
     print("---------- RUNNING EXPERIMENT EXT COUNTS ----------")
     ExternalCounts(inp)  
   }
-  if (inp$expt == "external-reports") {
+  if ("external-reports-em" %in% inp$expt) {
     print("---------- RUNNING EXPERIMENT EXT REPORTS ----------")
-    ExternalReports(inp)
+    ExternalReportsEM(inp)
   }
 }
 
diff --git a/tests/make_summary_assoc.py b/tests/make_summary_assoc.py
index 67843b4d..5cf29fe0 100755
--- a/tests/make_summary_assoc.py
+++ b/tests/make_summary_assoc.py
@@ -124,7 +124,7 @@ def MeanOfMeans(dict_of_lists):
     return None
 
 
-def ParseSpecFile(spec_filename):
+def ParseSpecFile(spec_filename, empty = False):
   """Parses the spec (parameters) file.
 
   Returns:
@@ -135,6 +135,8 @@ def ParseSpecFile(spec_filename):
     spec_row = s.readline().split()
 
   spec_in_html = ' '.join('<td>%s</td>' % cell for cell in spec_row[3:])
+  if empty == True:
+    spec_in_html = ' '.join('<td></td>' for cell in spec_row[3:])
 
   return spec_in_html
 
@@ -321,6 +323,13 @@ def main(argv):
 
     print '<tr>{}{}{}</tr>'.format(cell1_html, spec_html, metrics_html)
 
+    # Printing metrics 2 if available
+    metrics_file = os.path.join(report_dir, 'metrics_2.csv')
+    if (os.path.isfile(metrics_file)):
+      metrics_dict, metrics_html = ParseMetrics(metrics_file, log_file)
+      print '<tr><td></td>{}{}</tr>'.format(ParseSpecFile(spec_file, empty =
+                                                        True), metrics_html)
+
   print FormatSummaryRow(metrics)
 
   print '</tbody>'
diff --git a/tests/rappor_assoc_sim.py b/tests/rappor_assoc_sim.py
new file mode 100755
index 00000000..1c6c026d
--- /dev/null
+++ b/tests/rappor_assoc_sim.py
@@ -0,0 +1,211 @@
+#!/usr/bin/python
+#
+# Copyright 2014 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tool to run RAPPOR on simulated client input.
+
+It takes a 3-column CSV file as generated by gen_sim_data.py, and outputs one
+file:
+
+  - <out-prefix>_out.csv: 4 column CSV of RAPPOR'd data, with one IRR for each
+    of the two variables; both are encoded under the same cohort.
+
+The params, true inputs and hist files are not written by this tool; the
+RAPPOR parameters it uses are taken from the command-line flags.
+
+Input columns: client,true_value_1,true_value_2
+Output columns: client,cohort,rappor_var1,rappor_var2
+
+See http://google.github.io/rappor/doc/data-flow.html for details.
+"""
+
+import csv
+import collections
+import optparse
+import os
+import random
+import sys
+import time
+
+import rappor  # client library
+try:
+  import fastrand
+except ImportError:
+  print >>sys.stderr, (
+      "Native fastrand module not imported; see README for speedups")
+  fastrand = None
+
+
+def log(msg, *args):
+  """Write msg to stderr, %-formatting it with args when any are given."""
+  text = msg % args if args else msg
+  print >>sys.stderr, text
+
+
+def CreateOptionsParser():
+  """Build the optparse parser for this tool's command-line flags.
+
+  Returns:
+    An optparse.OptionParser; no arguments are parsed here.
+  """
+  parser = optparse.OptionParser()
+  add = parser.add_option
+
+  # Input and output locations.  -i takes a path rather than stdin; the
+  # original note gives the reason as the input being read twice.
+  add('-i', dest='infile', metavar='PATH', type='str', default='',
+      help='CSV input path.  Header is "client,true_value"')
+  add('--out-prefix', dest='out_prefix', metavar='PATH', type='str',
+      default='', help='Output prefix.')
+
+  # Bloom filter and cohort sizes.
+  add('--num-bits', type='int', metavar='INT', dest='num_bits', default=16,
+      help='Number of bloom filter bits.')
+  add('--num-hashes', type='int', metavar='INT', dest='num_hashes', default=2,
+      help='Number of hashes.')
+  add('--num-cohorts', type='int', metavar='INT', dest='num_cohorts',
+      default=64, help='Number of cohorts.')
+
+  # The three probabilities; each defaults to 1.
+  add('-p', type='float', metavar='FLOAT', dest='prob_p', default=1,
+      help='Probability p')
+  add('-q', type='float', metavar='FLOAT', dest='prob_q', default=1,
+      help='Probability q')
+  add('-f', type='float', metavar='FLOAT', dest='prob_f', default=1,
+      help='Probability f')
+
+  # Boolean switch, off unless given.
+  add('--oneprr', dest='oneprr', action='store_true', default=False,
+      help='Use a consistent PRR.')
+
+  # Random algorithm, restricted to a fixed set of names.
+  modes = ['simple', 'fast']
+  add('-r', type='choice', metavar='STR', dest='random_mode',
+      default='fast', choices=modes,
+      help='Random algorithm (%s)' % '|'.join(modes))
+
+  return parser
+
+
+def make_histogram(csv_in):
+  """Return a dict counting each value in 2-field (client, value) rows."""
+  # TODO: It would be better to share parsing with rappor_encode()
+  tally = collections.Counter()
+  tally.update(value for (_, value) in csv_in)
+  # most_common() yields (value, count) pairs; dict() makes them a mapping.
+  return dict(tally.most_common())
+
+
+def print_histogram(word_hist, histfile):
+  """Write word_hist to histfile as "string,count" lines, largest count first.
+
+  The first line written is the literal header "string,count".
+  """
+  ranked = sorted(word_hist.iteritems(), reverse=True,
+                  key=lambda entry: entry[1])
+  for row in [("string", "count")] + ranked:
+    print >>histfile, "%s,%s" % row
+
+
+def bit_string(irr, num_bloombits):
+  """Render irr as a fixed-width binary string, most significant bit first.
+
+  Like bin(), but zero-padded to num_bloombits characters and without the
+  '0b' prefix.  Bits of irr at positions >= num_bloombits are ignored.
+  """
+  bits = []
+  for bit_num in xrange(num_bloombits):
+    bits.append('1' if irr & (1 << bit_num) else '0')
+  return ''.join(reversed(bits))
+
+
+def main(argv):
+  """Encode each input row's two values and write <out-prefix>_out.csv."""
+  (opts, argv) = CreateOptionsParser().parse_args(argv)
+  if not opts.infile:
+    raise RuntimeError('-i is required')
+  if not opts.out_prefix:
+    raise RuntimeError('--out-prefix is required')
+
+  # Copy flags into params
+  params = rappor.Params()
+  params.num_bloombits = opts.num_bits
+  params.num_hashes = opts.num_hashes
+  params.num_cohorts = opts.num_cohorts
+  params.prob_p = opts.prob_p
+  params.prob_q = opts.prob_q
+  params.prob_f = opts.prob_f
+  params.flag_oneprr = opts.oneprr
+
+  prefix = opts.out_prefix
+  outfile = prefix + "_out.csv"
+
+  rand = random.Random()  # default Mersenne Twister randomness
+  #rand = random.SystemRandom()  # cryptographic randomness from OS
+
+  rand.seed()  # Default: seed with sys time
+
+  if opts.random_mode == 'simple':
+    rand_funcs = rappor.SimpleRandFuncs(params, rand)
+  elif opts.random_mode == 'fast':
+    if fastrand:
+      log('Using fastrand extension')
+      # NOTE: This doesn't take 'rand'
+      rand_funcs = fastrand.FastRandFuncs(params)
+    else:
+      log('Warning: fastrand module not importable; see README for build '
+          'instructions.  Falling back to simple randomness.')
+      rand_funcs = rappor.SimpleRandFuncs(params, rand)
+  else:
+    raise AssertionError  # -r is a 'choice' option limited to the two above
+
+  # Do RAPPOR transformation.
+  with open(opts.infile) as f_in, open(outfile, 'w') as f_out:
+    csv_in = csv.reader(f_in)
+    csv_out = csv.writer(f_out)
+
+    header = ('client', 'cohort', 'rappor_var1', 'rappor_var2')
+    csv_out.writerow(header)
+
+    cur_client = None  # current client
+
+    start_time = time.time()
+    # NOTE(review): the first input row is encoded too; no header is skipped.
+    for i, (client, true_value_1, true_value_2) in enumerate(csv_in):
+      if i % 10000 == 0:
+        elapsed = time.time() - start_time
+        log('Processed %d inputs in %.2f seconds', i, elapsed)
+
+      # New encoder whenever the client differs from the previous row's.
+      if client != cur_client:
+        cur_client = client
+        e = rappor.Encoder(params, cur_client, rand_funcs=rand_funcs)
+
+      cohort, irr_1 = e.encode(true_value_1)
+      cohort_check, irr_2 = e.encode(true_value_2, assigned_cohort = cohort)
+      # Ensure same cohort used for irr_1, and irr_2
+      assert cohort_check == cohort
+
+      # One output row per input row: client, cohort, both IRRs as bit strings
+      out_row = (client, cohort, bit_string(irr_1, params.num_bloombits),
+                 bit_string(irr_2, params.num_bloombits))
+      csv_out.writerow(out_row)
+
+
+if __name__ == "__main__":
+  try:
+    main(sys.argv)
+  except RuntimeError as e:
+    sys.exit('rappor_assoc_sim.py: FATAL: %s' % e)  # to stderr, exit status 1
diff --git a/tests/regtest_spec.py b/tests/regtest_spec.py
index 8a8ca60a..68599cf4 100755
--- a/tests/regtest_spec.py
+++ b/tests/regtest_spec.py
@@ -126,18 +126,30 @@
 #    privacy params set)
 # The test config runs a test suite that is the cross product of all the above
 # sets
-#ASSOC_TEST_CONFIG = {
-#    'distr': ('fizz-tiny', 'fizz-tiny-bool',
-#              'fizz-small', 'fizz-small-bool',
-#              'fizz', 'fizz-bool'),
-#    'blooms': ('8x16','8x32',),# '16x32'),
-#    'privacy': ('eps_small','eps_chrome')
-#}
-
 ASSOC_TEST_CONFIG = {
-  'distr': ('toy',),
-  'blooms': ('16x32',),
-  'privacy': ('eps_small',)
+  'distr': ('fizz-tiny',
+            'fizz-tiny-bool',
+#            'fizz-small',
+#            'fizz-small-bool',
+#            'fizz',
+#            'fizz-bool',
+            'toy',),
+#            'compact-noextra-small',
+#            'loose-noextra-small',
+#            'compact-noextra-large',
+#            'loose-noextra-large',
+#            'compact-extra-small',
+#            'loose-extra-small',
+#            'compact-extra-large',
+#            'loose-extra-large',
+#            'compact-excess-small',
+#            'loose-excess-small',
+#            'compact-excess-large',
+#            'loose-excess-large'),
+  'blooms': ('8x32',
+             '16x32'),
+  'privacy': ('eps_small',
+              'eps_chrome')
 }
 
 #

From bfb257ed001ba4e6a08cfea66e1c559593338d4d Mon Sep 17 00:00:00 2001
From: Ananth Raghunathan <pseudorandom@google.com>
Date: Thu, 25 Jun 2015 12:14:23 -0700
Subject: [PATCH 33/67] Now considering both experiments. Fixing small changes.

---
 assoctest.sh          |  4 ++--
 tests/regtest_spec.py | 13 +++++++------
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/assoctest.sh b/assoctest.sh
index 3cbd2f8b..7a4ef8a3 100755
--- a/assoctest.sh
+++ b/assoctest.sh
@@ -168,7 +168,7 @@ _run-one-instance() {
     inp = dict(); \
     inp['maps'] = ['$case_dir/case_map1.csv',\
                    '$case_dir/case_map2.csv']; \
-    inp['reports'] = '$instance_dir/reports.csv'; \
+    inp['reports'] = '$instance_dir/case_out.csv'; \
     inp['truefile'] = '$instance_dir/case.csv'; \
     inp['outdir'] = '$out_dir'; \
     inp['params'] = '$case_dir/case_params.csv'; \
@@ -180,7 +180,7 @@ _run-one-instance() {
     inp['counts'] = ['$instance_dir/case_2way.csv',\
                      '$instance_dir/case_marg1.csv',\
                      '$instance_dir/case_marg2.csv']; \
-    inp['expt'] = 'external-counts'; \
+    inp['expt'] = ['external-counts', 'external-reports-em']; \
     json.dump(inp, f); \
     f.close();"
 
diff --git a/tests/regtest_spec.py b/tests/regtest_spec.py
index 68599cf4..d029798f 100755
--- a/tests/regtest_spec.py
+++ b/tests/regtest_spec.py
@@ -127,13 +127,14 @@
 # The test config runs a test suite that is the cross product of all the above
 # sets
 ASSOC_TEST_CONFIG = {
-  'distr': ('fizz-tiny',
+  'distr': (
+            'fizz-tiny',
             'fizz-tiny-bool',
-#            'fizz-small',
-#            'fizz-small-bool',
-#            'fizz',
-#            'fizz-bool',
-            'toy',),
+            'fizz-small',
+            'fizz-small-bool',
+            'fizz',
+            'fizz-bool',),
+#            'toy',),
 #            'compact-noextra-small',
 #            'loose-noextra-small',
 #            'compact-noextra-large',

From 9fd30deeb3b42e2f5c76bd22ebc2e5dc154ef207 Mon Sep 17 00:00:00 2001
From: Ananth Raghunathan <pseudorandom@google.com>
Date: Thu, 25 Jun 2015 16:01:39 -0700
Subject: [PATCH 34/67] Small changes to assoc suite.

---
 analysis/R/decode.R         |  2 +-
 tests/analyze_assoc_expt.R  |  2 +-
 tests/make_summary_assoc.py | 11 ++++++++---
 tests/regtest_spec.py       | 16 +++++++++-------
 4 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/analysis/R/decode.R b/analysis/R/decode.R
index c84a23dd..6e0522ac 100644
--- a/analysis/R/decode.R
+++ b/analysis/R/decode.R
@@ -383,7 +383,7 @@ Decode <- function(counts, map, params, quick = FALSE, alpha = 0.05,
   
   # Only select coefficients more than two standard deviations from 0. May
   # inflate empirical SD of the estimates.
-  reported <- which(coefs_ave > 1E-6 + 2 * coefs_ssd)
+  reported <- which(coefs_ave > 1E-6 + 1 * coefs_ssd)
   
   mod <- list(coefs = coefs_ave[reported], stds = coefs_ssd[reported])
 
diff --git a/tests/analyze_assoc_expt.R b/tests/analyze_assoc_expt.R
index 77fc7df3..88f11540 100755
--- a/tests/analyze_assoc_expt.R
+++ b/tests/analyze_assoc_expt.R
@@ -369,7 +369,7 @@ ExternalCounts <- function(inp) {
   found_strings <- lapply(1:2, function(i)
     Decode(counts[[i + 1]],
            map[[i]]$rmap,
-           params, quick = FALSE)$fit$strings)
+           params, quick = TRUE)$fit$strings)
   if (length(found_strings[[1]]) == 0 || length(found_strings[[2]]) == 0) {
     print("FOUND_STRINGS")
     print(found_strings)
diff --git a/tests/make_summary_assoc.py b/tests/make_summary_assoc.py
index 5cf29fe0..665ef9f9 100755
--- a/tests/make_summary_assoc.py
+++ b/tests/make_summary_assoc.py
@@ -157,7 +157,7 @@ def ExtractTime(log_filename):
   return None
 
 
-def ParseMetrics(metrics_file, log_file):
+def ParseMetrics(metrics_file, log_file, italics = False):
   """Processes the metrics file.
 
   Args:
@@ -200,7 +200,11 @@ def ParseMetrics(metrics_file, log_file):
   }
 
   # return metrics formatted as HTML table entries
-  return (metrics_row_dict,
+  if(italics == True):
+    return (metrics_row_dict,
+          ' '.join('<td><i>%s</i></td>' % cell for cell in metrics_row_str))
+  else:
+    return (metrics_row_dict,
           ' '.join('<td>%s</td>' % cell for cell in metrics_row_str))
 
 
@@ -326,7 +330,8 @@ def main(argv):
     # Printing metrics 2 if available
     metrics_file = os.path.join(report_dir, 'metrics_2.csv')
     if (os.path.isfile(metrics_file)):
-      metrics_dict, metrics_html = ParseMetrics(metrics_file, log_file)
+      metrics_dict, metrics_html = ParseMetrics(metrics_file, log_file,
+                                                italics = True)
       print '<tr><td></td>{}{}</tr>'.format(ParseSpecFile(spec_file, empty =
                                                         True), metrics_html)
 
diff --git a/tests/regtest_spec.py b/tests/regtest_spec.py
index d029798f..db8a8566 100755
--- a/tests/regtest_spec.py
+++ b/tests/regtest_spec.py
@@ -131,9 +131,9 @@
             'fizz-tiny',
             'fizz-tiny-bool',
             'fizz-small',
-            'fizz-small-bool',
-            'fizz',
-            'fizz-bool',),
+            'fizz-small-bool',),
+#            'fizz',
+#            'fizz-bool',),
 #            'toy',),
 #            'compact-noextra-small',
 #            'loose-noextra-small',
@@ -147,10 +147,12 @@
 #            'loose-excess-small',
 #            'compact-excess-large',
 #            'loose-excess-large'),
-  'blooms': ('8x32',
-             '16x32'),
-  'privacy': ('eps_small',
-              'eps_chrome')
+  'blooms': (
+             '8x32',
+             '16x32',),
+  'privacy': (
+              'eps_small',
+              'eps_chrome',)
 }
 
 #

From 673348699a7a186934088d28b523e0a910c529f2 Mon Sep 17 00:00:00 2001
From: Ananth Raghunathan <pseudorandom@google.com>
Date: Mon, 29 Jun 2015 11:04:58 -0700
Subject: [PATCH 35/67] Adding analysis/R/alternative.R.

---
 analysis/R/alternative.R | 109 ++++++++++++++++++---------------------
 1 file changed, 50 insertions(+), 59 deletions(-)

diff --git a/analysis/R/alternative.R b/analysis/R/alternative.R
index 3f0e66d3..d7869439 100644
--- a/analysis/R/alternative.R
+++ b/analysis/R/alternative.R
@@ -1,83 +1,74 @@
 # Copyright 2014 Google Inc. All rights reserved.
-#
+# 
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-#
+# 
 #     http://www.apache.org/licenses/LICENSE-2.0
-#
+# 
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-library(limSolve)
-library(Matrix)
+# alternative.R
+#
+# This is some messy code to test out alternative regression using pcls().
 
-# The next two functions create a matrix (G) and a vector (H) encoding
-# linear inequality constraints that a solution vector (x) must satisfy:
-#                       G * x >= H
+library(mgcv)
 
-# Currently represent three sets of constraints on the solution vector:
-#  - all solution coefficients are nonnegative
-#  - the sum total of all solution coefficients is no more than 1
-#  - in each of the coordinates of the target vector (estimated Bloom filter)
-#    we don't overshoot by more than three standard deviations.
-MakeG <- function(n, X) {
-  d <- Diagonal(n)
-  last <- rep(-1, n)
-  rbind2(rbind2(d, last), -X)
-}
 
-MakeH <- function(n, Y, stds) {
-  # set the floor at 0.01 to avoid degenerate cases
-  YY <- apply(Y + 3 * stds,  # in each bin don't overshoot by more than 3 stds
-              1:2,
-              function(x) min(1, max(0.01, x)))  # clamp the bound to [0.01,1]
+# uniform vector
+makep = function(n) {
+  rep(1, n) / (n+1)
+}
 
-  c(rep(0, n),  # non-negativity condition
-    -1,         # coefficients sum up to no more than 1
-    -as.vector(t(YY))   # t is important!
-    )
+# diagonal matrix with -1
+makeAin = function(n) {
+  d = diag(x=1, n, n)
+  last = rep(-1, n)
+  rbind(d, last)
 }
 
-MakeLseiModel <- function(X, Y, stds) {
-  m <- dim(X)[1]
-  n <- dim(X)[2]
+makebin = function(n) {
+  #ratio = 172318 / 128
+  # NOTE: Hard-coded hacks here
+  ratio = 70000 / 64
+  #ratio = 490000 / 64
 
-# no slack variables for now
-#   slack <- Matrix(FALSE, nrow = m, ncol = m, sparse = TRUE)
-#   colnames(slack) <- 1:m
-#   diag(slack) <- TRUE
-#
-#   G <- MakeG(n + m)
-#   H <- MakeH(n + m)
-#
-#   G[n+m+1,n:(n+m)] <- -0.1
-#  A = cbind2(X, slack)
+  print("RATIO")
+  print(ratio)
+
+  c(rep(0, n), -ratio)
+}
 
-  w <- as.vector(t(1 / stds))
-  w_median <- median(w[!is.infinite(w)])
-  if(is.na(w_median))  # all w are infinite
-    w_median <- 1
-  w[w > w_median * 2] <- w_median * 2
-  w <- w / mean(w)
+makeM = function(X,Y) {
+  n=dim(X)[2]
+  p = makep(n)
+  Ain = makeAin(n)
+  bin = makebin(n)
 
-  list(# coerce sparse Boolean matrix X to sparse numeric matrix
-       A = Diagonal(x = w) %*% (X + 0),
-       B = as.vector(t(Y)) * w,  # transform to vector in the row-first order
-       G = MakeG(n, X),
-       H = MakeH(n, Y, stds),
-       type = 2)  # Since there are no equality constraints, lsei defaults to
-                  # solve.QP anyway, but outputs a warning unless type == 2.
+  list(X=as.matrix(X),
+       p=p,
+       off=array(0,0),
+       S=list(),
+       Ain=Ain,
+       bin=bin,
+       C=matrix(0,0,0),
+       sp=array(0,0),
+       y=Y,
+       w=rep(1, length(Y)) )
 }
 
 # CustomLM(X, Y)
-ConstrainedLinModel <- function(X,Y) {
-  model <- MakeLseiModel(X, Y$estimates, Y$stds)
-  coefs <- do.call(lsei, model)$X
-  names(coefs) <- colnames(X)
+newLM = function(X,Y) {
+  M = makeM(X,Y)
+  coefs = pcls(M)
+
+  print("SUM(coefs)")
+  print(sum(coefs))
+
+  return(coefs)
+}
 
-  coefs
-}
\ No newline at end of file

From 37a39ef86c5ba89b4b7b2315d4dd3602b2353cb8 Mon Sep 17 00:00:00 2001
From: Ananth Raghunathan <pseudorandom@google.com>
Date: Mon, 29 Jun 2015 16:01:00 -0700
Subject: [PATCH 36/67] Merged changes from drop-QP branch.

Code now uses only LASSO for association analysis.
---
 analysis/R/alternative.R   |  74 --------------
 analysis/R/analysis_tool.R |  31 +++---
 analysis/R/association.R   |  19 ++--
 analysis/R/decode.R        | 196 ++++++++++++++++++++-----------------
 analysis/R/decode_test.R   | 146 +++++++++++++--------------
 analysis/R/read_input.R    |   5 +-
 analysis/R/simulation.R    |   2 +-
 analysis/R/test.sh         |   2 +
 analysis/R/util.R          |  20 ++++
 regtest.sh                 |  63 ++++++++----
 setup.sh                   |   8 +-
 tests/analyze.R            |  13 +--
 tests/analyze_assoc_expt.R |   2 +-
 tests/gen_counts_test.R    |  17 ++--
 tests/user_spec.py         | 116 ++++++++++++++++++++++
 15 files changed, 405 insertions(+), 309 deletions(-)
 delete mode 100644 analysis/R/alternative.R
 create mode 100644 analysis/R/util.R
 create mode 100755 tests/user_spec.py

diff --git a/analysis/R/alternative.R b/analysis/R/alternative.R
deleted file mode 100644
index d7869439..00000000
--- a/analysis/R/alternative.R
+++ /dev/null
@@ -1,74 +0,0 @@
-# Copyright 2014 Google Inc. All rights reserved.
-# 
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# 
-#     http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# alternative.R
-#
-# This is some messy code to test out alternative regression using pcls().
-
-library(mgcv)
-
-
-# uniform vector
-makep = function(n) {
-  rep(1, n) / (n+1)
-}
-
-# diagonal matrix with -1
-makeAin = function(n) {
-  d = diag(x=1, n, n)
-  last = rep(-1, n)
-  rbind(d, last)
-}
-
-makebin = function(n) {
-  #ratio = 172318 / 128
-  # NOTE: Hard-coded hacks here
-  ratio = 70000 / 64
-  #ratio = 490000 / 64
-
-  print("RATIO")
-  print(ratio)
-
-  c(rep(0, n), -ratio)
-}
-
-makeM = function(X,Y) {
-  n=dim(X)[2]
-  p = makep(n)
-  Ain = makeAin(n)
-  bin = makebin(n)
-
-  list(X=as.matrix(X),
-       p=p,
-       off=array(0,0),
-       S=list(),
-       Ain=Ain,
-       bin=bin,
-       C=matrix(0,0,0),
-       sp=array(0,0),
-       y=Y,
-       w=rep(1, length(Y)) )
-}
-
-# CustomLM(X, Y)
-newLM = function(X,Y) {
-  M = makeM(X,Y)
-  coefs = pcls(M)
-
-  print("SUM(coefs)")
-  print(sum(coefs))
-
-  return(coefs)
-}
-
diff --git a/analysis/R/analysis_tool.R b/analysis/R/analysis_tool.R
index b871b2d5..b0acf2df 100755
--- a/analysis/R/analysis_tool.R
+++ b/analysis/R/analysis_tool.R
@@ -15,12 +15,12 @@
 # days for weekly and 28 days for monthly analyses.
 
 library(optparse)
+library(RJSONIO)
 
 source("analysis/R/analysis_lib.R")
 source("analysis/R/read_input.R")
 source("analysis/R/decode.R")
-
-source("analysis/R/alternative.R")
+source("analysis/R/util.R")
 
 options(stringsAsFactors = FALSE)
 
@@ -58,13 +58,6 @@ if (!interactive()) {
   opts <- parse_args(OptionParser(option_list = option_list))
 }
 
-# NOTE: This is in tests/analysis.R too
-Log <- function(...) {
-  cat('rappor_analysis.R: ')
-  cat(sprintf(...))
-  cat('\n')
-}
-
 # Handle the case of redundant cohorts, i.e. the counts file needs to be
 # further aggregated to obtain counts for the number of cohorts specified in
 # the config file.
@@ -107,27 +100,27 @@ RunOne <- function(opts) {
 
   fit <- res$fit
 
-  results_path <- file.path(opts$output_dir, 'results.csv')
-  write.csv(fit, file = results_path, row.names = FALSE)
+  # Write analysis results as CSV.
+  results_csv_path <- file.path(opts$output_dir, 'results.csv')
+  write.csv(fit, file = results_csv_path, row.names = FALSE)
+
+  # Write summary as JSON (scalar values).
+  metrics_json_path <- file.path(opts$output_dir, 'metrics.json')
+  m <- toJSON(res$metrics)
+  writeLines(m, con = metrics_json_path)
 
   # TODO:
   # - These are in an 2 column 'parameters' and 'values' format.  Should these
   # just be a plain list?
-  # - Write them to another CSV file or JSON on stdout?
-
-  Log("Fit summary:")
-  print(res$summary)
-  cat("\n")
+  # - Should any of these privacy params be in metrics.json?
 
   Log("Privacy summary:")
   print(res$privacy)
   cat("\n")
 
   # Output metrics as machine-parseable prefix + JSON.
-  num_rappor <- nrow(fit)
-  allocated_mass <- sum(fit$proportion)
   Log('__OUTPUT_METRICS__ {"num_rappor": %d, "allocated_mass": %f}',
-      num_rappor, allocated_mass)
+      res$metrics$num_detected, res$metrics$allocated_mass)
 
   Log('DONE')
 }
diff --git a/analysis/R/association.R b/analysis/R/association.R
index d50bd490..393b0e3a 100644
--- a/analysis/R/association.R
+++ b/analysis/R/association.R
@@ -39,7 +39,7 @@ GetOtherProbs <- function(counts, map, marginal, params) {
   p <- params$p
 
   # List of known strings that were measured in the marginal.
-  candidate_strings <- marginal$strings
+  candidate_strings <- marginal$string
 
   # Counts to remove from each cohort.
   top_counts <- ceiling(marginal$proportion * N / params$m)
@@ -421,19 +421,20 @@ ComputeDistributionEM <- function(reports, report_cohorts,
   joint_conditional = NULL
   found_strings <- list()
   cd_for_reports <- list()
-  
+
   for (j in (1:num_variables)) {
     ptm <- proc.time()
     variable_report <- reports[[j]]
     variable_cohort <- report_cohorts[[j]]
     map <- maps[[j]]
-    
+
     # Compute the probability of the "other" category
     variable_counts <- NULL
     if (is.null(marginals)) {
       ptm2 <- proc.time()
       variable_counts <- ComputeCounts(variable_report, variable_cohort, params)
-      marginal <- Decode(variable_counts, map$rmap, params, quick)$fit
+      marginal <- Decode(variable_counts, map$rmap, params, quick,
+                         quiet = TRUE)$fit
       print("TIME IN MARGINALS")
       print(proc.time() - ptm2)
       if (nrow(marginal) == 0) {
@@ -442,7 +443,7 @@ ComputeDistributionEM <- function(reports, report_cohorts,
     } else {
       marginal <- marginals[[j]]
     }
-    found_strings[[j]] <- marginal$strings
+    found_strings[[j]] <- marginal$string
 
     if (ignore_other) {
       prob_other <- vector(mode = "list", length = params$m)
@@ -466,7 +467,7 @@ ComputeDistributionEM <- function(reports, report_cohorts,
                          prob_other[[idx]])
       rep
     })
-    
+
     if(new_alg) {
       # Report conditional distributions as lists
       if (j == 1) {
@@ -485,7 +486,7 @@ ComputeDistributionEM <- function(reports, report_cohorts,
     print("TIME IN COND_REPORT_DIST")
     print(proc.time()-ptm)
   }
-  
+
   ptm <- proc.time()
   # Run expectation maximization to find joint distribution
   if (new_alg) {
@@ -498,7 +499,7 @@ ComputeDistributionEM <- function(reports, report_cohorts,
   print("TIME IN EM")
   print(proc.time() - ptm)
   dimnames(em$est) <- found_strings
-  
+
   # Return results in a usable format
   list(orig = list(fit = em$est, sd = em$sd, em = em))
-}
\ No newline at end of file
+}
diff --git a/analysis/R/decode.R b/analysis/R/decode.R
index 6e0522ac..2d8af344 100644
--- a/analysis/R/decode.R
+++ b/analysis/R/decode.R
@@ -16,8 +16,7 @@
 # This library implements the RAPPOR marginal decoding algorithms using LASSO.
 
 library(glmnet)
-
-source('analysis/R/alternative.R')
+library(limSolve)
 
 Estimate2WayBloomCounts <- function(params, obs_counts) {
   p <- params$p
@@ -25,20 +24,20 @@ Estimate2WayBloomCounts <- function(params, obs_counts) {
   f <- params$f
   m <- params$m
   k <- params$k
-  
+
   stopifnot(m == nrow(obs_counts), params$k + 1 == ncol(obs_counts))
-  
+
   p11 <- q * (1 - f/2) + p * f / 2  # probability of a true 1 reported as 1
   p01 <- p * (1 - f/2) + q * f / 2  # probability of a true 0 reported as 1
   p10 <- 1 - p11  # probability of a true 1 reported as 0
   p00 <- 1 - p01  # probability of a true 0 reported as 0
-  
+
   NoiseMatrix <- matrix(rep(0, 16), 4)
   NoiseMatrix[1,] <- c(p11**2, p11*p10, p10*p11, p10**2)
   NoiseMatrix[2,] <- c(p11*p01, p11*p00, p10*p01, p10*p00)
   NoiseMatrix[3,] <- c(p01*p11, p01*p10, p00*p11, p00*p01)
   NoiseMatrix[4,] <- c(p01**2, p00*p01, p01*p00, p00**2)
-  
+
   ests <- apply(obs_counts, 1, function(x) {
     N <- x[1]
     inds <- seq(0, (k/4)-1)
@@ -47,7 +46,7 @@ Estimate2WayBloomCounts <- function(params, obs_counts) {
       as.vector(t(Solve(NoiseMatrix)) %*% v[(i*4 + 1):((i+1)*4)])
     })
   })
-  
+
   if(FALSE) {
     # TODO(pseudorandom): Compute variances
     variances <- apply(obs_counts, 1, function(x) {
@@ -59,17 +58,17 @@ Estimate2WayBloomCounts <- function(params, obs_counts) {
       N * r * (1 - r) / p2^2  # variance of the binomial
     })
   }
-  
+
   # Transform counts from absolute values to fractional, removing bias due to
   #      variability of reporting between cohorts.
   ests <- apply(ests, 1, function(x) x / obs_counts[,1])
   # stds <- apply(variances^.5, 1, function(x) x / obs_counts[,1])
-  
+
   # Some estimates may be set to infinity, e.g. if f=1. We want to
   #     account for this possibility, and set the corresponding counts
   #     to 0.
   ests[abs(ests) == Inf] <- 0
-    
+
   list(estimates = ests,
        stds = matrix(rep(100, length(ests[,1]) * length(ests[1,])),
                      length(ests[,1])))
@@ -95,7 +94,7 @@ EstimateBloomCounts <- function(params, obs_counts) {
   # Output:
   #    ests: a matrix of size m by k with estimated counts for the probability
   #          of each bit set to 1 in the true Bloom filter.
-  #    std: standard deviation of the estimates.
+  #    stds: standard deviation of the estimates.
 
   p <- params$p
   q <- params$q
@@ -110,22 +109,23 @@ EstimateBloomCounts <- function(params, obs_counts) {
   p2 <- p11 - p01  # == (1 - f) * (q - p)
 
   ests <- apply(obs_counts, 1, function(x) {
-  	N <- x[1]  # sample size for the cohort
-  	v <- x[-1]  # counts for individual bits
-    (v - p01 * N) / p2  # unbiased estimator for individual bits' true counts
-                        # It can be negative or exceed the total.
-  })
+      N <- x[1]  # sample size for the cohort
+      v <- x[-1]  # counts for individual bits
+      (v - p01 * N) / p2  # unbiased estimator for individual bits'
+                          # true counts. It can be negative or
+                          # exceed the total.
+    })
 
   total <- sum(obs_counts[,1])
 
   variances <- apply(obs_counts, 1, function(x) {
-  	N <- x[1]
-  	v <- x[-1]
-  	p_hats <- (v - p01 * N) / (N * p2)  # expectation of a true 1
-  	p_hats <- pmax(0, pmin(1, p_hats))  # clamp to [0,1]
-    r <- p_hats * p11 + (1 - p_hats) * p01  # expectation of a reported 1
-    N * r * (1 - r) / p2^2  # variance of the binomial
-  })
+      N <- x[1]
+      v <- x[-1]
+      p_hats <- (v - p01 * N) / (N * p2)  # expectation of a true 1
+      p_hats <- pmax(0, pmin(1, p_hats))  # clamp to [0,1]
+      r <- p_hats * p11 + (1 - p_hats) * p01  # expectation of a reported 1
+      N * r * (1 - r) / p2^2  # variance of the binomial
+     })
 
   # Transform counts from absolute values to fractional, removing bias due to
   #      variability of reporting between cohorts.
@@ -145,26 +145,32 @@ FitLasso <- function(X, Y, intercept = TRUE) {
   #
   # Input:
   #    X: a design matrix of size km by M (the number of candidate strings).
-  #    Y: a vector of size km with estimated counts from EstimateBloomCounts().
+  #    Y: a vector of size km with estimated counts from EstimateBloomCounts(),
+  #       representing constraints
   #    intercept: whether to fit with intercept or not.
   #
   # Output:
   #    a vector of size ncol(X) of coefficients.
 
   # TODO(mironov): Test cv.glmnet instead of glmnet
-  mod <- try(glmnet(X, Y, standardize = FALSE, intercept = intercept,
-                    lower.limits = 0,
-                    pmax = min(500, length(Y) * .8)),
-             silent = TRUE)
-
-  # If fitting fails, return an empty data.frame.
-  if (class(mod)[1] == "try-error") {
-    coefs <- setNames(rep(0, ncol(X)), colnames(X))
-  } else {
-    coefs <- coef(mod)
-    coefs <- coefs[-1, ncol(coefs), drop = FALSE]  # coefs[1] is the intercept
-  }
-  coefs
+
+  # Cap the number of non-zero coefficients to 500 or 80% of the number of
+  # constraints, whichever is less. The 500 cap is for performance reasons, 80%
+  # is to avoid overfitting.
+  cap <- min(500, nrow(X) * .8, ncol(X))
+
+  mod <- glmnet(X, Y, standardize = FALSE, intercept = intercept,
+                lower.limits = 0,  # outputs are non-negative
+                pmax = cap)
+
+  coefs <- coef(mod)
+  coefs <- coefs[-1, , drop = FALSE]  # drop the intercept
+  l1cap <- sum(colSums(coefs) <= 1.0)  # find all columns with L1 norm <= 1
+  if(l1cap > 0)
+   	distr <- coefs[, l1cap]  # return the last set of coefficients with L1 <= 1
+  else
+   	distr <- setNames(rep(0, ncol(X)), colnames(X))
+  distr
 }
 
 PerformInference <- function(X, Y, N, mod, params, alpha, correction) {
@@ -193,7 +199,7 @@ PerformInference <- function(X, Y, N, mod, params, alpha, correction) {
 #   # 1-sided t-test.
 #   p_values <- pnorm(z_values, lower = FALSE)
 
-  fit <- data.frame(String = colnames(X), Estimate = betas,
+  fit <- data.frame(string = colnames(X), Estimate = betas,
                     SD = mod$stds, # z_stat = z_values, pvalue = p_values,
                     stringsAsFactors = FALSE)
 
@@ -212,7 +218,7 @@ PerformInference <- function(X, Y, N, mod, params, alpha, correction) {
   fit <- fit[order(fit$Estimate, decreasing = TRUE), ]
 
   if (nrow(fit) > 0) {
-    str_names <- fit$String
+    str_names <- fit$string
     str_names <- str_names[!is.na(str_names)]
     if (length(str_names) > 0 && length(str_names) < nrow(X)) {
       this_data <- as.data.frame(as.matrix(X[, str_names]))
@@ -262,7 +268,7 @@ ComputePrivacyGuarantees <- function(params, alpha, N) {
   privacy
 }
 
-FitDistribution <- function(estimates_stds, map) {
+FitDistribution <- function(estimates_stds, map, quiet = FALSE) {
   # Find a distribution over rows of map that approximates estimates_stds best
   #
   # Input:
@@ -275,40 +281,24 @@ FitDistribution <- function(estimates_stds, map) {
   #   according to this vector approximates estimates
 
   S <- ncol(map)  # total number of candidates
+  lasso <- FitLasso(map, as.vector(t(estimates_stds$estimates)))
 
-  support_coefs <- 1:S
+  if(!quiet)
+    cat("LASSO selected ", sum(lasso > 0), " non-zero coefficients.\n")
 
-  if (TRUE) {
-  # if (S > length(estimates_stds$estimates) * .8) {
-    # the system is close to being underdetermined
-    lasso <- FitLasso(map, as.vector(t(estimates_stds$estimates)))
-
-    # Select non-zero coefficients.
-    support_coefs <- which(lasso > 0)
-    cat("LASSO selected ", length(support_coefs), " coefficients in support.\n")
-  }
-
-  coefs <- setNames(rep(0, S), colnames(map))
-
-  if(length(support_coefs) > 0) {  # LASSO may return an empty list
-    constrained_coefs <- ConstrainedLinModel(map[, support_coefs, drop = FALSE],
-                                             estimates_stds)
-
-    coefs[support_coefs] <- constrained_coefs
-  }
-
-  coefs
-}
+  names(lasso) <- colnames(map)
+  lasso
+ }
 
 Resample <- function(e) {
-  result <- e
-
-  result$estimates <- matrix(mapply(function(x, y) x + rnorm(1, 0, y),
+  # Simulate resampling of the Bloom filter estimates by adding Gaussian noise
+  # with estimated standard deviation.
+  estimates <- matrix(mapply(function(x, y) x + rnorm(1, 0, y),
                              e$estimates, e$stds),
                              nrow = nrow(e$estimates), ncol = ncol(e$estimates))
-  result$stds <- e$stds * 2^.5
+  stds <- e$stds * 2^.5
 
-  result
+  list(estimates = estimates, stds = stds)
 }
 
 Decode2Way <- function(counts, map, params) {
@@ -318,17 +308,17 @@ Decode2Way <- function(counts, map, params) {
   f <- params$f
   h <- params$h
   m <- params$m
-  
+
   S <- ncol(map)  # total number of candidates
-  
+
   N <- sum(counts[, 1])
-  
+
   filter_cohorts <- which(counts[, 1] != 0)  # exclude cohorts with zero reports
-  
+
   # stretch cohorts to bits
   filter_bits <- as.vector(
     t(matrix(1:nrow(map), nrow = m, byrow = TRUE)[filter_cohorts,]))
-  
+
   es <- Estimate2WayBloomCounts(params, counts)
   e <- list(estimates = es$estimates[filter_cohorts, , drop = FALSE],
             stds = es$stds[filter_cohorts, , drop = FALSE])
@@ -342,7 +332,7 @@ Decode2Way <- function(counts, map, params) {
 }
 
 Decode <- function(counts, map, params, quick = FALSE, alpha = 0.05,
-                   correction = c("Bonferroni"), ...) {
+                   correction = c("Bonferroni"), quiet = FALSE, ...) {
   k <- params$k
   p <- params$p
   q <- params$q
@@ -367,6 +357,8 @@ Decode <- function(counts, map, params, quick = FALSE, alpha = 0.05,
          stds = es$stds[filter_cohorts, , drop = FALSE])
 
   coefs_all <- vector()
+  # Run the fitting procedure several times (5 seems to be sufficient and not
+  # too many) to estimate standard deviation of the output.
   if(quick) {num_reps <- 2} else {num_reps <- 5}
   for(r in 1:num_reps)
   {
@@ -374,20 +366,21 @@ Decode <- function(counts, map, params, quick = FALSE, alpha = 0.05,
       e <- Resample(estimates_stds_filtered)
     else
       e <- estimates_stds_filtered
-    
+
     coefs_all <- rbind(coefs_all,
-                       FitDistribution(e, map[filter_bits, , drop = FALSE]))  
+                       FitDistribution(e, map[filter_bits, , drop = FALSE],
+                                       quiet))
   }
   coefs_ssd <- N * apply(coefs_all, 2, sd)  # compute sample standard deviations
   coefs_ave <- N * apply(coefs_all, 2, mean)
-  
+
   # Only select coefficients more than two standard deviations from 0. May
   # inflate empirical SD of the estimates.
   reported <- which(coefs_ave > 1E-6 + 1 * coefs_ssd)
-  
+
   mod <- list(coefs = coefs_ave[reported], stds = coefs_ssd[reported])
 
-#   Old code  ... 
+#   Old code  ...
 #     coefs_all <- FitDistribution(estimates_stds_filtered,
 #                                         map[filter_bits, , drop = FALSE])
 #     reported <- which(coefs_all > 1E-6)
@@ -410,25 +403,38 @@ Decode <- function(counts, map, params, quick = FALSE, alpha = 0.05,
 
   # Estimates from the model are per instance so must be multipled by h.
   # Standard errors are also adjusted.
-  fit$Total_Est <- floor(fit$Estimate)
-  fit$Total_SD <- floor(fit$SD)
-  fit$Prop <- fit$Total_Est / N
-  fit$LPB <- fit$Prop - 1.96 * fit$Total_SD / N
-  fit$UPB <- fit$Prop + 1.96 * fit$Total_SD / N
+  fit$estimate <- floor(fit$Estimate)
+  fit$proportion <- fit$estimate / N
+
+  fit$std_error <- floor(fit$SD)
+  fit$prop_std_error <- fit$std_error / N
+
+  # 1.96 standard deviations gives 95% confidence interval.
+  fit$prop_low_95 <- fit$proportion - 1.96 * fit$prop_std_error
+  fit$prop_high_95 <- fit$proportion + 1.96 * fit$prop_std_error
 
-  fit <- fit[, c("String", "Total_Est", "Total_SD", "Prop", "LPB", "UPB")]
-  colnames(fit) <- c("strings", "estimate", "std_dev", "proportion",
-                     "lower_bound", "upper_bound")
+  fit <- fit[, c("string", "estimate", "std_error", "proportion",
+                 "prop_std_error", "prop_low_95", "prop_high_95")]
+
+  allocated_mass <- sum(fit$proportion)
+  num_detected <- nrow(fit)
+
+  ss <- round(inf$SS, digits = 3)
+  explained_var <- ss[[1]]
+  missing_var <- ss[[2]]
+  noise_var <- ss[[3]]
+
+  noise_std_dev <- round(inf$resid_sigma, digits = 3)
 
   # Compute summary of the fit.
-  parameters =
+  parameters <-
       c("Candidate strings", "Detected strings",
         "Sample size (N)", "Discovered Prop (out of N)",
         "Explained Variance", "Missing Variance", "Noise Variance",
         "Theoretical Noise Std. Dev.")
-  values <- c(S, nrow(fit), N, round(sum(fit[, 2]) / N, 3),
-              round(inf$SS, 3),
-              round(inf$resid_sigma, 3))
+  values <- c(S, num_detected, N, allocated_mass,
+              explained_var, missing_var, noise_var, noise_std_dev)
+
   res_summary <- data.frame(parameters = parameters, values = values)
 
   privacy <- ComputePrivacyGuarantees(params, alpha, N)
@@ -436,9 +442,17 @@ Decode <- function(counts, map, params, quick = FALSE, alpha = 0.05,
                        c("k", "h", "m", "p", "q", "f", "N", "alpha"),
                        values = c(k, h, m, p, q, f, N, alpha))
 
+  # This is a list of decode stats in a better format than 'summary'.
+  # TODO: Delete summary.
+  metrics <- list(sample_size = N,
+                  allocated_mass = allocated_mass,
+                  num_detected = num_detected,
+                  explained_var = explained_var,
+                  missing_var = missing_var)
+
   list(fit = fit, summary = res_summary, privacy = privacy, params = params,
        lasso = NULL, ests = as.vector(t(estimates_stds_filtered$estimates)),
-       counts = counts[, -1], resid = NULL)
+       counts = counts[, -1], resid = NULL, metrics = metrics)
 }
 
 ComputeCounts <- function(reports, cohorts, params) {
diff --git a/analysis/R/decode_test.R b/analysis/R/decode_test.R
index fe74e691..71db7aa5 100755
--- a/analysis/R/decode_test.R
+++ b/analysis/R/decode_test.R
@@ -22,20 +22,21 @@ source('tests/gen_counts.R')
 L1Distance <- function(X, Y) {
   # Computes the L1 distance between two named vectors
   common <- intersect(names(X), names(Y))
-  union <- rbind(X[common], Y[common])
 
-  (sum(abs(union[1,]-union[2,])) + sum(X[!names(X) %in% common])
-                                 + sum(Y[!names(Y) %in% common])) / 2
+  L1_intersect <- sum(abs(X[common] - Y[common]))
+  L1_X_minus_Y <- sum(X[!names(X) %in% common])
+  L1_Y_minus_X <- sum(Y[!names(Y) %in% common])
+
+  (L1_intersect + L1_X_minus_Y + L1_Y_minus_X) / 2
 }
 
 LInfDistance <- function(X, Y) {
-  # Computes the L1 distance between two named vectors
+  # Computes the L_infinity distance between two named vectors
   common <- intersect(names(X), names(Y))
-  union <- rbind(X[common], Y[common])
 
-  max(abs(union[1,]-union[2,]),
-      X[!names(X) %in% common],
-      Y[!names(Y) %in% common])
+  max(abs(X[common] - Y[common]),
+      abs(X[!names(X) %in% common]),
+      abs(Y[!names(Y) %in% common]))
 }
 
 MatrixVectorMerge <- function(mat, vec) {
@@ -63,39 +64,40 @@ MatrixVectorMerge <- function(mat, vec) {
 }
 
 RunMultipleTests <- function(title, fun, repetitions, ...) {
-  # Run a function with an annotated progress indicator
+  # Run a function with an annotated progress indicator. The function's outputs
+  # are concatenated and returned as a list of length repetitions.
   cat(title, ": ")
 
   if(repetitions == 1) {
     # only run once
-    fun(...)
+    results <- list(fun(...))
 
-    cat(" Done.")
-  }
-  else {  # run multiple times
+    cat(" Done.\n")
+  } else {  # run multiple times
     pb <- txtProgressBar(min = 0, max = repetitions,
                          width = getOption("width") - 20 - nchar(title))
 
+    results <- vector(mode = "list", repetitions)
     for(i in 1:repetitions) {
       setTxtProgressBar(pb, i)
-      fun(...)
+      results[[i]] <- fun(...)
     }
     cat(" Done.")
     close(pb)
   }
+
+  results
 }
 
-TestEstimatesAndStdsHelper <- function(params, map, pdf, total) {
+CheckEstimatesAndStdsHelper <- function(params, map, pdf, total) {
   # Helper function for TestEstimateBloomCounts.
   partition <- RandomPartition(total, pdf)
   counts <- GenerateCounts(params, map, partition, 1)
-  e <- EstimateBloomCounts(params, counts)
 
-  results$estimates <<- abind(results$estimates, e$estimates, along = 3)
-  results$stds <<- abind(results$stds, e$stds, along = 3)
+  EstimateBloomCounts(params, counts)
 }
 
-TestEstimatesAndStds <- function(repetitions, title, params, map, pdf, total) {
+CheckEstimatesAndStds <- function(repetitions, title, params, map, pdf, total) {
   # Checks that the expectations returned by EstimateBloomCounts on simulated
   # inputs match the ground truth and the empirical standard deviation matches
   # EstimateBloomCounts outputs.
@@ -108,14 +110,16 @@ TestEstimatesAndStds <- function(repetitions, title, params, map, pdf, total) {
   #   pdf: probability density function of the distribution from which simulated
   #        clients are sampled
   #   total: number of reports
-  results <<- c(estimates = list(), stds = list())
 
-  RunMultipleTests(title, TestEstimatesAndStdsHelper, repetitions,
-                   params, map, pdf, total)
+  results <- RunMultipleTests(title, CheckEstimatesAndStdsHelper, repetitions,
+                              params, map, pdf, total)
 
-  ave_e <- apply(results$estimates,1:2, mean)
-  observed_stds <- apply(results$estimates,1:2, sd)
-  ave_stds <- apply(results$stds,1:2, mean)
+  estimates <- abind(lapply(results, function(r) r$estimates), along = 3)
+  stds <- abind(lapply(results, function(r) r$stds), along = 3)
+
+  ave_e <- apply(estimates, 1:2, mean)
+  observed_stds <- apply(estimates, 1:2, sd)
+  ave_stds <- apply(stds, 1:2, mean)
 
   ground_truth <- matrix(map %*% pdf, nrow = params$m, byrow = TRUE)
 
@@ -148,12 +152,12 @@ TestEstimateBloomCounts <- function() {
 
   noise0 <- list(p = 0, q = 1, f = 0)  # no noise at all
 
-  TestEstimatesAndStds(repetitions = 1000, "Testing estimates and stds (1/3)",
-                       c(report4x2, noise0), map0, pdf0, 100)
+  CheckEstimatesAndStds(repetitions = 1000, "Testing estimates and stds (1/3)",
+                        c(report4x2, noise0), map0, pdf0, 100)
 
   noise1 <- list(p = 0.4, q = .6, f = 0.5)
-  TestEstimatesAndStds(repetitions = 1000, "Testing estimates and stds (2/3)",
-                       c(report4x2, noise1), map0, pdf0, 100)
+  CheckEstimatesAndStds(repetitions = 1000, "Testing estimates and stds (2/3)",
+                        c(report4x2, noise1), map0, pdf0, 100)
 
   # MEDIUM TEST: 100 values, 32 cohorts, 8 bits each, 10^6 reports
   values <- 100
@@ -166,51 +170,50 @@ TestEstimateBloomCounts <- function() {
 
   pdf1 <- ComputePdf("zipf1", values)
 
-  TestEstimatesAndStds(repetitions = 100, "Testing estimates and stds (3/3)",
-                       c(report8x32, noise1), map1, pdf1, 10^9)
+  CheckEstimatesAndStds(repetitions = 100, "Testing estimates and stds (3/3)",
+                        c(report8x32, noise1), map1, pdf1, 10^9)
 }
 
-TestDecodeHelper <- function(params, map, pdf, num_clients,
+CheckDecodeHelper <- function(params, map, pdf, num_clients,
                              tolerance_l1, tolerance_linf) {
   # Helper function for TestDecode. Simulates a RAPPOR run and checks results of
-  # Decode's output against the ground truth. Results are appended to a global
-  # list.
+  # Decode's output against the ground truth. Output is returned as a list.
 
   partition <- RandomPartition(num_clients, pdf)
   counts <- GenerateCounts(params, map, partition, 1)
   total <- sum(partition)
 
-  decoded <- Decode(counts, map, params)
-
-  decoded_partition <- setNames(decoded$fit$estimate, decoded$fit$strings)
+  decoded <- Decode(counts, map, params, quiet = TRUE)
 
-  results$estimates <<- MatrixVectorMerge(results$estimates, decoded_partition)
-  results$stds <<- MatrixVectorMerge(results$stds,
-                                          setNames(decoded$fit$std_dev,
-                                                   decoded$fit$strings))
+  decoded_partition <- setNames(decoded$fit$estimate, decoded$fit$string)
 
   checkTrue(L1Distance(decoded_partition, partition) < total^.5 * tolerance_l1,
             "L1 distance is too large")
 
   checkTrue(LInfDistance(decoded_partition, partition) <
               max(partition)^.5 * tolerance_linf, "L_inf distance is too large")
+
+  list(estimates = decoded_partition,
+       stds = setNames(decoded$fit$std_error, decoded$fit$string))
 }
 
-TestDecodeAveAndStds <- function(...) {
+CheckDecodeAveAndStds <- function(...) {
   # Runs Decode multiple times (specified by the repetition argument), checks
   # individuals runs against the ground truth, and the estimates of the standard
   # error against empirical observations.
 
-  results <<- list(estimates = matrix(nrow = 0, ncol = 0),
-                   stds = matrix(nrow = 0, ncol = 0))
+  results <- RunMultipleTests(...)
 
-  RunMultipleTests(...)
+  estimates <- matrix(nrow = 0, ncol = 0)
+  lapply(results, function(r) MatrixVectorMerge(estimates, r$estimates))
 
-  empirical_stds <- apply(results$estimates, 2, sd, na.rm = TRUE)
-  estimated_stds <- apply(results$stds, 2, mean, na.rm = TRUE)
+  stds <- matrix(nrow = 0, ncol = 0)
+  lapply(results, function(r) MatrixVectorMerge(stds, r$stds))
 
-  if(dim(results$estimates)[1] > 1)
-  {
+  empirical_stds <- apply(estimates, 2, sd, na.rm = TRUE)
+  estimated_stds <- apply(stds, 2, mean, na.rm = TRUE)
+
+  if(dim(estimates)[1] > 1) {
     checkTrue(any(estimated_stds > empirical_stds / 2),
               "Our estimate for the standard deviation is too low")
 
@@ -239,22 +242,22 @@ TestDecode <- function() {
   # match the ground truth. Must be close enough though.
   noise0 <- list(p = 0, q = 1, f = 0)  # no noise whatsoever
 
-  TestDecodeAveAndStds("Testing Decode (1/5)", TestDecodeHelper, 100,
-                       c(report4x2, noise0), map0, distribution0, 100,
-                       tolerance_l1 = 5,
-                       tolerance_linf = 3)
+  CheckDecodeAveAndStds("Testing Decode (1/5)", CheckDecodeHelper, 100,
+                        c(report4x2, noise0), map0, distribution0, 100,
+                        tolerance_l1 = 5,
+                        tolerance_linf = 3)
 
   noise1 <- list(p = .4, q = .6, f = .5)  # substantial noise, very few reports
-  TestDecodeAveAndStds("Testing Decode (2/5)", TestDecodeHelper, 100,
-                       c(report4x2, noise1), map0, distribution0, 100,
-                       tolerance_l1 = 20,
-                       tolerance_linf = 20)
+  CheckDecodeAveAndStds("Testing Decode (2/5)", CheckDecodeHelper, 100,
+                        c(report4x2, noise1), map0, distribution0, 100,
+                        tolerance_l1 = 20,
+                        tolerance_linf = 20)
 
   # substantial noise, many reports
-  TestDecodeAveAndStds("Testing Decode (3/5)", TestDecodeHelper, 100,
-                       c(report4x2, noise1), map0, distribution0, 100000,
-                       tolerance_l1 = 50,
-                       tolerance_linf = 40)
+  CheckDecodeAveAndStds("Testing Decode (3/5)", CheckDecodeHelper, 100,
+                        c(report4x2, noise1), map0, distribution0, 100000,
+                        tolerance_l1 = 50,
+                        tolerance_linf = 40)
 
   # MEDIUM TEST: 100 values, 32 cohorts, 8 bits each, 10^6 reports
   values <- 100
@@ -267,10 +270,10 @@ TestDecode <- function() {
 
   distribution1 <- ComputePdf("zipf1", values)
   names(distribution1) <- colnames(map1)
-  TestDecodeAveAndStds("Testing Decode (4/5)", TestDecodeHelper, 100,
-                   c(report8x32, noise1), map1, distribution1, 10^6,
-                   tolerance_l1 = values * 3,
-                    tolerance_linf = 100)
+  CheckDecodeAveAndStds("Testing Decode (4/5)", CheckDecodeHelper, 100,
+                        c(report8x32, noise1), map1, distribution1, 10^6,
+                        tolerance_l1 = values * 3,
+                        tolerance_linf = 100)
 
   # Testing LASSO: 500 values, 32 cohorts, 8 bits each, 10^6 reports
   values <- 500
@@ -284,16 +287,15 @@ TestDecode <- function() {
   distribution2 <- ComputePdf("zipf1.5", values)
   names(distribution2) <- colnames(map2)
 
-  TestDecodeAveAndStds("Testing Decode (5/5)", TestDecodeHelper, 1,
-                   c(report8x32, noise1), map2, distribution2, 10^6,
-                   tolerance_l1 = values * 3,
-                   tolerance_linf = 80)
-
+  CheckDecodeAveAndStds("Testing Decode (5/5)", CheckDecodeHelper, 1,
+                        c(report8x32, noise1), map2, distribution2, 10^6,
+                        tolerance_l1 = values * 3,
+                        tolerance_linf = 80)
 }
 
-TestAll <- function() {
+RunAll <- function() {
   TestEstimateBloomCounts()
   TestDecode()
 }
 
-TestAll()
+RunAll()
diff --git a/analysis/R/read_input.R b/analysis/R/read_input.R
index b85a09fd..95ea1b0d 100644
--- a/analysis/R/read_input.R
+++ b/analysis/R/read_input.R
@@ -102,8 +102,9 @@ ReadMapFile <- function(map_file, params = NULL, quote = "") {
 }
 
 LoadMapFile <- function(map_file, params = NULL, quote = "") {
-  # Reads the map file and creates an R binary .rda.
-  # If .rda file already exists, just loads that file.
+  # Reads the map file and creates an R binary .rda. If the .rda file already
+  # exists, just loads that file. NOTE: It assumes the map file is
+  # immutable.
 
   rda_file <- sub(".csv", ".rda", map_file, fixed = TRUE)
 
diff --git a/analysis/R/simulation.R b/analysis/R/simulation.R
index d7c6e9e1..cd37e74f 100644
--- a/analysis/R/simulation.R
+++ b/analysis/R/simulation.R
@@ -254,7 +254,7 @@ GenerateSamples <- function(N = 10^5, params, pop_params, alpha = .05,
                 correction = correction)
 
   # Add truth column.
-  fit$fit$Truth <- table(samp)[fit$fit$strings]
+  fit$fit$Truth <- table(samp)[fit$fit$string]
   fit$fit$Truth[is.na(fit$fit$Truth)] <- 0
 
   fit$map <- map$map
diff --git a/analysis/R/test.sh b/analysis/R/test.sh
index 06cf2380..f47eafa3 100755
--- a/analysis/R/test.sh
+++ b/analysis/R/test.sh
@@ -26,6 +26,8 @@ analysis-tool() {
     --output_dir _tmp
 
   cat _tmp/results.csv 
+  echo
+  cat _tmp/metrics.json
 
   popd
 }
diff --git a/analysis/R/util.R b/analysis/R/util.R
new file mode 100644
index 00000000..8679b83d
--- /dev/null
+++ b/analysis/R/util.R
@@ -0,0 +1,20 @@
+#!/usr/bin/Rscript
+#
+# Common utility library for all R scripts.
+
+# Log message with timing.  Example:
+#
+# _____ 1.301 My message
+#
+# The prefix makes it stand out (vs R's print()), and the number is the time so
+# far.
+#
+# NOTE: The shell script log uses hyphens.
+
+Log <- function(...) {
+  cat('_____ ')
+  cat(proc.time()[['elapsed']])
+  cat(' ')
+  cat(sprintf(...))
+  cat('\n')
+}
diff --git a/regtest.sh b/regtest.sh
index bfa0a0a0..2430edb1 100755
--- a/regtest.sh
+++ b/regtest.sh
@@ -144,7 +144,7 @@ _setup-one-case() {
   # banner "Hashing candidates to get 'map'"
 
   analysis/tools/hash_candidates.py \
-    $case_dir/case_params.csv \
+    $params_path \
     < $case_dir/case_candidates.txt \
     > $case_dir/case_map.csv
 }
@@ -240,8 +240,6 @@ make-summary() {
     | sed -e '/TABLE_ROWS/ r rows.html' \
     > $filename
 
-  rm rows.html
-
   popd >/dev/null
 
   log "Wrote $dir/$filename"
@@ -273,38 +271,51 @@ _setup-test-instances() {
   done
 }
 
+# Print the default number of parallel processes, which is max(#CPUs - 1, 1)
+default-processes() {
+  processors=$(grep -c ^processor /proc/cpuinfo || echo 4)  # Linux-specific
+  if test $processors -gt 1; then  # leave one CPU for the OS
+    processors=$(expr $processors - 1)
+  fi
+  echo $processors
+}
+
 # Args:
-#   regexp: A pattern selecting the subset of tests to run
+#   spec_gen: A program to execute to generate the spec.
+#   spec_regex: A pattern selecting the subset of tests to run
 #   instances: A number of times each test case is run
-#   parallel: Whether the tests are run in parallel (T/F)
+#   parallel: Whether the tests are run in parallel (T/F).  Sequential
+#     runs log to the console; parallel runs log to files.
 #   fast_counts: Whether counts are sampled directly (T/F)
-#   
+
 _run-tests() {
-  local spec_regex=$1  # grep -E format on the spec
-  local instances=$2
-  local parallel=$3
-  local fast_counts=$4
+  local spec_gen=$1
+  local spec_regex="$2"  # grep -E format on the spec, can be empty
+  local instances=$3
+  local parallel=$4
+  local fast_counts=$5
 
   rm -r -f --verbose $REGTEST_DIR
   
   mkdir --verbose -p $REGTEST_DIR
 
   local func
-  local processors=1
+  local processors
 
   if test $parallel = F; then
     func=_run-one-instance  # output to the console
+    processors=1
   else
     func=_run-one-instance-logged
-    processors=$(grep -c ^processor /proc/cpuinfo || echo 4)  # POSIX-specific
-    if test $processors -gt 1; then  # leave one CPU for the OS
-      processors=$(expr $processors - 1)
-    fi
+    # Let the user override with MAX_PROC, in case they don't have enough
+    # memory.
+    processors=${MAX_PROC:-$(default-processes)}
     log "Running $processors parallel processes"
   fi
 
   local cases_list=$REGTEST_DIR/test-cases.txt
-  tests/regtest_spec.py | grep -E $spec_regex > $cases_list
+  # Need -- for regexes that start with -
+  $spec_gen | grep -E -- "$spec_regex" > $cases_list
 
   # Generate parameters for all test cases.
   cat $cases_list \
@@ -324,13 +335,16 @@ _run-tests() {
   make-summary $REGTEST_DIR
 }
 
-# Run tests sequentially
+# used for most tests
+readonly REGTEST_SPEC=tests/regtest_spec.py
+
+# Run tests sequentially.  NOTE: called by demo.sh.
 run-seq() {
   local spec_regex=${1:-'^r-'}  # grep -E format on the spec
   local instances=${2:-1}
   local fast_counts=${3:-T}
 
-  _run-tests $spec_regex $instances F $fast_counts
+  time _run-tests $REGTEST_SPEC $spec_regex $instances F $fast_counts
 }
 
 # Run tests in parallel
@@ -339,15 +353,22 @@ run() {
   local instances=${2:-1}
   local fast_counts=${3:-T}
   
-  _run-tests $spec_regex $instances T $fast_counts 
+  time _run-tests $REGTEST_SPEC $spec_regex $instances T $fast_counts 
 }
 
-# Run tests in parallel
+# Run tests in parallel (7+ minutes on 8 cores)
 run-all() {
   local instances=${1:-1}
 
   log "Running all tests. Can take a while."
-  _run-tests '^r-' $instances T T
+  time _run-tests $REGTEST_SPEC '^r-' $instances T T
+}
+
+run-user() {
+  local spec_regex=${1:-}
+  local instances=${2:-1}
+  local parallel=T  # too much memory
+  time _run-tests tests/user_spec.py "$spec_regex" $instances $parallel T
 }
 
 "$@"
diff --git a/setup.sh b/setup.sh
index 01e3a16b..729c486f 100755
--- a/setup.sh
+++ b/setup.sh
@@ -25,15 +25,21 @@ native-packages() {
 
 r-packages() {
   # Install as root so you can write to /usr/local/lib/R.
+
+  # glmnet, limSolve: solvers for decode.R
+  # RJSONIO: for analysis_tool.R
   sudo R -e \
-    'install.packages(c("glmnet", "optparse", "limSolve", "RUnit", "abind"), repos="http://cran.rstudio.com/")'
+    'install.packages(c("glmnet", "optparse", "RUnit", "abind", "RJSONIO"), repos="http://cran.rstudio.com/")'
 }
 
 # R 3.0.2 on Trusty is out of date with CRAN, so we need this workaround.
 install-plyr-with-friends() {
   mkdir -p _tmp
+  wget --directory _tmp \
+    http://cran.r-project.org/src/contrib/Archive/Rcpp/Rcpp_0.11.4.tar.gz
   wget --directory _tmp \
     http://cran.r-project.org/src/contrib/Archive/plyr/plyr_1.8.1.tar.gz
+  sudo R CMD INSTALL _tmp/Rcpp_0.11.4.tar.gz
   sudo R CMD INSTALL _tmp/plyr_1.8.1.tar.gz 
   sudo R -e \
     'install.packages(c("reshape2", "ggplot2", "data.table"), repos="http://cran.rstudio.com/")'
diff --git a/tests/analyze.R b/tests/analyze.R
index fa1e2bf7..9f079405 100755
--- a/tests/analyze.R
+++ b/tests/analyze.R
@@ -50,14 +50,7 @@ if (library(Cairo, quietly = TRUE, logical.return = TRUE)) {
 source("analysis/R/analysis_lib.R")
 source("analysis/R/read_input.R")
 source("analysis/R/decode.R")
-
-source("analysis/R/alternative.R")  # temporary
-
-Log <- function(...) {
-  cat('analyze.R: ')
-  cat(sprintf(...))
-  cat('\n')
-}
+source("analysis/R/util.R")
 
 LoadContext <- function(prefix_case) {
   # Creates the context, filling it with privacy parameters
@@ -126,7 +119,7 @@ CompareRapporVsActual <- function(ctx) {
   StringToInt <- function(x) as.integer(substring(x, 2))
 
   actual_values <- StringToInt(actual$string)
-  rappor_values <- StringToInt(rappor$strings)
+  rappor_values <- StringToInt(rappor$string)
 
   # False negatives: AnalyzeRAPPOR failed to find this value (e.g. because it
   # occurs too rarely)
@@ -180,6 +173,8 @@ CompareRapporVsActual <- function(ctx) {
   Log("False negatives:")
   str(false_neg)
 
+  # NOTE: We should call Decode() directly, and then num_rappor is
+  # metrics$num_detected, and sum_proportion is metrics$allocated_mass.
   metrics <- list(
       num_actual = nrow(actual),  # data frames
       num_rappor = nrow(rappor),
diff --git a/tests/analyze_assoc_expt.R b/tests/analyze_assoc_expt.R
index 88f11540..dcd48cd3 100755
--- a/tests/analyze_assoc_expt.R
+++ b/tests/analyze_assoc_expt.R
@@ -369,7 +369,7 @@ ExternalCounts <- function(inp) {
   found_strings <- lapply(1:2, function(i)
     Decode(counts[[i + 1]],
            map[[i]]$rmap,
-           params, quick = TRUE)$fit$strings)
+           params, quick = FALSE)$fit[,"string"])
   if (length(found_strings[[1]]) == 0 || length(found_strings[[2]]) == 0) {
     print("FOUND_STRINGS")
     print(found_strings)
diff --git a/tests/gen_counts_test.R b/tests/gen_counts_test.R
index 49ad3be5..e91de68e 100755
--- a/tests/gen_counts_test.R
+++ b/tests/gen_counts_test.R
@@ -51,7 +51,7 @@ TestGenerateCounts <- function() {
   noise1 <- list(p = .5, q = .5, f = 0)  # truly random IRRs
   counts1 <- GenerateCounts(c(report_params, noise1), map, partition, v)
 
-  for(i in 2:4)
+  for(i in 2:5)
     for(j in 1:2)
       pvalues <- c(pvalues,
                    chisq.test(c(counts1[j,1] - counts1[j,i], counts1[j,i]),
@@ -64,14 +64,13 @@ TestGenerateCounts <- function() {
 
   counts2 <- counts2 / v
 
-  for(i in 2:4)
+  for(i in 2:5)
     for(j in 1:2)
       pvalues <- c(pvalues,
                    chisq.test(c(counts2[j,1] - counts2[j,i], counts2[j,i]),
                               p = c(.5, .5))$p.value)
 
-  checkTrue(min(pvalues) > 1E-9 && max(pvalues) < 1 - 1E-9,
-            "Chi-squared test failed")
+  checkTrue(min(pvalues) > 1E-9, "Chi-squared test failed")
 }
 
 TestRandomPartition <- function() {
@@ -97,14 +96,14 @@ TestRandomPartition <- function() {
   p5 <- RandomPartition(total = 1000000, c(1, 2, 3, 4))
   p.value <- chisq.test(p5, p = c(.1, .2, .3, .4))$p.value
 
-  # Apply the chi squared test and fail if p.value is too high or too low.
-  # Probability of failure is 2 * 1E-9, which should never happen.
-  checkTrue((p.value > 1E-9) && (p.value <  1 - 1E-9))
+  # Apply the chi squared test and fail if p.value is too low.
+  # Probability of failure is 1E-9, which should never happen.
+  checkTrue(p.value > 1E-9)
 }
 
-TestAll <- function(){
+CheckAll <- function(){
   TestRandomPartition()
   TestGenerateCounts()
 }
 
-TestAll()
\ No newline at end of file
+CheckAll()
\ No newline at end of file
diff --git a/tests/user_spec.py b/tests/user_spec.py
new file mode 100755
index 00000000..5df58798
--- /dev/null
+++ b/tests/user_spec.py
@@ -0,0 +1,116 @@
+#!/usr/bin/python
+"""Print a test spec on stdout.
+
+Each line has parameters for a test case.  The regtest.sh shell script reads
+these lines and runs parallel processes.
+
+We use Python data structures so the test cases are easier to read and edit.
+"""
+
+import sys
+
+#
+# TEST CONFIGURATION
+#
+
+# For gen_sim_input.py
+INPUT_PARAMS = {
+    # distribution, num unique values, num clients, values per client
+    'exp-100k': ('exp', 100, 100000, 1),
+    'exp-1m': ('exp', 100, 1000000, 1),
+}
+
+# For rappor_sim.py
+# 'k, h, m, p, q, f' as in params file.
+RAPPOR_PARAMS = {
+    # Initial chrome params from 2014.
+    # NOTE: fastrand simulation only supports 64 bits!  Make sure to use the
+    # 'fast_counts' code path.
+    'chrome128': (128, 2, 128, 0.25, 0.75, 0.50),
+
+    # Chrome params from early 2015 -- changed to 8 bit reports.
+    'chrome8': (8, 2, 128, 0.25, 0.75, 0.50),
+
+    # Original demo params
+    'demo': (16, 2, 64, 0.5, 0.75, 0.5),
+}
+
+# For deriving candidates from true inputs.
+MAP_PARAMS = {
+    # 1. Number of extra candidates to add.
+    # 2. Candidate strings to remove from the map.  This FORCES false
+    # negatives, e.g. for common strings, since a string has to be in the map
+    # for RAPPOR to choose it.
+    'add-100': (100, []),
+    'add-1000': (1000, []),
+    'add-2000': (2000, []),
+    # also thrashes on 128 bits
+    'add-3000': (3000, []),
+    'add-10000': (10000, []),
+    'add-15000': (15000, []),  # approx number of candidates for eTLD+1
+    'add-100000': (100000, []),
+    'remove-top-2': (20, ['v1', 'v2']),
+}
+
+# test case name -> (input params name, RAPPOR params name, map params name)
+TEST_CASES = [
+    ('chrome128-100k-100', 'exp-100k', 'chrome128', 'add-100'),
+    ('chrome128-100k-1000', 'exp-100k', 'chrome128', 'add-1000'),
+    ('chrome128-100k-2000', 'exp-100k', 'chrome128', 'add-2000'),
+    ('chrome128-100k-3000', 'exp-100k', 'chrome128', 'add-3000'),
+    # 128 bits and 15k candidates fails on a machine with 8 GB memory.
+    # Lasso finishes with 7508 non-zero coefficients, and then allocation
+    # fails.  TODO: just take the highest ones?
+    #('chrome128-100k-15000', 'exp-100k', 'chrome128', 'add-15000'),
+    #('chrome128-100k-100000', 'exp-100k', 'chrome128', 'add-100000'),
+
+    # NOTE: Adding more candidates exercises LASSO
+    ('chrome8-100k-100', 'exp-100k', 'chrome8', 'add-100'),
+    ('chrome8-100k-1000', 'exp-100k', 'chrome8', 'add-1000'),
+    ('chrome8-100k-2000', 'exp-100k', 'chrome8', 'add-2000'),
+    ('chrome8-100k-3000', 'exp-100k', 'chrome8', 'add-3000'),
+    ('chrome8-100k-15000', 'exp-100k', 'chrome8', 'add-15000'),
+
+    # NOTE: This one takes too much memory!  More than 4 GB.  This is because
+    # Lasso gets a huge matrix (100,000).  We got 1564 non-zero coefficients.
+    ('chrome8-100k-100000', 'exp-100k', 'chrome8', 'add-100000'),
+
+    # What happens when the candidates are missing top values?
+    ('chrome8-badcand', 'exp-100k', 'chrome8', 'remove-top-2'),
+
+    # TODO: Use chrome params with real map from Alexa 1M ?
+]
+
+#
+# END TEST CONFIGURATION
+#
+
+
+def main(argv):
+  rows = []
+  for test_case, input_name, rappor_name, map_name in TEST_CASES:
+    input_params = INPUT_PARAMS[input_name]
+    rappor_params = RAPPOR_PARAMS[rappor_name]
+    map_params = MAP_PARAMS[map_name]
+    row = tuple([test_case]) + input_params + rappor_params + map_params
+    rows.append(row)
+
+  for row in rows:
+    for cell in row:
+      if isinstance(cell, list):
+        if cell:
+          cell_str = '|'.join(cell)
+        else:
+          cell_str = 'NONE'  # we don't want an empty string
+      else:
+        cell_str = cell
+      print cell_str,  # print it with a space after it
+    print  # new line after row
+
+
+if __name__ == '__main__':
+  try:
+    main(sys.argv)
+  except RuntimeError, e:
+    print >>sys.stderr, 'FATAL: %s' % e
+    sys.exit(1)

From 3547cf31300b7d94b8e9fdaf245e8e5ce835de26 Mon Sep 17 00:00:00 2001
From: Ananth Raghunathan <pseudorandom@google.com>
Date: Tue, 7 Jul 2015 16:47:55 -0700
Subject: [PATCH 37/67] Better summary in assoctest for experiments with 2-way
 marginals.

---
 analysis/R/decode.R         | 14 ++++++++++++--
 assoctest.sh                |  5 +++--
 tests/analyze_assoc_expt.R  | 14 +++++++++++---
 tests/make_summary_assoc.py | 28 +++++++++++++++++++++++++++-
 tests/regtest_spec.py       | 24 ++++++++++++------------
 5 files changed, 65 insertions(+), 20 deletions(-)

diff --git a/analysis/R/decode.R b/analysis/R/decode.R
index 2d8af344..f6a94226 100644
--- a/analysis/R/decode.R
+++ b/analysis/R/decode.R
@@ -268,6 +268,10 @@ ComputePrivacyGuarantees <- function(params, alpha, N) {
   privacy
 }
 
+FitDistribution2 <- function(estimates_stds, map) {
+  FitDistribution(estimates_stds, map)
+}
+
 FitDistribution <- function(estimates_stds, map, quiet = FALSE) {
   # Find a distribution over rows of map that approximates estimates_stds best
   #
@@ -301,7 +305,7 @@ Resample <- function(e) {
   list(estimates = estimates, stds = stds)
 }
 
-Decode2Way <- function(counts, map, params) {
+Decode2Way <- function(counts, map, params, new_decode = FALSE) {
   k <- params$k
   p <- params$p
   q <- params$q
@@ -322,7 +326,11 @@ Decode2Way <- function(counts, map, params) {
   es <- Estimate2WayBloomCounts(params, counts)
   e <- list(estimates = es$estimates[filter_cohorts, , drop = FALSE],
             stds = es$stds[filter_cohorts, , drop = FALSE])
-  coefs <- FitDistribution(e, map[filter_bits, , drop = FALSE])
+  if (new_decode == TRUE) {
+    coefs <- FitDistribution2(e, map[filter_bits, , drop = FALSE])
+  } else {
+    coefs <- FitDistribution(e, map[filter_bits, , drop = FALSE])
+  }
   fit <- data.frame(String = colnames(map[filter_bits, , drop = FALSE]),
                     Estimate = matrix(coefs, ncol = 1),
                     SD = matrix(coefs, ncol = 1),
@@ -371,6 +379,8 @@ Decode <- function(counts, map, params, quick = FALSE, alpha = 0.05,
                        FitDistribution(e, map[filter_bits, , drop = FALSE],
                                        quiet))
   }
+  
+  FitDistribution(e, map[filter_bits, , drop = FALSE], quiet)
   coefs_ssd <- N * apply(coefs_all, 2, sd)  # compute sample standard deviations
   coefs_ave <- N * apply(coefs_all, 2, mean)
 
diff --git a/assoctest.sh b/assoctest.sh
index 7a4ef8a3..cd432558 100755
--- a/assoctest.sh
+++ b/assoctest.sh
@@ -180,7 +180,7 @@ _run-one-instance() {
     inp['counts'] = ['$instance_dir/case_2way.csv',\
                      '$instance_dir/case_marg1.csv',\
                      '$instance_dir/case_marg2.csv']; \
-    inp['expt'] = ['external-counts', 'external-reports-em']; \
+    inp['expt'] = ['external-counts', 'external-counts-new']; \
     json.dump(inp, f); \
     f.close();"
 
@@ -206,8 +206,9 @@ _run-one-instance-logged() {
 make-summary() {
   local dir=$1
   local filename=${2:-results.html}
+  local instances=${3:-1}
 
-  tests/make_summary_assoc.py $dir > $dir/rows.html
+  tests/make_summary_assoc.py $dir $instances > $dir/rows.html
 
   pushd $dir >/dev/null
 
diff --git a/tests/analyze_assoc_expt.R b/tests/analyze_assoc_expt.R
index dcd48cd3..46b7b42d 100755
--- a/tests/analyze_assoc_expt.R
+++ b/tests/analyze_assoc_expt.R
@@ -350,7 +350,7 @@ DirectSimulationOfReports <- function(inp) {
 ## Outputs:
 #
 # ------------------------------------------------------------------------
-ExternalCounts <- function(inp) {
+ExternalCounts <- function(inp, new_decode = FALSE) {
   ptm <- proc.time()
   params <- ReadParameterFile(inp$params)
   # Ensure sufficient maps as required by number of vars
@@ -381,7 +381,7 @@ ExternalCounts <- function(inp) {
   pruned <- lapply(1:2, function(i)
     lapply(map[[i]]$map, function(z) z[,found_strings[[i]], drop = FALSE]))
   crmap <- CombineMaps(pruned[[1]], pruned[[2]])$crmap
-  marginal <- Decode2Way(counts[[1]], crmap, params2)$fit
+  marginal <- Decode2Way(counts[[1]], crmap, params2, new_decode = new_decode)$fit
   td <- read.csv(file = inp$truefile, header = FALSE)
   td <- table(td[,2:3])
   td <- td / sum(td)
@@ -417,7 +417,11 @@ ExternalCounts <- function(inp) {
   )
   
   # Write metrics to metrics.csv
-  filename <- file.path(inp$outdir, 'metrics.csv')
+  if (new_decode == TRUE) {
+    filename <- file.path(inp$outdir, 'metrics_2.csv')
+  } else {
+    filename <- file.path(inp$outdir, 'metrics.csv')
+  }
   write.csv(metrics, file = filename, row.names = FALSE)
 }
 
@@ -525,6 +529,10 @@ main <- function(opts) {
     print("---------- RUNNING EXPERIMENT EXT COUNTS ----------")
     ExternalCounts(inp)  
   }
+  if ("external-counts-new" %in% inp$expt) {
+    print("---------- RUNNING EXPERIMENT EXT COUNTS ----------")
+    ExternalCounts(inp, new_decode = TRUE)  
+  }
   if ("external-reports-em" %in% inp$expt) {
     print("---------- RUNNING EXPERIMENT EXT REPORTS ----------")
     ExternalReportsEM(inp)
diff --git a/tests/make_summary_assoc.py b/tests/make_summary_assoc.py
index 665ef9f9..40c4d635 100755
--- a/tests/make_summary_assoc.py
+++ b/tests/make_summary_assoc.py
@@ -272,6 +272,7 @@ def FormatPlots(base_dir, test_instances):
 
 def main(argv):
   base_dir = argv[1]
+  num_instances = int(argv[2])
 
   # This file has the test case names, in the order that they should be
   # displayed.
@@ -293,6 +294,8 @@ def main(argv):
   # file. Instead, rows' names are links to the corresponding .png files.
   include_plots = len(test_instances) < 20
   include_plots = False
+  l1d_list = []
+  l1d_list2 = []
 
   for instance in test_instances:
     # A test instance is idenfied by the test name and the test run.
@@ -314,10 +317,14 @@ def main(argv):
     cell1_html = FormatCell1(test_case, test_instance, metrics_file, log_file,
                              plot_file, include_plots)
 
+    if(int(test_instance) == 1):
+      l1d_list = [] 
+      l1d_list2 = []
+
     if os.path.isfile(metrics_file):
       # ParseMetrics outputs an HTML table row and also updates lists
       metrics_dict, metrics_html = ParseMetrics(metrics_file, log_file)
-
+      l1d_list += metrics_dict['l1d']
       # Update the metrics structure. Initialize dictionaries if necessary.
       for m in metrics:
         if not test_case in metrics[m]:
@@ -332,9 +339,28 @@ def main(argv):
     if (os.path.isfile(metrics_file)):
       metrics_dict, metrics_html = ParseMetrics(metrics_file, log_file,
                                                 italics = True)
+      l1d_list2 += metrics_dict['l1d']
       print '<tr><td></td>{}{}</tr>'.format(ParseSpecFile(spec_file, empty =
                                                         True), metrics_html)
 
+    # Print summary of test instances
+    if(int(test_instance) == num_instances):
+      row_str = ['', '', '', '', 
+        '%.3f&plusmn;%.3f' % (Mean(l1d_list), StandardErrorEstimate(l1d_list)),
+        '',
+      ]
+      row_str2 = ['', '', '', '', 
+        '%.3f&plusmn;%.3f' % (Mean(l1d_list2), StandardErrorEstimate(l1d_list2)),
+        '',
+      ]
+      print '<tr><td></td>{}{}</tr>'.format(ParseSpecFile(spec_file, empty =
+              True), ' '.join('<td><b>%s</b></td>' % cell for cell in
+                              row_str))
+      if (os.path.isfile(metrics_file)):
+        print '<tr><td></td>{}{}</tr>'.format(ParseSpecFile(spec_file, empty =
+              True), ' '.join('<td><b><i>%s</i></b></td>' % cell for cell in
+                              row_str2))
+
   print FormatSummaryRow(metrics)
 
   print '</tbody>'
diff --git a/tests/regtest_spec.py b/tests/regtest_spec.py
index db8a8566..53d1053a 100755
--- a/tests/regtest_spec.py
+++ b/tests/regtest_spec.py
@@ -128,23 +128,23 @@
 # sets
 ASSOC_TEST_CONFIG = {
   'distr': (
-            'fizz-tiny',
-            'fizz-tiny-bool',
-            'fizz-small',
-            'fizz-small-bool',),
-#            'fizz',
-#            'fizz-bool',),
+#            'fizz-tiny',
+#            'fizz-tiny-bool',
+#            'fizz-small',
+#            'fizz-small-bool',),
+            'fizz',
+            'fizz-bool',
 #            'toy',),
-#            'compact-noextra-small',
-#            'loose-noextra-small',
-#            'compact-noextra-large',
-#            'loose-noextra-large',
+            'compact-noextra-small',
+            'loose-noextra-small',),
 #            'compact-extra-small',
 #            'loose-extra-small',
-#            'compact-extra-large',
-#            'loose-extra-large',
 #            'compact-excess-small',
 #            'loose-excess-small',
+#            'compact-noextra-large',
+#            'loose-noextra-large',
+#            'compact-extra-large',
+#            'loose-extra-large',
 #            'compact-excess-large',
 #            'loose-excess-large'),
   'blooms': (

From f9390ab1660dc61fbc89b615746d8f404edbc3ad Mon Sep 17 00:00:00 2001
From: Ananth Raghunathan <pseudorandom@google.com>
Date: Wed, 8 Jul 2015 10:21:59 -0700
Subject: [PATCH 38/67] Tests run sequentially. Trying random projection.

---
 analysis/R/decode.R   | 32 +++++++++++++++++++++++++++++++-
 assoctest.sh          |  4 ++--
 tests/regtest_spec.py | 17 ++++++++---------
 3 files changed, 41 insertions(+), 12 deletions(-)

diff --git a/analysis/R/decode.R b/analysis/R/decode.R
index f6a94226..566ede11 100644
--- a/analysis/R/decode.R
+++ b/analysis/R/decode.R
@@ -268,8 +268,38 @@ ComputePrivacyGuarantees <- function(params, alpha, N) {
   privacy
 }
 
+# Implements lsei
+# FitDistribution2 <- function(estimates_stds, map) {
+#   X <- map
+#   Y <- as.vector(t(estimates_stds$estimates))
+#   m <- dim(X)[1]
+#   n <- dim(X)[2]
+#   
+#   G <- rbind2(Diagonal(n), rep(-1, n))
+#   H <- c(rep(0, n), -1)
+#   lsei(A = X, B = Y, G = G, H = H, type = 2)$X
+# }
+
 FitDistribution2 <- function(estimates_stds, map) {
-  FitDistribution(estimates_stds, map)
+  X <- map
+  Y <- as.vector(t(estimates_stds$estimates))
+  m <- dim(X)[1]
+  n <- dim(X)[2]
+  
+  # Random projection params
+  size <- 10 * n
+  density <- 0.05
+  rproj <- matrix(0, size, m)
+  rproj[sample(length(rproj), size = density * length(rproj))] <- rnorm(density * length(rproj))
+  # rproj <- matrix(rnorm(10*n*m), 10*n, m)
+  Xproj <- rproj %*% X
+  Yproj <- as.vector(rproj %*% Y)
+  mproj <- dim(Xproj)[1]
+  nproj <- dim(Xproj)[2]
+  
+  G <- rbind2(Diagonal(nproj), rep(-1, nproj))
+  H <- c(rep(0, nproj), -1)
+  lsei(A = Xproj, B = Yproj, G = G, H = H, type = 2)$X
 }
 
 FitDistribution <- function(estimates_stds, map, quiet = FALSE) {
diff --git a/assoctest.sh b/assoctest.sh
index cd432558..d5e0ec9f 100755
--- a/assoctest.sh
+++ b/assoctest.sh
@@ -301,7 +301,7 @@ _run-tests() {
 
   log "Done running all test instances"
 
-  make-summary $ASSOCTEST_DIR
+  make-summary $ASSOCTEST_DIR "results.html" $instances
 }
 
 # Run tests sequentially
@@ -329,7 +329,7 @@ run-all() {
   log "Running all tests. Can take a while."
   # a- for assoc tests
   # F for sequential
-  _run-tests '^a-' $instances T T
+  _run-tests '^a-' $instances F T
 }
 
 "$@"
diff --git a/tests/regtest_spec.py b/tests/regtest_spec.py
index 53d1053a..d1ee7ebb 100755
--- a/tests/regtest_spec.py
+++ b/tests/regtest_spec.py
@@ -131,16 +131,16 @@
 #            'fizz-tiny',
 #            'fizz-tiny-bool',
 #            'fizz-small',
-#            'fizz-small-bool',),
-            'fizz',
-            'fizz-bool',
+#            'fizz-small-bool',
+#            'fizz',
+#            'fizz-bool',),
 #            'toy',),
             'compact-noextra-small',
-            'loose-noextra-small',),
-#            'compact-extra-small',
-#            'loose-extra-small',
-#            'compact-excess-small',
-#            'loose-excess-small',
+            'loose-noextra-small',
+            'compact-extra-small',
+            'loose-extra-small',
+            'compact-excess-small',
+            'loose-excess-small',),
 #            'compact-noextra-large',
 #            'loose-noextra-large',
 #            'compact-extra-large',
@@ -159,7 +159,6 @@
 # END TEST CONFIGURATION
 #
 
-
 def main(argv):
   rows = []
 

From e5435bb8bcd11b77e67141cf5b6e2240f95d166b Mon Sep 17 00:00:00 2001
From: Ananth Raghunathan <pseudorandom@google.com>
Date: Thu, 9 Jul 2015 16:02:46 -0700
Subject: [PATCH 39/67] Marginals constraints for LSEI.

---
 analysis/R/decode.R         | 65 +++++++++++++++++++++++++++----------
 assoctest.sh                |  6 ++--
 tests/analyze_assoc_expt.R  | 10 ++++--
 tests/make_summary_assoc.py |  8 ++---
 4 files changed, 61 insertions(+), 28 deletions(-)

diff --git a/analysis/R/decode.R b/analysis/R/decode.R
index 566ede11..f839ee01 100644
--- a/analysis/R/decode.R
+++ b/analysis/R/decode.R
@@ -269,7 +269,7 @@ ComputePrivacyGuarantees <- function(params, alpha, N) {
 }
 
 # Implements lsei
-# FitDistribution2 <- function(estimates_stds, map) {
+# FitDistribution <- function(estimates_stds, map, quiet = FALSE) {
 #   X <- map
 #   Y <- as.vector(t(estimates_stds$estimates))
 #   m <- dim(X)[1]
@@ -280,26 +280,55 @@ ComputePrivacyGuarantees <- function(params, alpha, N) {
 #   lsei(A = X, B = Y, G = G, H = H, type = 2)$X
 # }
 
-FitDistribution2 <- function(estimates_stds, map) {
-  X <- map
+FitDistribution2 <- function(estimates_stds, map, fit) {
+
+  X <- as.matrix(map)
   Y <- as.vector(t(estimates_stds$estimates))
   m <- dim(X)[1]
   n <- dim(X)[2]
+  wt <- 1000  # weight to marginal constraints
   
-  # Random projection params
-  size <- 10 * n
-  density <- 0.05
-  rproj <- matrix(0, size, m)
-  rproj[sample(length(rproj), size = density * length(rproj))] <- rnorm(density * length(rproj))
-  # rproj <- matrix(rnorm(10*n*m), 10*n, m)
-  Xproj <- rproj %*% X
-  Yproj <- as.vector(rproj %*% Y)
-  mproj <- dim(Xproj)[1]
-  nproj <- dim(Xproj)[2]
+  G <- rbind2(Diagonal(n), rep(-1, n))
+  H <- c(rep(0, n), -1)
+  
+  # Adding marginals constraints to X and Y
+  fstrs <- lapply(fit, function(x) x[,"string"])  # found strings
+  
+  Y <- c(Y, wt * t(fit[[1]]["proportion"]), wt * t(fit[[2]]["proportion"]))
+  
+  for (strs in fstrs[[1]]) {
+    indices <- which(colnames(map) %in% outer(strs,
+                                              fstrs[[2]],
+                                              function(x, y) paste(x, y, sep = "x")))
+    vec <- rep(0, n)
+    vec[indices] <- wt
+    X <- rbind2(X, vec)
+  }
+  for (strs in fstrs[[2]]) {
+    indices <- which(colnames(map) %in% outer(fstrs[[1]],
+                                              strs,
+                                              function(x, y) paste(x, y, sep = "x")))
+    vec <- rep(0, n)
+    vec[indices] <- wt
+    X <- rbind2(X, vec)
+  }
+  
+  lsei(A = X, B = Y, G = G, H = H, type = 2)$X
   
-  G <- rbind2(Diagonal(nproj), rep(-1, nproj))
-  H <- c(rep(0, nproj), -1)
-  lsei(A = Xproj, B = Yproj, G = G, H = H, type = 2)$X
+  # Random projection params
+#   size <- 10 * n
+#   density <- 0.05
+#   rproj <- matrix(0, size, m)
+#   rproj[sample(length(rproj), size = density * length(rproj))] <- rnorm(density * length(rproj))
+#   # rproj <- matrix(rnorm(10*n*m), 10*n, m)
+#   Xproj <- rproj %*% X
+#   Yproj <- as.vector(rproj %*% Y)
+#   mproj <- dim(Xproj)[1]
+#   nproj <- dim(Xproj)[2]
+#   
+#   G <- rbind2(Diagonal(nproj), rep(-1, nproj))
+#   H <- c(rep(0, nproj), -1)
+#   lsei(A = Xproj, B = Yproj, G = G, H = H, type = 2)$X
 }
 
 FitDistribution <- function(estimates_stds, map, quiet = FALSE) {
@@ -335,7 +364,7 @@ Resample <- function(e) {
   list(estimates = estimates, stds = stds)
 }
 
-Decode2Way <- function(counts, map, params, new_decode = FALSE) {
+Decode2Way <- function(counts, map, params, new_decode = FALSE, fit = NULL) {
   k <- params$k
   p <- params$p
   q <- params$q
@@ -357,7 +386,7 @@ Decode2Way <- function(counts, map, params, new_decode = FALSE) {
   e <- list(estimates = es$estimates[filter_cohorts, , drop = FALSE],
             stds = es$stds[filter_cohorts, , drop = FALSE])
   if (new_decode == TRUE) {
-    coefs <- FitDistribution2(e, map[filter_bits, , drop = FALSE])
+    coefs <- FitDistribution2(e, map[filter_bits, , drop = FALSE], fit)
   } else {
     coefs <- FitDistribution(e, map[filter_bits, , drop = FALSE])
   }
diff --git a/assoctest.sh b/assoctest.sh
index d5e0ec9f..492d949a 100755
--- a/assoctest.sh
+++ b/assoctest.sh
@@ -275,8 +275,8 @@ _run-tests() {
   else
     func=_run-one-instance-logged
     processors=$(grep -c ^processor /proc/cpuinfo || echo 4)  # POSIX-specific
-    if test $processors -gt 3; then  # leave few CPUs for the OS
-      processors=$(expr $processors - 3)
+    if test $processors -gt 6; then  # leave few CPUs for the OS
+      processors=5
     else
       processors=1
     fi
@@ -329,7 +329,7 @@ run-all() {
   log "Running all tests. Can take a while."
   # a- for assoc tests
   # F for sequential
-  _run-tests '^a-' $instances F T
+  _run-tests '^a-' $instances T T
 }
 
 "$@"
diff --git a/tests/analyze_assoc_expt.R b/tests/analyze_assoc_expt.R
index 46b7b42d..8c2a5ee7 100755
--- a/tests/analyze_assoc_expt.R
+++ b/tests/analyze_assoc_expt.R
@@ -366,10 +366,13 @@ ExternalCounts <- function(inp, new_decode = FALSE) {
   params2$k <- (params$k ** 2) * 4
   
   # Prune candidates
-  found_strings <- lapply(1:2, function(i)
+  fit <- lapply(1:2, function(i)
     Decode(counts[[i + 1]],
            map[[i]]$rmap,
-           params, quick = FALSE)$fit[,"string"])
+           params, quick = FALSE)$fit)
+  
+  found_strings = list(fit[[1]][,"string"], fit[[2]][,"string"])
+
   if (length(found_strings[[1]]) == 0 || length(found_strings[[2]]) == 0) {
     print("FOUND_STRINGS")
     print(found_strings)
@@ -381,7 +384,7 @@ ExternalCounts <- function(inp, new_decode = FALSE) {
   pruned <- lapply(1:2, function(i)
     lapply(map[[i]]$map, function(z) z[,found_strings[[i]], drop = FALSE]))
   crmap <- CombineMaps(pruned[[1]], pruned[[2]])$crmap
-  marginal <- Decode2Way(counts[[1]], crmap, params2, new_decode = new_decode)$fit
+  marginal <- Decode2Way(counts[[1]], crmap, params2, new_decode = new_decode, fit = fit)$fit
   td <- read.csv(file = inp$truefile, header = FALSE)
   td <- table(td[,2:3])
   td <- td / sum(td)
@@ -392,6 +395,7 @@ ExternalCounts <- function(inp, new_decode = FALSE) {
     }
   }
   ed[is.na(ed)] <- 0
+  ed[ed<0] <- 0
   
   time_taken <- proc.time() - ptm
   
diff --git a/tests/make_summary_assoc.py b/tests/make_summary_assoc.py
index 40c4d635..ad21ea44 100755
--- a/tests/make_summary_assoc.py
+++ b/tests/make_summary_assoc.py
@@ -349,14 +349,14 @@ def main(argv):
         '%.3f&plusmn;%.3f' % (Mean(l1d_list), StandardErrorEstimate(l1d_list)),
         '',
       ]
-      row_str2 = ['', '', '', '', 
-        '%.3f&plusmn;%.3f' % (Mean(l1d_list2), StandardErrorEstimate(l1d_list2)),
-        '',
-      ]
       print '<tr><td></td>{}{}</tr>'.format(ParseSpecFile(spec_file, empty =
               True), ' '.join('<td><b>%s</b></td>' % cell for cell in
                               row_str))
       if (os.path.isfile(metrics_file)):
+        row_str2 = ['', '', '', '', 
+          '%.3f&plusmn;%.3f' % (Mean(l1d_list2), StandardErrorEstimate(l1d_list2)),
+          '',
+        ]
         print '<tr><td></td>{}{}</tr>'.format(ParseSpecFile(spec_file, empty =
               True), ' '.join('<td><b><i>%s</i></b></td>' % cell for cell in
                               row_str2))

From edb44d677535c1edde4aaf5814cda56945526b30 Mon Sep 17 00:00:00 2001
From: Ananth Raghunathan <pseudorandom@google.com>
Date: Mon, 13 Jul 2015 10:23:27 -0700
Subject: [PATCH 40/67] Cleaning up expts in association.R

---
 analysis/R/association.R | 100 ---------------------------------------
 analysis/R/decode.R      |   2 +-
 2 files changed, 1 insertion(+), 101 deletions(-)

diff --git a/analysis/R/association.R b/analysis/R/association.R
index 393b0e3a..aaf0a8a0 100644
--- a/analysis/R/association.R
+++ b/analysis/R/association.R
@@ -137,33 +137,6 @@ GetJointConditionalProb <- function(cond_x, cond_y) {
   mapply("outer", cond_x, cond_y, SIMPLIFY = FALSE)
 }
 
-UpdatePij2 <- function(pij, reports, cohorts, cand_strs,
-                       params, map) {
-
-  accum <- array(0, dim(pij))
-  # For each report
-  for (i in seq(length(reports[[1]]))) {
-    # For each var
-    for (var in seq(length(reports))) {
-      idx <- cohorts[[var]][i]
-      rep <- GetCondProb(reports[[var]][[i]],
-                         candidate_strings = cand_strs[[var]],
-                         params = params,
-                         map[[var]]$map[[idx]], NULL)
-      if(var == 1) {
-        cond_joint_distr <- rep
-      } else {
-        cond_joint_distr <- outer(cond_joint_distr, rep)
-      }
-    }
-    z <- cond_joint_distr * pij
-    z <- z / sum(z)
-    z[is.nan(z)] <- 0
-    accum <- accum + z
-  }
-  accum / length(reports[[1]])
-}
-
 UpdatePij <- function(pij, cond_prob) {
   # Update the probability matrix based on the EM algorithm.
   #
@@ -182,23 +155,6 @@ UpdatePij <- function(pij, cond_prob) {
   Reduce("+", wcp) / length(wcp)
 }
 
-UpdatePij3 <- function(pij, cond_prob) {
-  wcp <- lapply(cond_prob, function(x) {
-    for (i in seq(length(x))) {
-      if (i == 1) {
-        op <- x[[i]]
-      } else {
-        op <- outer(op, x[[i]])
-      }
-    }
-    z <- op * pij
-    z <- z / sum(z)
-    z[is.nan(z)] <- 0
-    z
-  })
-  Reduce("+", wcp) / length(wcp)
-}
-
 NLL <- function(pij, cond_prob) {
   # Update the probability matrix based on the EM algorithm.
   #
@@ -230,62 +186,6 @@ ComputeVar <- function(cond_prob, est) {
   list(var_cov = var_cov, sd = sd, inform = inform)
 }
 
-EM2 <- function(reports, cohorts, cand_strs, starting_pij = NULL,
-                params, map,
-                max_iter = 1e03, epsilon = 1e-06) {
-  
-  # State space is the product of lengths.
-  state_space <- sapply(cand_strs, "length")
-  pij <- array()
-  if(is.null(starting_pij)) {
-    pij <- array(1 / prod(state_space), state_space)
-  } else {
-    pij <- starting_pij
-  }
-
-  if (nrow(pij) > 0) {
-    # Run EM
-    for (i in 1:max_iter) {
-      pij_new <- UpdatePij2(pij, reports, cohorts, cand_strs,
-                        params, map)
-      diff <- max(abs(pij_new - pij))
-      pij <- pij_new
-      if (diff < epsilon) {
-        break
-      }
-    }
-  }
-  list(hist = pij)
-}
-
-EM3 <- function(cond_prob, starting_pij = NULL, estimate_var = FALSE,
-                max_iter = 1e03, epsilon = 1e-06, verbose = FALSE) {
-  pij <- list()
-  
-  # Compute dimensions of conditional distributions.
-  state_space <- sapply(cond_prob[[1]], length)
-  if (is.null(starting_pij)) {
-    pij <- array(1 / prod(state_space), state_space)
-  } else {
-    pij <- starting_pij
-  }
-  if (nrow(pij) > 0) {
-    # Run EM
-    for (i in 1:max_iter) {
-      if (i == 1) {
-        ptm_iter <- proc.time()
-      }
-      pij_new <- UpdatePij3(pij, cond_prob)
-      diff <- max(abs(pij_new - pij))
-      pij <- pij_new
-      if (diff < epsilon) {
-        break
-      }
-    }
-  }
-  list(est = pij, hist = pij, sd = 0)
-}
-
 EM <- function(cond_prob, starting_pij = NULL, estimate_var = FALSE,
                max_iter = 1000, epsilon = 10^-6, verbose = FALSE) {
   # Performs estimation.
diff --git a/analysis/R/decode.R b/analysis/R/decode.R
index f839ee01..adaa0b47 100644
--- a/analysis/R/decode.R
+++ b/analysis/R/decode.R
@@ -286,7 +286,7 @@ FitDistribution2 <- function(estimates_stds, map, fit) {
   Y <- as.vector(t(estimates_stds$estimates))
   m <- dim(X)[1]
   n <- dim(X)[2]
-  wt <- 1000  # weight to marginal constraints
+  wt <- 10000  # weight to marginal constraints
   
   G <- rbind2(Diagonal(n), rep(-1, n))
   H <- c(rep(0, n), -1)

From 09f22585e86c736945ea5817eca1a362523b124a Mon Sep 17 00:00:00 2001
From: Ananth Raghunathan <pseudorandom@google.com>
Date: Mon, 13 Jul 2015 11:01:03 -0700
Subject: [PATCH 41/67] Minor clean up.

---
 analysis/R/association.R   | 28 +++++-----------------------
 tests/analyze_assoc_expt.R |  8 ++------
 2 files changed, 7 insertions(+), 29 deletions(-)

diff --git a/analysis/R/association.R b/analysis/R/association.R
index aaf0a8a0..482cf918 100644
--- a/analysis/R/association.R
+++ b/analysis/R/association.R
@@ -292,8 +292,7 @@ ComputeDistributionEM <- function(reports, report_cohorts,
                                   maps, ignore_other = FALSE,
                                   params, quick = FALSE,
                                   marginals = NULL,
-                                  estimate_var = FALSE,
-                                  new_alg = FALSE) {
+                                  estimate_var = FALSE) {
   # Computes the distribution of num_variables variables, where
   #     num_variables is chosen by the client, using the EM algorithm.
   #
@@ -368,33 +367,16 @@ ComputeDistributionEM <- function(reports, report_cohorts,
       rep
     })
 
-    if(new_alg) {
-      # Report conditional distributions as lists
-      if (j == 1) {
-        # Conditional distribution for reports
-        joint_conditional <- lapply(cond_report_dist, "list")
-      } else {
-        joint_conditional <- mapply(function (x, y) c(x, list(y)),
-                                 joint_conditional, cond_report_dist,
-                                 SIMPLIFY = FALSE)
-      }
-    } else {
-      # Update the joint conditional distribution of all variables
-      joint_conditional <- UpdateJointConditional(cond_report_dist,
-                                                joint_conditional)
-    }
+    # Update the joint conditional distribution of all variables
+    joint_conditional <- UpdateJointConditional(cond_report_dist,
+                                              joint_conditional)
     print("TIME IN COND_REPORT_DIST")
     print(proc.time()-ptm)
   }
 
   ptm <- proc.time()
   # Run expectation maximization to find joint distribution
-  if (new_alg) {
-    funct <- EM3
-  } else {
-    funct <- EM
-  }
-  em <- funct(joint_conditional, epsilon = 10 ^ -5, verbose = FALSE,
+  em <- EM(joint_conditional, epsilon = 10 ^ -5, verbose = FALSE,
            estimate_var = estimate_var)
   print("TIME IN EM")
   print(proc.time() - ptm)
diff --git a/tests/analyze_assoc_expt.R b/tests/analyze_assoc_expt.R
index 8c2a5ee7..20504ff4 100755
--- a/tests/analyze_assoc_expt.R
+++ b/tests/analyze_assoc_expt.R
@@ -480,8 +480,7 @@ ExternalReportsEM <- function(inp) {
                                       ignore_other = TRUE,
                                       quick = TRUE,
                                       params, marginals = NULL,
-                                      estimate_var = FALSE,
-                                      new_alg = inp$newalg)
+                                      estimate_var = FALSE)
   em <- joint_dist$orig$fit
   td <- read.csv(file = inp$truefile, header = FALSE)
   td <- table(td[,2:3])
@@ -521,10 +520,7 @@ main <- function(opts) {
   # direct -> direct simulation of reports (without variances)
   # external-counts -> externally supplied counts for 2 way and marginals
   # external-reports -> externally supplied reports 
-  if (!(inp$expt %in% c("direct", "external-counts", "external-reports-em"))) {
-    stop("Incorrect experiment in JSON file.")
-  }
-  
+
   if("direct" %in% inp$expt) {
     print("---------- RUNNING EXPERIMENT DIRECT ----------")
     DirectSimulationOfReports(inp)

From 55b18c38c8d3af5dd07d098184bea950a8ea9018 Mon Sep 17 00:00:00 2001
From: Ananth Raghunathan <pseudorandom@google.com>
Date: Mon, 13 Jul 2015 13:08:19 -0700
Subject: [PATCH 42/67] Moving 2 way marginal code to its own file.

---
 analysis/R/association.R   |   4 +-
 analysis/R/decode.R        | 153 -------------------------------------
 tests/analyze_assoc_expt.R |   5 +-
 3 files changed, 5 insertions(+), 157 deletions(-)

diff --git a/analysis/R/association.R b/analysis/R/association.R
index 482cf918..2a19656f 100644
--- a/analysis/R/association.R
+++ b/analysis/R/association.R
@@ -376,12 +376,12 @@ ComputeDistributionEM <- function(reports, report_cohorts,
 
   ptm <- proc.time()
   # Run expectation maximization to find joint distribution
-  em <- EM(joint_conditional, epsilon = 10 ^ -5, verbose = FALSE,
+  em <- EM(joint_conditional, epsilon = 10 ^ -6, verbose = FALSE,
            estimate_var = estimate_var)
   print("TIME IN EM")
   print(proc.time() - ptm)
   dimnames(em$est) <- found_strings
 
   # Return results in a usable format
-  list(orig = list(fit = em$est, sd = em$sd, em = em))
+  list(fit = em$est, sd = em$sd, em = em)
 }
diff --git a/analysis/R/decode.R b/analysis/R/decode.R
index adaa0b47..1c5c327f 100644
--- a/analysis/R/decode.R
+++ b/analysis/R/decode.R
@@ -18,62 +18,6 @@
 library(glmnet)
 library(limSolve)
 
-Estimate2WayBloomCounts <- function(params, obs_counts) {
-  p <- params$p
-  q <- params$q
-  f <- params$f
-  m <- params$m
-  k <- params$k
-
-  stopifnot(m == nrow(obs_counts), params$k + 1 == ncol(obs_counts))
-
-  p11 <- q * (1 - f/2) + p * f / 2  # probability of a true 1 reported as 1
-  p01 <- p * (1 - f/2) + q * f / 2  # probability of a true 0 reported as 1
-  p10 <- 1 - p11  # probability of a true 1 reported as 0
-  p00 <- 1 - p01  # probability of a true 0 reported as 0
-
-  NoiseMatrix <- matrix(rep(0, 16), 4)
-  NoiseMatrix[1,] <- c(p11**2, p11*p10, p10*p11, p10**2)
-  NoiseMatrix[2,] <- c(p11*p01, p11*p00, p10*p01, p10*p00)
-  NoiseMatrix[3,] <- c(p01*p11, p01*p10, p00*p11, p00*p01)
-  NoiseMatrix[4,] <- c(p01**2, p00*p01, p01*p00, p00**2)
-
-  ests <- apply(obs_counts, 1, function(x) {
-    N <- x[1]
-    inds <- seq(0, (k/4)-1)
-    v <- x[-1]
-    sapply(inds, function(i){
-      as.vector(t(Solve(NoiseMatrix)) %*% v[(i*4 + 1):((i+1)*4)])
-    })
-  })
-
-  if(FALSE) {
-    # TODO(pseudorandom): Compute variances
-    variances <- apply(obs_counts, 1, function(x) {
-      N <- x[1]
-      v <- x[-1]
-      p_hats <- (v - p01 * N) / (N * p2)  # expectation of a true 1
-      p_hats <- pmax(0, pmin(1, p_hats))  # clamp to [0,1]
-      r <- p_hats * p11 + (1 - p_hats) * p01  # expectation of a reported 1
-      N * r * (1 - r) / p2^2  # variance of the binomial
-    })
-  }
-
-  # Transform counts from absolute values to fractional, removing bias due to
-  #      variability of reporting between cohorts.
-  ests <- apply(ests, 1, function(x) x / obs_counts[,1])
-  # stds <- apply(variances^.5, 1, function(x) x / obs_counts[,1])
-
-  # Some estimates may be set to infinity, e.g. if f=1. We want to
-  #     account for this possibility, and set the corresponding counts
-  #     to 0.
-  ests[abs(ests) == Inf] <- 0
-
-  list(estimates = ests,
-       stds = matrix(rep(100, length(ests[,1]) * length(ests[1,])),
-                     length(ests[,1])))
-}
-
 EstimateBloomCounts <- function(params, obs_counts) {
   # Estimates the number of times each bit in each cohort was set in original
   # Bloom filters.
@@ -268,69 +212,6 @@ ComputePrivacyGuarantees <- function(params, alpha, N) {
   privacy
 }
 
-# Implements lsei
-# FitDistribution <- function(estimates_stds, map, quiet = FALSE) {
-#   X <- map
-#   Y <- as.vector(t(estimates_stds$estimates))
-#   m <- dim(X)[1]
-#   n <- dim(X)[2]
-#   
-#   G <- rbind2(Diagonal(n), rep(-1, n))
-#   H <- c(rep(0, n), -1)
-#   lsei(A = X, B = Y, G = G, H = H, type = 2)$X
-# }
-
-FitDistribution2 <- function(estimates_stds, map, fit) {
-
-  X <- as.matrix(map)
-  Y <- as.vector(t(estimates_stds$estimates))
-  m <- dim(X)[1]
-  n <- dim(X)[2]
-  wt <- 10000  # weight to marginal constraints
-  
-  G <- rbind2(Diagonal(n), rep(-1, n))
-  H <- c(rep(0, n), -1)
-  
-  # Adding marginals constraints to X and Y
-  fstrs <- lapply(fit, function(x) x[,"string"])  # found strings
-  
-  Y <- c(Y, wt * t(fit[[1]]["proportion"]), wt * t(fit[[2]]["proportion"]))
-  
-  for (strs in fstrs[[1]]) {
-    indices <- which(colnames(map) %in% outer(strs,
-                                              fstrs[[2]],
-                                              function(x, y) paste(x, y, sep = "x")))
-    vec <- rep(0, n)
-    vec[indices] <- wt
-    X <- rbind2(X, vec)
-  }
-  for (strs in fstrs[[2]]) {
-    indices <- which(colnames(map) %in% outer(fstrs[[1]],
-                                              strs,
-                                              function(x, y) paste(x, y, sep = "x")))
-    vec <- rep(0, n)
-    vec[indices] <- wt
-    X <- rbind2(X, vec)
-  }
-  
-  lsei(A = X, B = Y, G = G, H = H, type = 2)$X
-  
-  # Random projection params
-#   size <- 10 * n
-#   density <- 0.05
-#   rproj <- matrix(0, size, m)
-#   rproj[sample(length(rproj), size = density * length(rproj))] <- rnorm(density * length(rproj))
-#   # rproj <- matrix(rnorm(10*n*m), 10*n, m)
-#   Xproj <- rproj %*% X
-#   Yproj <- as.vector(rproj %*% Y)
-#   mproj <- dim(Xproj)[1]
-#   nproj <- dim(Xproj)[2]
-#   
-#   G <- rbind2(Diagonal(nproj), rep(-1, nproj))
-#   H <- c(rep(0, nproj), -1)
-#   lsei(A = Xproj, B = Yproj, G = G, H = H, type = 2)$X
-}
-
 FitDistribution <- function(estimates_stds, map, quiet = FALSE) {
   # Find a distribution over rows of map that approximates estimates_stds best
   #
@@ -364,40 +245,6 @@ Resample <- function(e) {
   list(estimates = estimates, stds = stds)
 }
 
-Decode2Way <- function(counts, map, params, new_decode = FALSE, fit = NULL) {
-  k <- params$k
-  p <- params$p
-  q <- params$q
-  f <- params$f
-  h <- params$h
-  m <- params$m
-
-  S <- ncol(map)  # total number of candidates
-
-  N <- sum(counts[, 1])
-
-  filter_cohorts <- which(counts[, 1] != 0)  # exclude cohorts with zero reports
-
-  # stretch cohorts to bits
-  filter_bits <- as.vector(
-    t(matrix(1:nrow(map), nrow = m, byrow = TRUE)[filter_cohorts,]))
-
-  es <- Estimate2WayBloomCounts(params, counts)
-  e <- list(estimates = es$estimates[filter_cohorts, , drop = FALSE],
-            stds = es$stds[filter_cohorts, , drop = FALSE])
-  if (new_decode == TRUE) {
-    coefs <- FitDistribution2(e, map[filter_bits, , drop = FALSE], fit)
-  } else {
-    coefs <- FitDistribution(e, map[filter_bits, , drop = FALSE])
-  }
-  fit <- data.frame(String = colnames(map[filter_bits, , drop = FALSE]),
-                    Estimate = matrix(coefs, ncol = 1),
-                    SD = matrix(coefs, ncol = 1),
-                    stringsAsFactors = FALSE)
-  rownames(fit) <- fit[,"String"]
-  list(fit = fit)
-}
-
 Decode <- function(counts, map, params, quick = FALSE, alpha = 0.05,
                    correction = c("Bonferroni"), quiet = FALSE, ...) {
   k <- params$k
diff --git a/tests/analyze_assoc_expt.R b/tests/analyze_assoc_expt.R
index 20504ff4..d91dad21 100755
--- a/tests/analyze_assoc_expt.R
+++ b/tests/analyze_assoc_expt.R
@@ -35,6 +35,7 @@ if(!interactive()) {
   opts <- parse_args(OptionParser(option_list = option_list))
 }
 
+source("analysis/R/decode2way.R")
 source("analysis/R/encode.R")
 source("analysis/R/decode.R")
 source("analysis/R/simulation.R")
@@ -384,7 +385,7 @@ ExternalCounts <- function(inp, new_decode = FALSE) {
   pruned <- lapply(1:2, function(i)
     lapply(map[[i]]$map, function(z) z[,found_strings[[i]], drop = FALSE]))
   crmap <- CombineMaps(pruned[[1]], pruned[[2]])$crmap
-  marginal <- Decode2Way(counts[[1]], crmap, params2, new_decode = new_decode, fit = fit)$fit
+  marginal <- Decode2Way(counts[[1]], crmap, params2, fit = fit)$fit
   td <- read.csv(file = inp$truefile, header = FALSE)
   td <- table(td[,2:3])
   td <- td / sum(td)
@@ -481,7 +482,7 @@ ExternalReportsEM <- function(inp) {
                                       quick = TRUE,
                                       params, marginals = NULL,
                                       estimate_var = FALSE)
-  em <- joint_dist$orig$fit
+  em <- joint_dist$fit
   td <- read.csv(file = inp$truefile, header = FALSE)
   td <- table(td[,2:3])
   td <- td / sum(td)

From bcbacbe37daf6205897474783d3e246a1af5b6cd Mon Sep 17 00:00:00 2001
From: Ananth Raghunathan <pseudorandom@google.com>
Date: Mon, 13 Jul 2015 14:14:34 -0700
Subject: [PATCH 43/67] Small changes to sum_bits_assoc.

---
 analysis/tools/sum_bits_assoc.py | 27 ++++++++++++++-------------
 tests/regtest_spec.py            | 14 +++++++-------
 2 files changed, 21 insertions(+), 20 deletions(-)

diff --git a/analysis/tools/sum_bits_assoc.py b/analysis/tools/sum_bits_assoc.py
index acf5ea2c..b339473d 100755
--- a/analysis/tools/sum_bits_assoc.py
+++ b/analysis/tools/sum_bits_assoc.py
@@ -54,22 +54,23 @@ def SumBits(params, stdin, f_2way, f_1, f_2):
     # TODO: Extend checking for both reports
     if not len(irr_1) == params.num_bloombits:
       raise RuntimeError(
-          "Expected %d bits, got %r" % (params.num_bloombits, len(irr_1)))
+        "Expected %d bits in report 1, got %r" % 
+        (params.num_bloombits, len(irr_1)))
+    if not len(irr_2) == params.num_bloombits:
+      raise RuntimeError(
+        "Expected %d bits in report 2, got %r" % 
+        (params.num_bloombits, len(irr_2)))
     # "Unrolled" joint encoding of both reports
+    index_array = [[3, 1], [2, 0]]
     for i, c in enumerate(irr_1):
       for j, d in enumerate(irr_2):
         index = 4 * ((num_bloombits - i - 1) * params.num_bloombits +
                      num_bloombits - j - 1)
-        if (c == '1' and d == '1'):
-          sums[cohort][index] += 1
-        elif (c == '0' and d == '1'):
-          sums[cohort][index + 1] += 1
-        elif (c == '1' and d == '0'):
-          sums[cohort][index + 2] += 1
-        elif (c == '0' and d == '0'):
-          sums[cohort][index + 3] += 1
-        else:
-          raise RuntimeError('Invalid IRRs -- digits should be 0 or 1')
+        try: 
+          diff = index_array[int(c)][int(d)]
+        except IndexError:
+          raise RuntimeError('Invalid IRRs; digits should be 0/1')
+        sums[cohort][index + diff] += 1
 
     for i, c in enumerate(irr_1):
       bit_num = num_bloombits - i - 1  # e.g. char 0 = bit 15, char 15 = bit 0
@@ -77,7 +78,7 @@ def SumBits(params, stdin, f_2way, f_1, f_2):
         sums_1[cohort][bit_num] += 1
       else:
         if c != '0':
-          raise RuntimeError('Invalid IRR -- digits should be 0 or 1')
+          raise RuntimeError('Invalid IRRs; digits should be 0/1')
 
     for i, c in enumerate(irr_2):
       bit_num = num_bloombits - i - 1  # e.g. char 0 = bit 15, char 15 = bit 0
@@ -85,7 +86,7 @@ def SumBits(params, stdin, f_2way, f_1, f_2):
         sums_2[cohort][bit_num] += 1
       else:
         if c != '0':
-          raise RuntimeError('Invalid IRR -- digits should be 0 or 1')
+          raise RuntimeError('Invalid IRRs; digits should be 0/1')
 
   for cohort in xrange(num_cohorts):
     # First column is the total number of reports in the cohort.
diff --git a/tests/regtest_spec.py b/tests/regtest_spec.py
index d1ee7ebb..6e0a602e 100755
--- a/tests/regtest_spec.py
+++ b/tests/regtest_spec.py
@@ -134,13 +134,13 @@
 #            'fizz-small-bool',
 #            'fizz',
 #            'fizz-bool',),
-#            'toy',),
-            'compact-noextra-small',
-            'loose-noextra-small',
-            'compact-extra-small',
-            'loose-extra-small',
-            'compact-excess-small',
-            'loose-excess-small',),
+            'toy',),
+#            'compact-noextra-small',
+#            'loose-noextra-small',
+#            'compact-extra-small',
+#            'loose-extra-small',
+#            'compact-excess-small',
+#            'loose-excess-small',),
 #            'compact-noextra-large',
 #            'loose-noextra-large',
 #            'compact-extra-large',

From a155be8aea4b1dd9ef282392a3fd57acec149b28 Mon Sep 17 00:00:00 2001
From: Ananth Raghunathan <pseudorandom@google.com>
Date: Mon, 13 Jul 2015 15:56:04 -0700
Subject: [PATCH 44/67] Merging from master branch.

---
 analysis/R/decode.R | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/analysis/R/decode.R b/analysis/R/decode.R
index c8f32fd1..86f09303 100644
--- a/analysis/R/decode.R
+++ b/analysis/R/decode.R
@@ -102,6 +102,8 @@ FitLasso <- function(X, Y, intercept = TRUE) {
   # is to avoid overfitting.
   cap <- min(500, nrow(X) * .8, ncol(X))
 
+  # TODO: take care of corner case when ncol(X) == 1
+  # currently glmnet() fails
   mod <- glmnet(X, Y, standardize = FALSE, intercept = intercept,
                 lower.limits = 0,  # outputs are non-negative
                 pmax = cap)
@@ -244,7 +246,7 @@ Resample <- function(e) {
   list(estimates = estimates, stds = stds)
 }
 
-Decode <- function(counts, map, params, alpha = 0.05,
+Decode <- function(counts, map, params, alpha = 0.05, quick = FALSE,
                    correction = c("Bonferroni"), quiet = FALSE, ...) {
   k <- params$k
   p <- params$p

From 00b827b2b0d21ec9a8271d594bd2e262d311fa69 Mon Sep 17 00:00:00 2001
From: Ananth Raghunathan <pseudorandom@google.com>
Date: Tue, 14 Jul 2015 14:05:59 -0700
Subject: [PATCH 45/67] Some code refactoring.

- moved 2-way association code to its own file
- moved specs related to assoctest to its own file
- other minor changes
---
 analysis/R/decode2way.R          | 196 +++++++++++++++++++++++++++++++
 analysis/tools/sum_bits_assoc.py |  10 +-
 assoctest.sh                     |  28 ++---
 tests/assoctest_spec.py          | 137 +++++++++++++++++++++
 tests/regtest_spec.py            |  90 --------------
 5 files changed, 350 insertions(+), 111 deletions(-)
 create mode 100644 analysis/R/decode2way.R
 create mode 100755 tests/assoctest_spec.py

diff --git a/analysis/R/decode2way.R b/analysis/R/decode2way.R
new file mode 100644
index 00000000..63bb8f69
--- /dev/null
+++ b/analysis/R/decode2way.R
@@ -0,0 +1,196 @@
+# Copyright 2015 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#
+# This library implements RAPPOR decoding algorithms for 2 way association.
+#
+
+library(limSolve)
+source("analysis/R/decode.R")
+
+EstimateBloomCounts2Way <- function(params, obs_counts) {
+  # Estimates original bloom filter counts of each pair of bits
+  # in the original bloom filters of each report
+  #
+  # Input:
+  #    params: a list of RAPPOR parameters:
+  #            k - number of pair-count columns, 4 * (Bloom filter size)^2
+  #            h - number of hash functions
+  #            m - number of cohorts
+  #            p - P(IRR = 1 | PRR = 0)
+  #            q - P(IRR = 1 | PRR = 1)
+  #            f - Proportion of bits in the Bloom filter that are set randomly
+  #                to 0 or 1 regardless of the underlying true bit value
+  #    obs_counts: a matrix of size m by (k + 1). Column one contains sample
+  #                sizes for each cohort. Other counts indicate how many times
+  #                pairs of bits {11, 10, 01, 00} were set across the two
+  #                reports (in a "1st report"-major order)
+  #
+  # Output:
+  #    estimates: an m by k matrix of estimates, as fractions of cohort size
+  #    stds: currently, just a filler value of 100
+  
+  p <- params$p
+  q <- params$q
+  f <- params$f
+  m <- params$m
+  k <- params$k
+  
+  stopifnot(m == nrow(obs_counts), params$k + 1 == ncol(obs_counts))
+  
+  p11 <- q * (1 - f/2) + p * f / 2  # probability of a true 1 reported as 1
+  p01 <- p * (1 - f/2) + q * f / 2  # probability of a true 0 reported as 1
+  p10 <- 1 - p11  # probability of a true 1 reported as 0
+  p00 <- 1 - p01  # probability of a true 0 reported as 0
+  
+  # NoiseMatrix[i, j] = P(reported pair j | true pair i), with pairs ordered
+  # {11, 10, 01, 00}; every row is a probability distribution (sums to 1).
+  NoiseMatrix <- matrix(rep(0, 16), 4)
+  NoiseMatrix[1,] <- c(p11**2, p11*p10, p10*p11, p10**2)
+  NoiseMatrix[2,] <- c(p11*p01, p11*p00, p10*p01, p10*p00)
+  NoiseMatrix[3,] <- c(p01*p11, p01*p10, p00*p11, p00*p10)
+  NoiseMatrix[4,] <- c(p01**2, p00*p01, p01*p00, p00**2)
+  
+  # Apply the inverse of NoiseMatrix to get an unbiased estimator for
+  # the number of times input pairs of bits were seen.
+  # Apply the matrix to 4 values at a time from obs_counts
+  ests <- apply(obs_counts, 1, function(x) {
+    # x[1] is the cohort's sample size; the remaining entries are pair counts
+    inds <- seq(0, (k/4)-1)
+    v <- x[-1]
+    sapply(inds, function(i){
+      as.vector(t(solve(NoiseMatrix)) %*% v[(i*4 + 1):((i+1)*4)])
+    })
+  })
+  
+  # Transform counts from absolute values to fractional, removing bias due to
+  #      variability of reporting between cohorts.
+  ests <- apply(ests, 1, function(x) x / obs_counts[,1])
+  # TODO: compute stddev in distribution induced by estimation
+  # stds <- apply(variances^.5, 1, function(x) x / obs_counts[,1])
+  
+  # Some estimates may be set to infinity, e.g. if f=1. We want to
+  #     account for this possibility, and set the corresponding counts
+  #     to 0.
+  ests[abs(ests) == Inf] <- 0
+  
+  list(estimates = ests,
+       stds = matrix(rep(100, length(ests[,1]) * length(ests[1,])),
+                     length(ests[,1])))
+}
+
+# Least-squares fit of map's columns to the estimates via lsei (fit, quiet unused)
+FitDistribution2Way <- function(estimates_stds, map,
+                                fit = NULL,
+                                quiet = FALSE) {
+  X <- map
+  Y <- as.vector(t(estimates_stds$estimates))
+  m <- dim(X)[1]
+  n <- dim(X)[2]
+  # lsei takes inequalities as G %*% x >= H: each x >= 0 and sum(x) <= 1.
+  G <- rbind2(Diagonal(n), rep(-1, n))
+  H <- c(rep(0, n), -1)
+  lsei(A = X, B = Y, G = G, H = H, type = 2)$X
+}
+
+# FitDistribution2Way <- function(estimates_stds, map, fit) {
+#   # Find a distribution over rows of map that approximates estimates_stds best
+#   #
+#   # Input:
+#   #   estimates_stds: a list of two m x k matrices, one for estimates, another
+#   #                   for standard errors
+#   #   map           : an (m * k) x S boolean matrix
+#   #
+#   # Output:
+#   #   a float vector of length S, so that a distribution over map's rows sampled
+#   #   according to this vector approximates estimates
+#   
+#   X <- as.matrix(map)
+#   Y <- as.vector(t(estimates_stds$estimates))
+#   m <- dim(X)[1]
+#   n <- dim(X)[2]
+#   wt <- 10000  # weight to marginal constraints
+#   
+#   G <- rbind2(Diagonal(n), rep(-1, n))
+#   H <- c(rep(0, n), -1)
+#   
+#   # Adding marginals constraints to X and Y
+#   fstrs <- lapply(fit, function(x) x[,"string"])  # found strings
+#   
+#   Y <- c(Y, wt * t(fit[[1]]["proportion"]), wt * t(fit[[2]]["proportion"]))
+#   
+#   for (strs in fstrs[[1]]) {
+#     indices <- which(colnames(map) %in% outer(strs,
+#                                               fstrs[[2]],
+#                                               function(x, y) paste(x, y, sep = "x")))
+#     vec <- rep(0, n)
+#     vec[indices] <- wt
+#     X <- rbind2(X, vec)
+#   }
+#   for (strs in fstrs[[2]]) {
+#     indices <- which(colnames(map) %in% outer(fstrs[[1]],
+#                                               strs,
+#                                               function(x, y) paste(x, y, sep = "x")))
+#     vec <- rep(0, n)
+#     vec[indices] <- wt
+#     X <- rbind2(X, vec)
+#   }
+#   
+#   lsei(A = X, B = Y, G = G, H = H, type = 2)$X
+  
+  # Random projection params
+  #   size <- 10 * n
+  #   density <- 0.05
+  #   rproj <- matrix(0, size, m)
+  #   rproj[sample(length(rproj), size = density * length(rproj))] <- rnorm(density * length(rproj))
+  #   # rproj <- matrix(rnorm(10*n*m), 10*n, m)
+  #   Xproj <- rproj %*% X
+  #   Yproj <- as.vector(rproj %*% Y)
+  #   mproj <- dim(Xproj)[1]
+  #   nproj <- dim(Xproj)[2]
+  #   
+  #   G <- rbind2(Diagonal(nproj), rep(-1, nproj))
+  #   H <- c(rep(0, nproj), -1)
+  #   lsei(A = Xproj, B = Yproj, G = G, H = H, type = 2)$X
+# }
+
+Decode2Way <- function(counts, map, params, fit = NULL) {
+  k <- params$k
+  p <- params$p
+  q <- params$q
+  f <- params$f
+  h <- params$h
+  m <- params$m
+  # Decode 2-way counts: returns list(fit = data.frame(String, Estimate, SD)).
+  S <- ncol(map)  # total number of candidates
+  # NOTE: k, p, q, f, h, S and N are assigned in this function but never read.
+  N <- sum(counts[, 1])
+  
+  filter_cohorts <- which(counts[, 1] != 0)  # exclude cohorts with zero reports
+  
+  # stretch cohorts to bits
+  filter_bits <- as.vector(
+    t(matrix(1:nrow(map), nrow = m, byrow = TRUE)[filter_cohorts,]))
+  # De-noise the pair counts, keep reporting cohorts, then fit the candidates.
+  es <- EstimateBloomCounts2Way(params, counts)
+  e <- list(estimates = es$estimates[filter_cohorts, , drop = FALSE],
+            stds = es$stds[filter_cohorts, , drop = FALSE])
+  coefs <- FitDistribution2Way(e, map[filter_bits, , drop = FALSE], fit)
+  fit <- data.frame(String = colnames(map[filter_bits, , drop = FALSE]),
+                    Estimate = matrix(coefs, ncol = 1),
+                    SD = matrix(coefs, ncol = 1),  # filler: duplicates Estimate
+                    stringsAsFactors = FALSE)
+  rownames(fit) <- fit[,"String"]
+  list(fit = fit)
+}
diff --git a/analysis/tools/sum_bits_assoc.py b/analysis/tools/sum_bits_assoc.py
index b339473d..9bdd7f95 100755
--- a/analysis/tools/sum_bits_assoc.py
+++ b/analysis/tools/sum_bits_assoc.py
@@ -1,6 +1,6 @@
 #!/usr/bin/python
 #
-# Copyright 2014 Google Inc. All rights reserved.
+# Copyright 2015 Google Inc. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -15,8 +15,12 @@
 # limitations under the License.
 
 """
-Read the RAPPOR'd values on stdin, and sum the bits to produce a Counting Bloom
-filter by cohort.  This can then be analyzed by R.
+Read RAPPOR values of 2 variables from stdin.
+Read parameters from parameter file and a prefix.
+
+Output counts of bloom filter bits set for each variable (1-way totals)
+and counts of pairwise bits set (2-way totals) into files with suffixes
+_marg1.csv, _marg2.csv, _2way.csv respectively.
 """
 
 import csv
diff --git a/assoctest.sh b/assoctest.sh
index 492d949a..96bf597d 100755
--- a/assoctest.sh
+++ b/assoctest.sh
@@ -11,9 +11,7 @@
 #    run [[<pattern> [<num>]] - run tests matching <pattern> in
 #                               parallel, each <num> times.
 #
-#    ## run-seq currently not supported!
 #    run-seq [<pattern> [<num>]] - ditto, except that tests are run sequentially
-#    ## --
 #
 #    run-all [<num>]             - run all tests, in parallel, each <num> times
 #
@@ -27,6 +25,8 @@
 # use $ in the pattern, since it matches the whole spec line and not just the
 # test case name.) The number of processors used in a parallel run is one less
 # than the number of CPUs on the machine.
+#
+# fast_counts param inherited from regtest.sh, but currently not used
 
 
 set -o nounset
@@ -276,6 +276,8 @@ _run-tests() {
     func=_run-one-instance-logged
     processors=$(grep -c ^processor /proc/cpuinfo || echo 4)  # POSIX-specific
     if test $processors -gt 6; then  # leave few CPUs for the OS
+      # Association tests take up a lot of memory; so restricted to a few
+      # processes at a time
       processors=5
     else
       processors=1
@@ -284,7 +286,7 @@ _run-tests() {
   fi
 
   local cases_list=$ASSOCTEST_DIR/test-cases.txt
-  tests/regtest_spec.py | grep -E $spec_regex > $cases_list
+  tests/assoctest_spec.py | grep -E $spec_regex > $cases_list
 
   # Generate parameters for all test cases.
   cat $cases_list \
@@ -305,22 +307,12 @@ _run-tests() {
 }
 
 # Run tests sequentially
-#run-seq() {
-#  local spec_regex=${1:-'^r-'}  # grep -E format on the spec
-#  local instances=${2:-1}
-#  local fast_counts=${3:-T}
-#
-#  _run-tests $spec_regex $instances F $fast_counts
-#}
+run-seq() {
+  local spec_regex=${1:-'^a-'}  # grep -E format on the spec
+  local instances=${2:-1}
 
-# Run tests in parallel
-#run() {
-#  local spec_regex=${1:-'^r-'}  # grep -E format on the spec
-#  local instances=${2:-1}
-#  local fast_counts=${3:-T}
-#
-#  _run-tests $spec_regex $instances T $fast_counts
-#}
+  _run-tests $spec_regex $instances F T
+}
 
 # Run tests in parallel
 run-all() {
diff --git a/tests/assoctest_spec.py b/tests/assoctest_spec.py
new file mode 100755
index 00000000..3d1642dd
--- /dev/null
+++ b/tests/assoctest_spec.py
@@ -0,0 +1,137 @@
+#!/usr/bin/python
+"""Print a test spec on stdout.
+
+Each line has parameters for a test case.  The assoctest.sh shell script reads
+these lines and runs parallel processes.
+
+We use Python data structures so the test cases are easier to read and edit.
+"""
+
+import optparse
+import sys
+
+DISTRIBUTION_PARAMS_ASSOC = {
+    # name: (num unique values 1, num unique values 2,
+    #        num clients[, num extra candidates])
+    'tiny': (100, 2, int(1e03)),   # test for insufficient data
+    'small': (100, 10, int(1e04)),
+#    'fizz-tiny': (100, 20, int(1e03)),
+#    'fizz-tiny-bool': (100, 2, int(1e03)),
+#    'fizz-small': (100, 20, int(1e04)),
+#    'fizz-small-bool': (100, 2, int(1e04)),
+#    'fizz': (100, 20, int(1e05)),
+#    'fizz-large': (100, 50, int(1e05)),
+#    'fizz-2large': (100, 50, int(5e05)),
+#    'fizz-bool': (100, 2, int(1e05)),
+    'medium': (1000, 10, int(1e05)),
+    'medium2': (1000, 2, int(1e05)),
+    'large': (10000, 10, int(1e06)),
+    'large2': (10000, 2, int(1e06)),
+    'largesquared': (int(1e04), 100, int(1e06)),
+
+    # new test names for 2-way marginals
+    # includes testing for extras
+    'fizz-tiny': (100, 20, int(1e03), int(1e04)),
+    'fizz-tiny-bool': (100, 2, int(1e03), int(1e04)),
+    'fizz-small': (100, 20, int(1e04), int(1e04)),
+    'fizz-small-bool': (100, 2, int(1e04), int(1e04)),
+    'fizz': (100, 20, int(1e05), int(1e04)),
+    'fizz-bool': (100, 2, int(1e05), int(1e04)),
+
+    'toy': (5, 2, 1e04, 20),  # for testing purposes only
+    'compact-noextra-small': (40, 5, 1e04, 0),
+    'loose-noextra-small': (100, 20, 1e04, 0),
+    'compact-noextra-large': (40, 5, 1e06, 0),
+    'loose-noextra-large': (100, 20, 1e06, 0),
+    'compact-extra-small': (40, 5, int(1e04), int(1e04)),
+    'loose-extra-small': (100, 20, int(1e04), int(1e04)),
+    'compact-extra-large': (40, 5, int(1e06), int(1e04)),
+    'loose-extra-large': (100, 20, int(1e06), int(1e04)),
+    'compact-excess-small': (40, 5, int(1e04), int(1e05)),
+    'loose-excess-small': (100, 20, int(1e04), int(1e05)),
+    'compact-excess-large': (40, 5, int(1e06), int(1e05)),
+    'loose-excess-large': (100, 20, int(1e06), int(1e05)),
+}
+
+# 'k, h, m' as in params file.
+BLOOMFILTER_PARAMS = {
+    '8x16': (8, 2, 16),  # 16 cohorts, 8 bits each, 2 bits set in each
+    '8x32': (8, 2, 32),  # 32 cohorts, 8 bits each, 2 bits set in each
+    '16x32': (16, 2, 32),  # 32 cohorts, 16 bits each, 2 bits set in each
+    '8x128': (8, 2, 128),  # 128 cohorts, 8 bits each, 2 bits set in each
+    '128x128': (128, 2, 128),  # 128 cohorts, 128 bits each, 2 bits set in each
+}
+
+# 'p, q, f' as in params file.
+PRIVACY_PARAMS = {
+    'eps_zero': (0, 0.99, 0),  # testing purposes only!
+    'eps_1_1': (0.39, 0.61, 0.45),  # eps_1 = 1, eps_inf = 5:
+    'eps_1_5': (0.225, 0.775, 0.0),  # eps_1 = 5, no eps_inf
+    'eps_verysmall': (0.125, 0.875, 0.125),
+    'eps_small': (0.125, 0.875, 0.5),
+    'eps_chrome': (0.25, 0.75, 0.5),
+    'uma_rappor_type': (0.50, 0.75, 0.5),
+}
+
+# assoc test configuration ->
+#   (distribution params set, bloomfilter params set,
+#    privacy params set)
+# The test config runs a test suite that is the cross product of all the above
+# sets
+ASSOC_TEST_CONFIG = {
+  'distr': (
+#            'fizz-tiny',
+#            'fizz-tiny-bool',
+#            'fizz-small',
+#            'fizz-small-bool',
+#            'fizz',
+#            'fizz-bool',),
+            'toy',),
+#            'compact-noextra-small',
+#            'loose-noextra-small',
+#            'compact-extra-small',
+#            'loose-extra-small',
+#            'compact-excess-small',
+#            'loose-excess-small',),
+#            'compact-noextra-large',
+#            'loose-noextra-large',
+#            'compact-extra-large',
+#            'loose-extra-large',
+#            'compact-excess-large',
+#            'loose-excess-large'),
+  'blooms': (
+             '8x32',
+             '16x32',),
+  'privacy': (
+              'eps_small',
+              'eps_chrome',)
+}
+
+#
+# END TEST CONFIGURATION
+#
+
+def main(argv):
+  """Print one space-separated spec line per association test case."""
+  rows = []
+  # Association tests: cross product of the configured parameter sets
+  for distr in ASSOC_TEST_CONFIG['distr']:
+    for blooms in ASSOC_TEST_CONFIG['blooms']:
+      for privacy in ASSOC_TEST_CONFIG['privacy']:
+        print >>sys.stderr, distr, blooms, privacy  # keep stdout spec-only
+        test_name = 'a-{}-{}-{}'.format(distr, blooms, privacy)
+        params = (BLOOMFILTER_PARAMS[blooms] +
+                  PRIVACY_PARAMS[privacy])
+        test_case = (test_name,) + DISTRIBUTION_PARAMS_ASSOC[distr] + params
+        row_str = [str(element) for element in test_case]
+        rows.append(row_str)
+
+  for row in rows:
+    print ' '.join(row)
+
+if __name__ == '__main__':
+  try:
+    main(sys.argv)
+  except RuntimeError, e:
+    print >>sys.stderr, 'FATAL: %s' % e
+    sys.exit(1)
diff --git a/tests/regtest_spec.py b/tests/regtest_spec.py
index 6e0a602e..6350ae7a 100755
--- a/tests/regtest_spec.py
+++ b/tests/regtest_spec.py
@@ -41,49 +41,6 @@
     ('large', 10000, 100000000, 1),
 )
 
-DISTRIBUTION_PARAMS_ASSOC = {
-    # name, num unique values 1,
-    # num unique values 2, num clients
-    'tiny': (100, 2, int(1e03)),   # test for insufficient data
-    'small': (100, 10, int(1e04)),
-#    'fizz-tiny': (100, 20, int(1e03)),
-#    'fizz-tiny-bool': (100, 2, int(1e03)),
-#    'fizz-small': (100, 20, int(1e04)),
-#    'fizz-small-bool': (100, 2, int(1e04)),
-#    'fizz': (100, 20, int(1e05)),
-#    'fizz-large': (100, 50, int(1e05)),
-#    'fizz-2large': (100, 50, int(5e05)),
-#    'fizz-bool': (100, 2, int(1e05)),
-    'medium': (1000, 10, int(1e05)),
-    'medium2': (1000, 2, int(1e05)),
-    'large': (10000, 10, int(1e06)),
-    'large2': (10000, 2, int(1e06)),
-    'largesquared': (int(1e04), 100, int(1e06)),
-
-    # new test names for 2-way marginals
-    # includes testing for extras
-    'fizz-tiny': (100, 20, int(1e03), int(1e04)),
-    'fizz-tiny-bool': (100, 2, int(1e03), int(1e04)),
-    'fizz-small': (100, 20, int(1e04), int(1e04)),
-    'fizz-small-bool': (100, 2, int(1e04), int(1e04)),
-    'fizz': (100, 20, int(1e05), int(1e04)),
-    'fizz-bool': (100, 2, int(1e05), int(1e04)),
-
-    'toy': (5, 2, 1e04, 20),  # for testing purposes only
-    'compact-noextra-small': (40, 5, 1e04, 0),
-    'loose-noextra-small': (100, 20, 1e04, 0),
-    'compact-noextra-large': (40, 5, 1e06, 0),
-    'loose-noextra-large': (100, 20, 1e06, 0),
-    'compact-extra-small': (40, 5, int(1e04), int(1e04)),
-    'loose-extra-small': (100, 20, int(1e04), int(1e04)),
-    'compact-extra-large': (40, 5, int(1e06), int(1e04)),
-    'loose-extra-large': (100, 20, int(1e06), int(1e04)),
-    'compact-excess-small': (40, 5, int(1e04), int(1e05)),
-    'loose-excess-small': (100, 20, int(1e04), int(1e05)),
-    'compact-excess-large': (40, 5, int(1e06), int(1e05)),
-    'loose-excess-large': (100, 20, int(1e06), int(1e05)),
-}
-
 # 'k, h, m' as in params file.
 BLOOMFILTER_PARAMS = {
     '8x16': (8, 2, 16),  # 16 cohorts, 8 bits each, 2 bits set in each
@@ -121,40 +78,6 @@
     ('over_x10', '8x128', 'eps_1_1', 10.0, '10%'),  # overshoot by x10
 ]
 
-# assoc test configuration ->
-#   (distribution params set, bloomfilter params set,
-#    privacy params set)
-# The test config runs a test suite that is the cross product of all the above
-# sets
-ASSOC_TEST_CONFIG = {
-  'distr': (
-#            'fizz-tiny',
-#            'fizz-tiny-bool',
-#            'fizz-small',
-#            'fizz-small-bool',
-#            'fizz',
-#            'fizz-bool',),
-            'toy',),
-#            'compact-noextra-small',
-#            'loose-noextra-small',
-#            'compact-extra-small',
-#            'loose-extra-small',
-#            'compact-excess-small',
-#            'loose-excess-small',),
-#            'compact-noextra-large',
-#            'loose-noextra-large',
-#            'compact-extra-large',
-#            'loose-extra-large',
-#            'compact-excess-large',
-#            'loose-excess-large'),
-  'blooms': (
-             '8x32',
-             '16x32',),
-  'privacy': (
-              'eps_small',
-              'eps_chrome',)
-}
-
 #
 # END TEST CONFIGURATION
 #
@@ -184,19 +107,6 @@ def main(argv):
   for params in DEMO:
     rows.append(params)
 
-  # Association tests
-  for distr in ASSOC_TEST_CONFIG['distr']:
-    for blooms in ASSOC_TEST_CONFIG['blooms']:
-      for privacy in ASSOC_TEST_CONFIG['privacy']:
-        print distr, blooms, privacy
-        test_name = 'a-{}-{}-{}'.format(distr, blooms, privacy)
-        params = (BLOOMFILTER_PARAMS[blooms] +
-                  PRIVACY_PARAMS[privacy])
-        test_case = (test_name,) + DISTRIBUTION_PARAMS_ASSOC[distr] + params
-        row_str = [str(element) for element in test_case]
-        rows.append(row_str)
-  # End of association tests
-
   for row in rows:
     print ' '.join(row)
 

From 3e812611f475bb34e04cfa0f6bded64947ffdeb9 Mon Sep 17 00:00:00 2001
From: Ananth Raghunathan <pseudorandom@google.com>
Date: Tue, 14 Jul 2015 14:08:27 -0700
Subject: [PATCH 46/67] Updated some documentation.

---
 assoctest.sh              | 9 ++++-----
 tests/rappor_assoc_sim.py | 2 +-
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/assoctest.sh b/assoctest.sh
index 96bf597d..65fd9549 100755
--- a/assoctest.sh
+++ b/assoctest.sh
@@ -16,15 +16,14 @@
 #    run-all [<num>]             - run all tests, in parallel, each <num> times
 #
 # Examples:
-# $ ./regtest.sh run-seq tiny-8x16-  # Sequential run, matches 2 cases
-# $ ./regtest.sh run-seq tiny-8x16- 3  # Sequential, each test is run three
+# $ ./assoctest.sh run-seq tiny-8x16-     # Sequential run, matches 2 cases
+# $ ./assoctest.sh run-seq tiny-8x16- 3   # Sequential, each test is run three
 #                                           times
-# $ ./regtest.sh run-all     # Run all tests once
+# $ ./assoctest.sh run-all                # Run all tests once
 #
 # The <pattern> argument is a regex in 'grep -E' format. (Detail: Don't
 # use $ in the pattern, since it matches the whole spec line and not just the
-# test case name.) The number of processors used in a parallel run is one less
-# than the number of CPUs on the machine.
+# test case name.) A parallel run uses 5 processes (1 if the machine has <= 6 CPUs).
 #
 # fast_counts param inherited from regtest.sh, but currently not used
 
diff --git a/tests/rappor_assoc_sim.py b/tests/rappor_assoc_sim.py
index 1c6c026d..178bc509 100755
--- a/tests/rappor_assoc_sim.py
+++ b/tests/rappor_assoc_sim.py
@@ -1,6 +1,6 @@
 #!/usr/bin/python
 #
-# Copyright 2014 Google Inc. All rights reserved.
+# Copyright 2015 Google Inc. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

From a67151bdb4f9c98f3ad17d3619e02440ded6a4b5 Mon Sep 17 00:00:00 2001
From: Ananth Raghunathan <pseudorandom@google.com>
Date: Wed, 15 Jul 2015 15:22:17 -0700
Subject: [PATCH 47/67] A few code changes.

- moving some map code to read_input.R
- analyze_assoc_expt now includes a verbose mode
---
 analysis/R/read_input.R    |  17 +++++
 assoctest.sh               |   3 +-
 tests/analyze_assoc_expt.R | 140 ++++++++++++++++++-------------------
 tests/assoctest_spec.py    |  14 ++--
 4 files changed, 95 insertions(+), 79 deletions(-)

diff --git a/analysis/R/read_input.R b/analysis/R/read_input.R
index 95ea1b0d..35c2cead 100644
--- a/analysis/R/read_input.R
+++ b/analysis/R/read_input.R
@@ -101,6 +101,23 @@ ReadMapFile <- function(map_file, params = NULL, quote = "") {
   list(map = map, strs = strs, map_pos = map_pos)
 }
 
+# This function processes the maps loaded using ReadMapFile.
+# Association analysis requires a map object with a map
+# field that has the map split into cohorts (a list of m blocks of
+# k rows each) and an rmap field that has all the cohorts combined.
+# Arguments:
+#       map = map object with cohorts as sparse matrix in
+#             object map$map
+#             This is the expected object from ReadMapFile
+#       params = data field with parameters (uses params$m and params$k)
+ProcessMap <- function(map, params) {
+  map$rmap <- map$map
+  map$map <- lapply(seq_len(params$m), function(i)  # seq_len: empty when m is 0
+    map$rmap[seq(from = ((i - 1) * params$k + 1),
+                 length.out = params$k), , drop = FALSE])  # stay 2-D when k is 1
+  map
+}
+
 LoadMapFile <- function(map_file, params = NULL, quote = "") {
   # Reads the map file and creates an R binary .rda. If the .rda file already
   # exists, just loads that file. NOTE: It assumes the map file is
diff --git a/assoctest.sh b/assoctest.sh
index 65fd9549..e1fe9446 100755
--- a/assoctest.sh
+++ b/assoctest.sh
@@ -176,10 +176,11 @@ _run-one-instance() {
     inp['num'] = $num_clients; \
     inp['extras'] = $num_extras; \
     inp['varcandidates'] = [$num_unique_values, $num_unique_values2]; \
+    inp['verbose'] = 'false'; \
     inp['counts'] = ['$instance_dir/case_2way.csv',\
                      '$instance_dir/case_marg1.csv',\
                      '$instance_dir/case_marg2.csv']; \
-    inp['expt'] = ['external-counts', 'external-counts-new']; \
+    inp['expt'] = ['external-counts', 'external-reports-em']; \
     json.dump(inp, f); \
     f.close();"
 
diff --git a/tests/analyze_assoc_expt.R b/tests/analyze_assoc_expt.R
index d91dad21..bbda4204 100755
--- a/tests/analyze_assoc_expt.R
+++ b/tests/analyze_assoc_expt.R
@@ -43,22 +43,11 @@ source("analysis/R/read_input.R")
 source("analysis/R/association.R")
 source("tests/gen_counts.R")
 
-# This function processes the maps loaded using ReadMapFile
-# Association analysis requires a map object with a map
-# field that has the map split into cohorts and an rmap field
-# that has all the cohorts combined
-# Arguments:
-#       map = map object with cohorts as sparse matrix in
-#             object map$map
-#             This is the expected object from ReadMapFile
-#       params = data field with parameters
-# TODO(pseudorandom): move this functionality to ReadMapFile
-ProcessMap <- function(map, params) {
-  map$rmap <- map$map
-  map$map <- lapply(1:params$m, function(i)
-                          map$rmap[seq(from = ((i - 1) * params$k + 1),
-                                   length.out = params$k),])
-  map
+# Print `string` only when `flag` is truthy; accepts TRUE or JSON strings like "true"
+PrintIfVerbose <- function(string, flag = FALSE) {
+  if (isTRUE(as.logical(flag))) {  # "true" == TRUE is FALSE in R, so coerce first
+    print(string)
+  }
 }
 
 # TV distance = L1 distance / 2 = 1 - sum(min(df1|x, df2|x)) where
@@ -90,7 +79,6 @@ CombineReports <- function(reports1, reports2) {
          function(x) as.vector(sapply(x, function(z) two_bits[[z+1]])))
 }
 
-
 # Given 2 lists of maps, maps1 and maps2, the function
 # combines the maps by cohort and outputs both
 # cohort-organized maps and flattened versions
@@ -203,19 +191,25 @@ GenerateNoiseMatrix <- function(params) {
   NoiseMatrix
 }
 
-# ------------------------------------------------------------------------
+#####################################################################
 ##
-## Direct simulation of reports without simulated variance
+## Direct simulation of reports WITHOUT simulated variance
 ## 
-## Inputs:
+## Inputs: inp object (from parsing JSON) with
+##         num - # of reports
+##         params - file containing RAPPOR params
+##         varcandidates - list containing # of candidates for each var
+##         numvars - # of vars (>=2 for association)
+##         extra - # of extra candidates for var 1 
+##         
 ##
-## Outputs:
-#
-# ------------------------------------------------------------------------
-DirectSimulationOfReports <- function(inp) {
-  params <- ReadParameterFile(inp$params)
-  # TWO WAY ASSOCIATIONS; INPUTS SIMULATED DIRECTLY
-  
+## Outputs: Runs simulation of two-way association analysis by directly
+##          simulating the counts of one way and two way marginals
+##
+#####################################################################
+DirectSimulationOfReports <- function(inp, verbose = FALSE) {
+  ptm <- proc.time()
+  params <- ReadParameterFile(inp$params)  
   strconstant <- c("string", "option")
   N <- inp$num
   n1 <- inp$varcandidates[[1]]
@@ -268,13 +262,13 @@ DirectSimulationOfReports <- function(inp) {
   found_strings <- lapply(1:2, function(i)
     Decode(ow_counts[[i]],
            map[[i]]$rmap,
-           params, quick = TRUE)$fit$strings)
+           params, quick = TRUE)$fit[,"string"])
   # --------------
   
   rownames(td) <- uvals[[1]][1:n1]  # Don't take into account extras
   colnames(td) <- uvals[[2]]
-  print("TRUE DISTRIBUTION")
-  print(signif(td, 4))
+  PrintIfVerbose("TRUE DISTRIBUTION", verbose)
+  PrintIfVerbose(signif(td, 4), verbose)
   cohorts <- as.matrix(
     apply(as.data.frame(final_part), 1,
           function(count) RandomPartition(count, rep(1, params$m))))
@@ -313,11 +307,11 @@ DirectSimulationOfReports <- function(inp) {
   ed[is.na(ed)] <- 0
   time_taken <- proc.time() - ptm
   
-  print("2 WAY RESULTS")
-  print(signif(ed, 4))
-  print(TVDistance(td, ed, "TV DISTANCE 2 WAY ALGORITHM"))
-  print("PROC.TIME")
-  print(time_taken)
+  PrintIfVerbose("2 WAY RESULTS", verbose)
+  PrintIfVerbose(signif(ed, 4), verbose)
+  PrintIfVerbose(TVDistance(td, ed, "TV DISTANCE 2 WAY ALGORITHM"), verbose)
+  PrintIfVerbose("PROC.TIME", verbose)
+  PrintIfVerbose(time_taken, verbose)
   chisq_td <- chisq.test(td)[1][[1]][[1]]
   chisq_ed <- chisq.test(ed)[1][[1]][[1]]
   if(is.nan(chisq_ed)) {
@@ -339,19 +333,21 @@ DirectSimulationOfReports <- function(inp) {
   write.csv(metrics, file = filename, row.names = FALSE)
 }
 
-# ------------------------------------------------------------------------
+#####################################################################
 ##
 ## Externally provided counts (gen_assoc_counts.R and sum_assoc_reports.py)
-## 2 WAY ASSOCIATION ONLY
+## metrics_filename sets the output file name under inp$outdir (default: metrics.csv)
+## Note: Only for two way associations
 ## 
-## Inputs:
+## Inputs: inp object (from parsing JSON) with
 ##    count files (2 way counts, individual marginal counts)
 ##    map files (2 variables)
+##    params file with RAPPOR params
 ##
-## Outputs:
-#
-# ------------------------------------------------------------------------
-ExternalCounts <- function(inp, new_decode = FALSE) {
+## Outputs: Runs simulation of two-way association analysis reading inputs
+##          from counts, maps, and params file.
+#####################################################################
+ExternalCounts <- function(inp, verbose = FALSE, metrics_filename = "metrics.csv") {
   ptm <- proc.time()
   params <- ReadParameterFile(inp$params)
   # Ensure sufficient maps as required by number of vars
@@ -375,8 +371,8 @@ ExternalCounts <- function(inp, new_decode = FALSE) {
   found_strings = list(fit[[1]][,"string"], fit[[2]][,"string"])
 
   if (length(found_strings[[1]]) == 0 || length(found_strings[[2]]) == 0) {
-    print("FOUND_STRINGS")
-    print(found_strings)
+    PrintIfVerbose("FOUND_STRINGS", verbose)
+    PrintIfVerbose(found_strings, verbose)
     stop("No strings found in 1-way marginal.")
   }
   
@@ -400,9 +396,9 @@ ExternalCounts <- function(inp, new_decode = FALSE) {
   
   time_taken <- proc.time() - ptm
   
-  print(TVDistance(td, ed, "TV DISTANCE 2 WAY"))
-  print("PROC.TIME")
-  print(time_taken)
+  PrintIfVerbose(TVDistance(td, ed, "TV DISTANCE 2 WAY"), verbose)
+  PrintIfVerbose("PROC.TIME", verbose)
+  PrintIfVerbose(time_taken, verbose)
   chisq_td <- chisq.test(td)[1][[1]][[1]]
   chisq_ed <- chisq.test(ed)[1][[1]][[1]]
   if(is.nan(chisq_td)) {
@@ -421,16 +417,12 @@ ExternalCounts <- function(inp, new_decode = FALSE) {
     dim2 = length(found_strings[[2]])
   )
   
-  # Write metrics to metrics.csv
-  if (new_decode == TRUE) {
-    filename <- file.path(inp$outdir, 'metrics_2.csv')
-  } else {
-    filename <- file.path(inp$outdir, 'metrics.csv')
-  }
+  # Write metrics to metrics_filename (default: metrics.csv)
+  filename <- file.path(inp$outdir, metrics_filename)
   write.csv(metrics, file = filename, row.names = FALSE)
 }
 
-# ------------------------------------------------------------------------
+#####################################################################
 ##
 ## Externally provided reports
 ## EM ALGORITHM
@@ -439,9 +431,9 @@ ExternalCounts <- function(inp, new_decode = FALSE) {
 ## Inputs:
 ##    
 ## Outputs:
-#
-# ------------------------------------------------------------------------
-ExternalReportsEM <- function(inp) {
+##
+#####################################################################
+ExternalReportsEM <- function(inp, verbose = FALSE) {
   ptm <- proc.time()
   params <- ReadParameterFile(inp$params)
   # Ensure sufficient maps as required by number of vars
@@ -488,9 +480,9 @@ ExternalReportsEM <- function(inp) {
   td <- td / sum(td)
   time_taken <- proc.time() - ptm
   
-  print(TVDistance(td, em, "TV DISTANCE EM"))
-  print("PROC.TIME")
-  print(time_taken)
+  PrintIfVerbose(TVDistance(td, em, "TV DISTANCE EM"), verbose)
+  PrintIfVerbose("PROC.TIME", verbose)
+  PrintIfVerbose(time_taken, verbose)
   chisq_td <- chisq.test(td)[1][[1]][[1]]
   chisq_ed <- chisq.test(em)[1][[1]][[1]]
   if(is.nan(chisq_td)) {
@@ -516,27 +508,33 @@ ExternalReportsEM <- function(inp) {
 
 main <- function(opts) {
   inp <- fromJSON(opts$inp)
-  
+  verbose_flag <- inp$verbose
   # Choose from a set of experiments to run
   # direct -> direct simulation of reports (without variances)
   # external-counts -> externally supplied counts for 2 way and marginals
   # external-reports -> externally supplied reports 
 
   if("direct" %in% inp$expt) {
-    print("---------- RUNNING EXPERIMENT DIRECT ----------")
-    DirectSimulationOfReports(inp)
+    PrintIfVerbose("Running Experiment Direct", verbose_flag)
+    DirectSimulationOfReports(inp, verbose = verbose_flag)
   } 
   if ("external-counts" %in% inp$expt) {
-    print("---------- RUNNING EXPERIMENT EXT COUNTS ----------")
-    ExternalCounts(inp)  
-  }
-  if ("external-counts-new" %in% inp$expt) {
-    print("---------- RUNNING EXPERIMENT EXT COUNTS ----------")
-    ExternalCounts(inp, new_decode = TRUE)  
+    PrintIfVerbose("Running Experiment Ext Counts", verbose_flag)
+    if ("direct" %in% inp$expt) {
+      # external-counts expt is run to compare results
+      ExternalCounts(inp, verbose = verbose_flag, metrics_filename = "metrics_2.csv")
+    } else {
+      ExternalCounts(inp, verbose = verbose_flag)
+    }
   }
   if ("external-reports-em" %in% inp$expt) {
-    print("---------- RUNNING EXPERIMENT EXT REPORTS ----------")
-    ExternalReportsEM(inp)
+    PrintIfVerbose("Running Experiment Ext Reports", verbose_flag)
+    if (("direct" %in% inp$expt)||("external-counts" %in% inp$expt)) {
+      # external-reports-em expt is run to compare results
+      ExternalCounts(inp, verbose = verbose_flag, metrics_filename = "metrics_2.csv")
+    } else {
+      ExternalCounts(inp, verbose = verbose_flag)
+    }
   }
 }
 
diff --git a/tests/assoctest_spec.py b/tests/assoctest_spec.py
index 3d1642dd..a10847b4 100755
--- a/tests/assoctest_spec.py
+++ b/tests/assoctest_spec.py
@@ -86,13 +86,13 @@
 #            'fizz-small-bool',
 #            'fizz',
 #            'fizz-bool',),
-            'toy',),
-#            'compact-noextra-small',
-#            'loose-noextra-small',
-#            'compact-extra-small',
-#            'loose-extra-small',
-#            'compact-excess-small',
-#            'loose-excess-small',),
+#            'toy',),
+            'compact-noextra-small',
+            'loose-noextra-small',
+            'compact-extra-small',
+            'loose-extra-small',
+            'compact-excess-small',
+            'loose-excess-small',),
 #            'compact-noextra-large',
 #            'loose-noextra-large',
 #            'compact-extra-large',

From 420b6a0c8d747b7a05a8b7b6d02ddcfe5cd0df6e Mon Sep 17 00:00:00 2001
From: Ananth Raghunathan <pseudorandom@google.com>
Date: Wed, 15 Jul 2015 16:59:08 -0700
Subject: [PATCH 48/67] Small fixes, updates to assoctest.sh

---
 assoctest.sh               |  2 +-
 tests/analyze_assoc_expt.R | 10 +++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/assoctest.sh b/assoctest.sh
index e1fe9446..de33c403 100755
--- a/assoctest.sh
+++ b/assoctest.sh
@@ -176,7 +176,7 @@ _run-one-instance() {
     inp['num'] = $num_clients; \
     inp['extras'] = $num_extras; \
     inp['varcandidates'] = [$num_unique_values, $num_unique_values2]; \
-    inp['verbose'] = 'false'; \
+    inp['verbose'] = 'true'; \
     inp['counts'] = ['$instance_dir/case_2way.csv',\
                      '$instance_dir/case_marg1.csv',\
                      '$instance_dir/case_marg2.csv']; \
diff --git a/tests/analyze_assoc_expt.R b/tests/analyze_assoc_expt.R
index bbda4204..cb0a2087 100755
--- a/tests/analyze_assoc_expt.R
+++ b/tests/analyze_assoc_expt.R
@@ -433,7 +433,7 @@ ExternalCounts <- function(inp, verbose = FALSE, metrics_filename = "metrics.csv
 ## Outputs:
 ##
 #####################################################################
-ExternalReportsEM <- function(inp, verbose = FALSE) {
+ExternalReportsEM <- function(inp, verbose = FALSE, metrics_filename = "metrics.csv") {
   ptm <- proc.time()
   params <- ReadParameterFile(inp$params)
   # Ensure sufficient maps as required by number of vars
@@ -501,8 +501,8 @@ ExternalReportsEM <- function(inp, verbose = FALSE) {
     dim2 = dim(em)[[2]]
   )
   
-  # Write metrics to metrics.csv
-  filename <- file.path(inp$outdir, 'metrics_2.csv')
+  # Write metrics to metrics_filename (default: metrics.csv)
+  filename <- file.path(inp$outdir, metrics_filename)
   write.csv(metrics, file = filename, row.names = FALSE)
 }
 
@@ -531,9 +531,9 @@ main <- function(opts) {
     PrintIfVerbose("Running Experiment Ext Reports", verbose_flag)
     if (("direct" %in% inp$expt)||("external-counts" %in% inp$expt)) {
       # external-reports-em expt is run to compare results
-      ExternalCounts(inp, verbose = verbose_flag, metrics_filename = "metrics_2.csv")
+      ExternalReportsEM(inp, verbose = verbose_flag, metrics_filename = "metrics_2.csv")
     } else {
-      ExternalCounts(inp, verbose = verbose_flag)
+      ExternalReportsEM(inp, verbose = verbose_flag)
     }
   }
 }

From f73aac4bd65391e5435ed10030d5fbab836230b7 Mon Sep 17 00:00:00 2001
From: Ananth Raghunathan <pseudorandom@google.com>
Date: Tue, 21 Jul 2015 09:17:37 -0700
Subject: [PATCH 49/67] Incorporating changes from master

- resolved conflicts
- modified code to use new Encode interface
- modified rappor_assoc_sim.py to use same interface as rappor_sim.py
---
 analysis/R/association.R         |  3 +++
 analysis/R/decode.R              | 43 +++-----------------------------
 analysis/tools/sum_bits_assoc.py |  8 ++++--
 assoctest.sh                     |  8 +++---
 tests/analyze_assoc_expt.R       | 11 ++++----
 tests/assoctest_spec.py          | 10 ++++----
 tests/gen_assoc_reports.R        | 10 +++++---
 tests/rappor_assoc_sim.py        |  8 +++---
 8 files changed, 38 insertions(+), 63 deletions(-)

diff --git a/analysis/R/association.R b/analysis/R/association.R
index 2a19656f..f2d6f59c 100644
--- a/analysis/R/association.R
+++ b/analysis/R/association.R
@@ -355,6 +355,9 @@ ComputeDistributionEM <- function(reports, report_cohorts,
                                   params)
       found_strings[[j]] <- c(found_strings[[j]], "Other")
     }
+    
+    GetCondProb(variable_report[[1]], candidate_strings = rownames(marginal),
+                params = params, map$map[[variable_cohort[1]]], prob_other[[variable_cohort[1]]])
 
     # Get the joint conditional distribution
     cond_report_dist <- lapply(seq(length(variable_report)), function(i) {
diff --git a/analysis/R/decode.R b/analysis/R/decode.R
index 9c37094b..ba9eb9c6 100644
--- a/analysis/R/decode.R
+++ b/analysis/R/decode.R
@@ -227,41 +227,14 @@ FitDistribution <- function(estimates_stds, map, quiet = FALSE) {
   #   according to this vector approximates estimates
 
   S <- ncol(map)  # total number of candidates
-<<<<<<< HEAD
   lasso <- FitLasso(map, as.vector(t(estimates_stds$estimates)))
-=======
-
-  support_coefs <- 1:S
-
-  if (S > length(estimates_stds$estimates) * .8) {
-    # the system is close to being underdetermined
-    lasso <- FitLasso(map, as.vector(t(estimates_stds$estimates)))
->>>>>>> master
-
-    # Select non-zero coefficients.
-    support_coefs <- which(lasso > 0)
+  
+  if(!quiet)
+    cat("LASSO selected ", sum(lasso > 0), " non-zero coefficients.\n")
 
-<<<<<<< HEAD
   names(lasso) <- colnames(map)
   lasso
- }
-=======
-    if(!quiet)
-      cat("LASSO selected ", length(support_coefs), " non-zero coefficients.\n")
-  }
-
-  coefs <- setNames(rep(0, S), colnames(map))
-
-  if(length(support_coefs) > 0) {  # LASSO may return an empty list
-    constrained_coefs <- ConstrainedLinModel(map[, support_coefs, drop = FALSE],
-                                             estimates_stds)
-
-    coefs[support_coefs] <- constrained_coefs
-  }
-
-  coefs
 }
->>>>>>> master
 
 Resample <- function(e) {
   # Simulate resampling of the Bloom filter estimates by adding Gaussian noise
@@ -302,13 +275,9 @@ Decode <- function(counts, map, params, alpha = 0.05, quick = FALSE,
   coefs_all <- vector()
   # Run the fitting procedure several times (5 seems to be sufficient and not
   # too many) to estimate standard deviation of the output.
-<<<<<<< HEAD
   if(quick) {num_reps <- 2} else {num_reps <- 5}
   for(r in 1:num_reps)
   {
-=======
-  for(r in 1:5) {
->>>>>>> master
     if(r > 1)
       e <- Resample(estimates_stds_filtered)
     else
@@ -359,17 +328,11 @@ Decode <- function(counts, map, params, alpha = 0.05, quick = FALSE,
   fit$prop_std_error <- fit$std_error / N
 
   # 1.96 standard deviations gives 95% confidence interval.
-<<<<<<< HEAD
-  fit$prop_low_95 <- fit$proportion - 1.96 * fit$prop_std_error
-  fit$prop_high_95 <- fit$proportion + 1.96 * fit$prop_std_error
-=======
   low_95 <- fit$proportion - 1.96 * fit$prop_std_error
   high_95 <- fit$proportion + 1.96 * fit$prop_std_error
   # Clamp estimated proportion.  pmin/max: vectorized min and max
   fit$prop_low_95 <- pmax(low_95, 0.0)
   fit$prop_high_95 <- pmin(high_95, 1.0)
-
->>>>>>> master
   fit <- fit[, c("string", "estimate", "std_error", "proportion",
                  "prop_std_error", "prop_low_95", "prop_high_95")]
   allocated_mass <- sum(fit$proportion)
diff --git a/analysis/tools/sum_bits_assoc.py b/analysis/tools/sum_bits_assoc.py
index 9bdd7f95..a858d78f 100755
--- a/analysis/tools/sum_bits_assoc.py
+++ b/analysis/tools/sum_bits_assoc.py
@@ -45,7 +45,7 @@ def SumBits(params, stdin, f_2way, f_1, f_2):
 
   for i, row in enumerate(csv_in):
     try:
-      (user_id, cohort, irr_1, irr_2) = row
+      (_, cohort, irr_1, irr_2) = row
     except ValueError:
       raise RuntimeError('Error parsing row %r' % row)
 
@@ -53,7 +53,11 @@ def SumBits(params, stdin, f_2way, f_1, f_2):
       continue  # skip header
 
     cohort = int(cohort)
-    num_reports[cohort] += 1
+    try:
+      num_reports[cohort] += 1
+    except IndexError:
+      raise RuntimeError('Error indexing cohort number %d (num_cohorts is %d) \
+                         ' % (cohort, num_cohorts))
 
     # TODO: Extend checking for both reports
     if not len(irr_1) == params.num_bloombits:
diff --git a/assoctest.sh b/assoctest.sh
index de33c403..132cd917 100755
--- a/assoctest.sh
+++ b/assoctest.sh
@@ -125,13 +125,13 @@ _run-one-instance() {
     -p $p \
     -q $q \
     -f $f \
-    -i $instance_dir/case.csv \
-    --out-prefix "$instance_dir/case"
+    < $instance_dir/case.csv \
+    > "$instance_dir/case_reports.csv"
 
   analysis/tools/sum_bits_assoc.py \
     $case_dir/case_params.csv \
     "$instance_dir/case" \
-    < $instance_dir/case_out.csv
+    < $instance_dir/case_reports.csv
 
 
   # Setting up JSON file containing assoc_sim inputs with python
@@ -167,7 +167,7 @@ _run-one-instance() {
     inp = dict(); \
     inp['maps'] = ['$case_dir/case_map1.csv',\
                    '$case_dir/case_map2.csv']; \
-    inp['reports'] = '$instance_dir/case_out.csv'; \
+    inp['reports'] = '$instance_dir/case_reports.csv'; \
     inp['truefile'] = '$instance_dir/case.csv'; \
     inp['outdir'] = '$out_dir'; \
     inp['params'] = '$case_dir/case_params.csv'; \
diff --git a/tests/analyze_assoc_expt.R b/tests/analyze_assoc_expt.R
index cb0a2087..37e65426 100755
--- a/tests/analyze_assoc_expt.R
+++ b/tests/analyze_assoc_expt.R
@@ -382,8 +382,8 @@ ExternalCounts <- function(inp, verbose = FALSE, metrics_filename = "metrics.csv
     lapply(map[[i]]$map, function(z) z[,found_strings[[i]], drop = FALSE]))
   crmap <- CombineMaps(pruned[[1]], pruned[[2]])$crmap
   marginal <- Decode2Way(counts[[1]], crmap, params2, fit = fit)$fit
-  td <- read.csv(file = inp$truefile, header = FALSE)
-  td <- table(td[,2:3])
+  td <- read.csv(file = inp$truefile, header = TRUE)
+  td <- table(td[,3:4])
   td <- td / sum(td)
   ed <- td
   for (cols in colnames(td)) {
@@ -443,13 +443,14 @@ ExternalReportsEM <- function(inp, verbose = FALSE, metrics_filename = "metrics.
                params = params))
   
   # Reports must be of the format
-  #     cohort no, rappor bitstring 1, rappor bitstring 2, ...
+  #     client name, cohort no, rappor bitstring 1, rappor bitstring 2, ...
   reportsObj <- read.csv(inp$reports,
-                           colClasses = c("integer", "integer",
+                           colClasses = c("character", "integer",
                                           rep("character", inp$numvars)),
                            header = TRUE)
   # Ignore the first column
   reportsObj <- reportsObj[,-1]
+
   # Parsing reportsObj
   # ComputeDistributionEM allows for different sets of cohorts
   # for each variable. Here, both sets of cohorts are identical
@@ -476,7 +477,7 @@ ExternalReportsEM <- function(inp, verbose = FALSE, metrics_filename = "metrics.
                                       estimate_var = FALSE)
   em <- joint_dist$fit
   td <- read.csv(file = inp$truefile, header = FALSE)
-  td <- table(td[,2:3])
+  td <- table(td[,3:4])
   td <- td / sum(td)
   time_taken <- proc.time() - ptm
   
diff --git a/tests/assoctest_spec.py b/tests/assoctest_spec.py
index a10847b4..c798a5ea 100755
--- a/tests/assoctest_spec.py
+++ b/tests/assoctest_spec.py
@@ -88,11 +88,11 @@
 #            'fizz-bool',),
 #            'toy',),
             'compact-noextra-small',
-            'loose-noextra-small',
-            'compact-extra-small',
-            'loose-extra-small',
-            'compact-excess-small',
-            'loose-excess-small',),
+            'loose-noextra-small',),
+#            'compact-extra-small',
+#            'loose-extra-small',
+#            'compact-excess-small',
+#            'loose-excess-small',),
 #            'compact-noextra-large',
 #            'loose-noextra-large',
 #            'compact-extra-large',
diff --git a/tests/gen_assoc_reports.R b/tests/gen_assoc_reports.R
index 41eb045a..fa83e95d 100755
--- a/tests/gen_assoc_reports.R
+++ b/tests/gen_assoc_reports.R
@@ -14,6 +14,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+# TODO: Rename reports to values (more in line with its usage for histogram
+# RAPPOR)
 source('tests/gen_counts.R')
 
 # Usage:
@@ -69,13 +71,15 @@ main <- function(argv) {
   perm <- sample(N)
   values <- list(values[[1]][perm], values[[2]][perm])
 
-  # Obtain reports by prefixing values with "v"s. Even slower than shuffling.
+  # Prepend with str and opt
   reports <- list(sprintf("str%d", values[[1]]),
                   sprintf("opt%d", values[[2]]))
 
-  reports <- cbind(1:N, reports[[1]], reports[[2]])  # paste together "1 v342"
+  # paste together client name, cohort input, report1, report2
+  reports <- cbind(sprintf("cli%d", 1:N), 1:N, reports[[1]], reports[[2]])
+  colnames(reports) <- c("client", "cohort", "value1", "value2")
 
-  write.table(reports, file = out_file, row.names = FALSE, col.names = FALSE, 
+  write.table(reports, file = out_file, row.names = FALSE, col.names = TRUE, 
               sep = ",", quote = FALSE)
 }
 
diff --git a/tests/rappor_assoc_sim.py b/tests/rappor_assoc_sim.py
index 8d3fed41..b46c8436 100755
--- a/tests/rappor_assoc_sim.py
+++ b/tests/rappor_assoc_sim.py
@@ -119,8 +119,8 @@ def main(argv):
   # instance up front per client, rather than one per row below.
   start_time = time.time()
 
-  for i, (client_str, cohort_str, true_value_1, true_value_2) in
-                                                          enumerate(csv_in):
+  for i, (client_str, cohort_str, true_value_1, 
+          true_value_2) in enumerate(csv_in):
     if i == 0:
       if client_str != 'client':
         raise RuntimeError('Expected client header, got %s' % client_str)
@@ -139,7 +139,7 @@ def main(argv):
       elapsed = time.time() - start_time
       log('Processed %d inputs in %.2f seconds', i, elapsed)
 
-    cohort = int(cohort_str)
+    cohort = int(cohort_str) % params.num_cohorts
     secret = client_str
     e = rappor.Encoder(params, cohort, secret, irr_rand)
 
@@ -150,7 +150,7 @@ def main(argv):
     irr_1_str = rappor.bit_string(irr_1, params.num_bloombits)
     irr_2_str = rappor.bit_string(irr_2, params.num_bloombits)
 
-    out_row = (cohort_str, irr_1_str, irr_2_str)
+    out_row = (client_str, cohort, irr_1_str, irr_2_str)
     csv_out.writerow(out_row)
 
 

From ab783199b32f1a4194cbfa8a459d7493783c87f7 Mon Sep 17 00:00:00 2001
From: Ananth Raghunathan <pseudorandom@google.com>
Date: Tue, 21 Jul 2015 13:02:43 -0700
Subject: [PATCH 50/67] Added a test for gen_assoc_reports.R

Also, some minor refactoring.
---
 assoctest.sh                   |  6 +++++-
 tests/analyze_assoc.R          |  9 +++++++++
 tests/assoc_sim.R              | 12 ++++++++++++
 tests/assoctest_spec.py        |  8 ++++----
 tests/gen_assoc_reports.R      | 32 +++++++++++++++++++++++---------
 tests/gen_assoc_reports_test.R | 34 ++++++++++++++++++++++++++++++++++
 6 files changed, 87 insertions(+), 14 deletions(-)
 create mode 100755 tests/gen_assoc_reports_test.R

diff --git a/assoctest.sh b/assoctest.sh
index 132cd917..f9dc392b 100755
--- a/assoctest.sh
+++ b/assoctest.sh
@@ -115,7 +115,7 @@ _run-one-instance() {
   banner "Generating input"
 
   tests/gen_assoc_reports.R $num_unique_values $num_unique_values2 \
-                            $num_clients $instance_dir/case.csv
+                            $num_clients $num_cohorts $instance_dir/case.csv
 
   banner "Running RAPPOR client"
   tests/rappor_assoc_sim.py \
@@ -135,6 +135,8 @@ _run-one-instance() {
 
 
   # Setting up JSON file containing assoc_sim inputs with python
+  # Currently unused as true values and RAPPOR'd reports are generated
+  # running gen_assoc_reports.R, rappor_assoc_sim.py, and sum_bits_assoc.py
   python -c "import json; \
     f = file('$instance_dir/assoc_inp.json', 'w'); \
     inp = dict(); \
@@ -151,6 +153,8 @@ _run-one-instance() {
     json.dump(inp, f); \
     f.close();"
 
+  # Currently unused as true values and RAPPOR'd reports are generated
+  # running gen_assoc_reports.R, rappor_assoc_sim.py, and sum_bits_assoc.py
   # tests/assoc_sim_expt.R --inp $instance_dir/assoc_inp.json
 
   local out_dir=${instance_dir}_report
diff --git a/tests/analyze_assoc.R b/tests/analyze_assoc.R
index 4e6af972..100d204f 100755
--- a/tests/analyze_assoc.R
+++ b/tests/analyze_assoc.R
@@ -30,6 +30,15 @@
 #         intel   0.1   0.3
 #         google  0.5   0.1
 
+##############################################################################
+##############################################################################
+##############################################################################
+# D E P R E C A T E D
+# Please use analyze_assoc_expt.R to run assoc analysis experiments 
+##############################################################################
+##############################################################################
+##############################################################################
+
 library("optparse")
 
 options(stringsAsFactors = FALSE)
diff --git a/tests/assoc_sim.R b/tests/assoc_sim.R
index c1166bc1..1b1726de 100755
--- a/tests/assoc_sim.R
+++ b/tests/assoc_sim.R
@@ -26,6 +26,18 @@
 #         reports.csv file containing reports
 #         map_{1, 2, ...}.csv file(s) containing maps of variables
 
+##############################################################################
+##############################################################################
+##############################################################################
+# D E P R E C A T E D
+# Please look at workflow to use analyze_assoc_expt.R and 
+# run gen_assoc_reports.R, rappor_assoc_sim.py, and sum_bits_assoc.py
+# to generate inputs to association analysis
+# (For more details, see _run-one-instance() in assoctest.sh)
+##############################################################################
+##############################################################################
+##############################################################################
+
 library("optparse")
 
 options(stringsAsFactors = FALSE)
diff --git a/tests/assoctest_spec.py b/tests/assoctest_spec.py
index c798a5ea..b6952730 100755
--- a/tests/assoctest_spec.py
+++ b/tests/assoctest_spec.py
@@ -87,8 +87,8 @@
 #            'fizz',
 #            'fizz-bool',),
 #            'toy',),
-            'compact-noextra-small',
-            'loose-noextra-small',),
+            'compact-noextra-small',),
+#            'loose-noextra-small',),
 #            'compact-extra-small',
 #            'loose-extra-small',
 #            'compact-excess-small',
@@ -100,8 +100,8 @@
 #            'compact-excess-large',
 #            'loose-excess-large'),
   'blooms': (
-             '8x32',
-             '16x32',),
+             '8x32',),
+#             '16x32',),
   'privacy': (
               'eps_small',
               'eps_chrome',)
diff --git a/tests/gen_assoc_reports.R b/tests/gen_assoc_reports.R
index fa83e95d..e6adb7e6 100755
--- a/tests/gen_assoc_reports.R
+++ b/tests/gen_assoc_reports.R
@@ -16,6 +16,7 @@
 
 # TODO: Rename reports to values (more in line with its usage for histogram
 # RAPPOR)
+
 source('tests/gen_counts.R')
 
 # Usage:
@@ -30,10 +31,12 @@ source('tests/gen_counts.R')
 # Output:
 #   csv file with reports sampled according to the specified distribution. 
 
-main <- function(argv) {
-  n <- list(as.integer(argv[[1]]), as.integer(argv[[2]]))
-  N <- as.integer(argv[[3]])
-  out_file <- argv[[4]]
+GenerateAssocReports <- function(n, N, num_cohorts) {
+  # Inputs: n, a list of supports for vars 1, 2
+  #         N, the number of reports/clients
+  #         num_cohorts, the number of cohorts
+  # Output: tuples of values sampled according to a zipf x zipf distr
+  #         with support n[[1]] and n[[2]] respectively
 
   # Sample values to compute partition
   # Resulting distribution is a correlated zipf x zipf
@@ -70,15 +73,26 @@ main <- function(argv) {
   # Shuffle values randomly (may take a few sec for > 10^8 inputs)
   perm <- sample(N)
   values <- list(values[[1]][perm], values[[2]][perm])
+  cohorts <- rep(1:N) %% num_cohorts
+  list(cohorts = cohorts, values = values)
+}
 
+main <- function(argv) {
+  n <- list(as.integer(argv[[1]]), as.integer(argv[[2]]))
+  N <- as.integer(argv[[3]])
+  num_cohorts <- as.integer(argv[[4]])
+  out_file <- argv[[5]]
+
+  res <- GenerateAssocReports(n, N, num_cohorts)
   # Prepend with str and opt
-  reports <- list(sprintf("str%d", values[[1]]),
-                  sprintf("opt%d", values[[2]]))
+  reports <- list(sprintf("str%d", res$values[[1]]),
+                  sprintf("opt%d", res$values[[2]]))
 
-  # paste together client name, cohort input, report1, report2
-  reports <- cbind(sprintf("cli%d", 1:N), 1:N, reports[[1]], reports[[2]])
-  colnames(reports) <- c("client", "cohort", "value1", "value2")
 
+  # Paste together client name, cohort input, report1, report2
+  reports <- cbind(sprintf("cli%d", 1:N),
+                   res$cohorts, reports[[1]], reports[[2]])
+  colnames(reports) <- c("client", "cohort", "value1", "value2")
   write.table(reports, file = out_file, row.names = FALSE, col.names = TRUE, 
               sep = ",", quote = FALSE)
 }
diff --git a/tests/gen_assoc_reports_test.R b/tests/gen_assoc_reports_test.R
new file mode 100755
index 00000000..10f88c51
--- /dev/null
+++ b/tests/gen_assoc_reports_test.R
@@ -0,0 +1,34 @@
+#!/usr/bin/Rscript
+#
+# gen_reports_test.R
+
+source('analysis/R/util.R')  # Log()
+
+source('tests/gen_assoc_reports.R')  # module under test
+
+library(RUnit)
+
+TestGenerateAssocReports <- function() {
+  # list for support of var1, var2, 
+  # total number of reports
+  # num_cohorts
+  res <- GenerateAssocReports(list(20, 5), 1000, 32)
+  # print(res$values)
+
+  # 1000 reports
+  checkEquals(1000, length(res$values[[1]]))
+
+  # support(var1) <= 20
+  # support(var2) <= 5
+  checkTrue(max(res$values[[1]]) <= 20)
+  checkTrue(max(res$values[[2]]) <= 5)
+
+  # Ensure cohorts are filled up
+  checkEquals(32, length(unique(res$cohort)))
+}
+
+TestAll <- function(){
+  TestGenerateAssocReports()
+}
+
+TestAll()

From feee5d8b3d74d5519b215ed7504df37f9d5886b4 Mon Sep 17 00:00:00 2001
From: Ananth Raghunathan <pseudorandom@google.com>
Date: Tue, 21 Jul 2015 14:19:09 -0700
Subject: [PATCH 51/67] Replaced regtest_spec.py from master branch.

---
 tests/regtest_spec.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/tests/regtest_spec.py b/tests/regtest_spec.py
index 6350ae7a..6774e400 100755
--- a/tests/regtest_spec.py
+++ b/tests/regtest_spec.py
@@ -45,20 +45,14 @@
 BLOOMFILTER_PARAMS = {
     '8x16': (8, 2, 16),  # 16 cohorts, 8 bits each, 2 bits set in each
     '8x32': (8, 2, 32),  # 32 cohorts, 8 bits each, 2 bits set in each
-    '16x32': (16, 2, 32),  # 32 cohorts, 16 bits each, 2 bits set in each
     '8x128': (8, 2, 128),  # 128 cohorts, 8 bits each, 2 bits set in each
     '128x128': (128, 2, 128),  # 8 cohorts, 128 bits each, 2 bits set in each
 }
 
 # 'p, q, f' as in params file.
 PRIVACY_PARAMS = {
-    'eps_zero': (0, 0.99, 0),  # testing purposes only!
     'eps_1_1': (0.39, 0.61, 0.45),  # eps_1 = 1, eps_inf = 5:
     'eps_1_5': (0.225, 0.775, 0.0),  # eps_1 = 5, no eps_inf
-    'eps_verysmall': (0.125, 0.875, 0.125),
-    'eps_small': (0.125, 0.875, 0.5),
-    'eps_chrome': (0.25, 0.75, 0.5),
-    'uma_rappor_type': (0.50, 0.75, 0.5),
 }
 
 # For deriving candidates from true inputs.
@@ -82,6 +76,7 @@
 # END TEST CONFIGURATION
 #
 
+
 def main(argv):
   rows = []
 

From 7936fc9faed65267cf3d398ce7c3fac7fb379e2e Mon Sep 17 00:00:00 2001
From: Ananth Raghunathan <pseudorandom@google.com>
Date: Tue, 21 Jul 2015 14:25:46 -0700
Subject: [PATCH 52/67] Moving deprecated code to experimental directory

---
 {tests => experimental/assoc}/analyze_assoc.R  | 0
 {tests => experimental/assoc}/assoc_sim.R      | 0
 {tests => experimental/assoc}/assoc_sim_expt.R | 0
 3 files changed, 0 insertions(+), 0 deletions(-)
 rename {tests => experimental/assoc}/analyze_assoc.R (100%)
 rename {tests => experimental/assoc}/assoc_sim.R (100%)
 rename {tests => experimental/assoc}/assoc_sim_expt.R (100%)

diff --git a/tests/analyze_assoc.R b/experimental/assoc/analyze_assoc.R
similarity index 100%
rename from tests/analyze_assoc.R
rename to experimental/assoc/analyze_assoc.R
diff --git a/tests/assoc_sim.R b/experimental/assoc/assoc_sim.R
similarity index 100%
rename from tests/assoc_sim.R
rename to experimental/assoc/assoc_sim.R
diff --git a/tests/assoc_sim_expt.R b/experimental/assoc/assoc_sim_expt.R
similarity index 100%
rename from tests/assoc_sim_expt.R
rename to experimental/assoc/assoc_sim_expt.R

From 3deceee1f84a26072cc32beab2278ad784dea135 Mon Sep 17 00:00:00 2001
From: Ananth Raghunathan <pseudorandom@google.com>
Date: Thu, 23 Jul 2015 09:48:45 -0700
Subject: [PATCH 53/67] A few fixes from code review.

- uncommented experimental code in decode2way and documented it
- renamed function that processes assoc maps
- deleted params.csv
---
 analysis/R/decode2way.R    | 87 ++++++++++++++++++--------------------
 analysis/R/read_input.R    |  2 +-
 assoctest.sh               |  2 +-
 tests/analyze_assoc_expt.R | 14 +++---
 tests/params.csv           |  2 -
 5 files changed, 52 insertions(+), 55 deletions(-)
 delete mode 100644 tests/params.csv

diff --git a/analysis/R/decode2way.R b/analysis/R/decode2way.R
index 63bb8f69..600c124e 100644
--- a/analysis/R/decode2way.R
+++ b/analysis/R/decode2way.R
@@ -32,7 +32,7 @@ EstimateBloomCounts2Way <- function(params, obs_counts) {
   #            q - P(IRR = 1 | PRR = 1)
   #            f - Proportion of bits in the Bloom filter that are set randomly
   #                to 0 or 1 regardless of the underlying true bit value
-  #    obs_counts: a matrix of size m by (4k**2 + 1). Column one contains sample
+  #    obs_counts: a matrix of size m by (4k^2 + 1). Column one contains sample
   #                sizes for each cohort. Other counts indicated how many times
   #                pairs of bits {11, 10, 01, 00} were set across the two
   #                reports (in a "1st report"-major order)
@@ -104,50 +104,45 @@ FitDistribution2Way <- function(estimates_stds, map,
   lsei(A = X, B = Y, G = G, H = H, type = 2)$X
 }
 
-# FitDistribution2Way <- function(estimates_stds, map, fit) {
-#   # Find a distribution over rows of map that approximates estimates_stds best
-#   #
-#   # Input:
-#   #   estimates_stds: a list of two m x k matrices, one for estimates, another
-#   #                   for standard errors
-#   #   map           : an (m * k) x S boolean matrix
-#   #
-#   # Output:
-#   #   a float vector of length S, so that a distribution over map's rows sampled
-#   #   according to this vector approximates estimates
-#   
-#   X <- as.matrix(map)
-#   Y <- as.vector(t(estimates_stds$estimates))
-#   m <- dim(X)[1]
-#   n <- dim(X)[2]
-#   wt <- 10000  # weight to marginal constraints
-#   
-#   G <- rbind2(Diagonal(n), rep(-1, n))
-#   H <- c(rep(0, n), -1)
-#   
-#   # Adding marginals constraints to X and Y
-#   fstrs <- lapply(fit, function(x) x[,"string"])  # found strings
-#   
-#   Y <- c(Y, wt * t(fit[[1]]["proportion"]), wt * t(fit[[2]]["proportion"]))
-#   
-#   for (strs in fstrs[[1]]) {
-#     indices <- which(colnames(map) %in% outer(strs,
-#                                               fstrs[[2]],
-#                                               function(x, y) paste(x, y, sep = "x")))
-#     vec <- rep(0, n)
-#     vec[indices] <- wt
-#     X <- rbind2(X, vec)
-#   }
-#   for (strs in fstrs[[2]]) {
-#     indices <- which(colnames(map) %in% outer(fstrs[[1]],
-#                                               strs,
-#                                               function(x, y) paste(x, y, sep = "x")))
-#     vec <- rep(0, n)
-#     vec[indices] <- wt
-#     X <- rbind2(X, vec)
-#   }
-#   
-#   lsei(A = X, B = Y, G = G, H = H, type = 2)$X
+FitDistribution2WayAdditionalConstraints <- function(estimates_stds, map, fit) {
+  # Experimental code
+  # Computes the same output as FitDistribution by 
+  # additionally throwing in constraints corresponding to
+  # 1-way marginals
+  # Requires non-NULL fit as input (with "proportion" containing marginal info)
+
+  X <- as.matrix(map)
+  Y <- as.vector(t(estimates_stds$estimates))
+  m <- dim(X)[1]
+  n <- dim(X)[2]
+  wt <- 10000 #  weight to marginal constraints
+  
+  G <- rbind2(Diagonal(n), rep(-1, n))
+  H <- c(rep(0, n), -1)
+  
+  # Adding marginals constraints to X and Y
+  fstrs <- lapply(fit, function(x) x[,"string"]) #  found strings
+  
+  Y <- c(Y, wt * t(fit[[1]]["proportion"]), wt * t(fit[[2]]["proportion"]))
+  
+  for (strs in fstrs[[1]]) {
+    indices <- which(colnames(map) %in% outer(strs,
+                                    fstrs[[2]],
+                                    function(x, y) paste(x, y, sep = "x")))
+    vec <- rep(0, n)
+    vec[indices] <- wt
+    X <- rbind2(X, vec)
+  }
+  for (strs in fstrs[[2]]) {
+    indices <- which(colnames(map) %in% outer(fstrs[[1]],
+                                    strs,
+                                    function(x, y) paste(x, y, sep = "x")))
+    vec <- rep(0, n)
+    vec[indices] <- wt
+    X <- rbind2(X, vec)
+  }
+  
+  lsei(A = X, B = Y, G = G, H = H, type = 2)$X
   
   # Random projection params
   #   size <- 10 * n
@@ -163,7 +158,7 @@ FitDistribution2Way <- function(estimates_stds, map,
   #   G <- rbind2(Diagonal(nproj), rep(-1, nproj))
   #   H <- c(rep(0, nproj), -1)
   #   lsei(A = Xproj, B = Yproj, G = G, H = H, type = 2)$X
-# }
+}
 
 Decode2Way <- function(counts, map, params, fit = NULL) {
   k <- params$k
diff --git a/analysis/R/read_input.R b/analysis/R/read_input.R
index 35c2cead..051b35c4 100644
--- a/analysis/R/read_input.R
+++ b/analysis/R/read_input.R
@@ -110,7 +110,7 @@ ReadMapFile <- function(map_file, params = NULL, quote = "") {
 #             object map$map
 #             This is the expected object from ReadMapFile
 #       params = data field with parameters
-ProcessMap <- function(map, params) {
+CorrectMapForAssoc <- function(map, params) {
   map$rmap <- map$map
   map$map <- lapply(1:params$m, function(i)
     map$rmap[seq(from = ((i - 1) * params$k + 1),
diff --git a/assoctest.sh b/assoctest.sh
index f9dc392b..74e800f5 100755
--- a/assoctest.sh
+++ b/assoctest.sh
@@ -184,7 +184,7 @@ _run-one-instance() {
     inp['counts'] = ['$instance_dir/case_2way.csv',\
                      '$instance_dir/case_marg1.csv',\
                      '$instance_dir/case_marg2.csv']; \
-    inp['expt'] = ['external-counts', 'external-reports-em']; \
+    inp['expt'] = ['external-counts']; \
     json.dump(inp, f); \
     f.close();"
 
diff --git a/tests/analyze_assoc_expt.R b/tests/analyze_assoc_expt.R
index 37e65426..86dad21e 100755
--- a/tests/analyze_assoc_expt.R
+++ b/tests/analyze_assoc_expt.R
@@ -351,10 +351,11 @@ ExternalCounts <- function(inp, verbose = FALSE, metrics_filename = "metrics.csv
   ptm <- proc.time()
   params <- ReadParameterFile(inp$params)
   # Ensure sufficient maps as required by number of vars
+  # Correct map from ReadMapFile() for assoc analysis
   stopifnot(inp$numvars == length(inp$maps))
   map <- lapply(inp$maps, function(o)
-    ProcessMap(ReadMapFile(o, params = params),
-               params = params))
+    CorrectMapForAssoc(ReadMapFile(o, params = params),
+                       params = params))
 
   # (2 way counts, marginal 1 counts, marginal 2 counts)
   counts <- lapply(1:3, function(i) ReadCountsFile(inp$counts[[i]]))
@@ -433,14 +434,17 @@ ExternalCounts <- function(inp, verbose = FALSE, metrics_filename = "metrics.csv
 ## Outputs:
 ##
 #####################################################################
-ExternalReportsEM <- function(inp, verbose = FALSE, metrics_filename = "metrics.csv") {
+ExternalReportsEM <- function(inp,
+                              verbose = FALSE,
+                              metrics_filename = "metrics.csv") {
   ptm <- proc.time()
   params <- ReadParameterFile(inp$params)
   # Ensure sufficient maps as required by number of vars
   stopifnot(inp$numvars == length(inp$maps))
+  # Correct map from ReadMapFile() for assoc analysis
   map <- lapply(inp$maps, function(o)
-    ProcessMap(ReadMapFile(o, params = params),
-               params = params))
+    CorrectMapForAssoc(ReadMapFile(o, params = params),
+                       params = params))
   
   # Reports must be of the format
   #     client name, cohort no, rappor bitstring 1, rappor bitstring 2, ...
diff --git a/tests/params.csv b/tests/params.csv
deleted file mode 100644
index 0dd2c58c..00000000
--- a/tests/params.csv
+++ /dev/null
@@ -1,2 +0,0 @@
-k, h, m, p, q, f
-16, 2, 64, 0.1, 0.9, 0.2

From c0ea8cf3aa40134030c05f55f5a27ca1f6a1b159 Mon Sep 17 00:00:00 2001
From: Ananth Raghunathan <pseudorandom@google.com>
Date: Thu, 23 Jul 2015 10:36:01 -0700
Subject: [PATCH 54/67] Addressing more review comments.

- inverted noise matrix outside loop
- renamed gen_assoc_reports
- added its test to test.sh
- make-summary now shows original dimensions for variables
---
 analysis/R/decode2way.R                       |   4 +-
 assoctest.sh                                  |   2 +-
 test.sh                                       |   2 +
 tests/analyze_assoc_expt.R                    | 548 ------------------
 tests/assoctest.html                          |  20 +-
 ...ssoc_reports.R => gen_true_values_assoc.R} |   9 +-
 ...ts_test.R => gen_true_values_assoc_test.R} |  11 +-
 tests/make_summary_assoc.py                   |  22 +-
 8 files changed, 34 insertions(+), 584 deletions(-)
 delete mode 100755 tests/analyze_assoc_expt.R
 rename tests/{gen_assoc_reports.R => gen_true_values_assoc.R} (93%)
 rename tests/{gen_assoc_reports_test.R => gen_true_values_assoc_test.R} (61%)

diff --git a/analysis/R/decode2way.R b/analysis/R/decode2way.R
index 600c124e..ce52d341 100644
--- a/analysis/R/decode2way.R
+++ b/analysis/R/decode2way.R
@@ -61,6 +61,8 @@ EstimateBloomCounts2Way <- function(params, obs_counts) {
   NoiseMatrix[2,] <- c(p11*p01, p11*p00, p10*p01, p10*p00)
   NoiseMatrix[3,] <- c(p01*p11, p01*p10, p00*p11, p00*p01)
   NoiseMatrix[4,] <- c(p01**2, p00*p01, p01*p00, p00**2)
+  # Invert NoiseMatrix for estimator
+  InvNoiseMatrix <- t(solve(NoiseMatrix))
   
   # Apply the inverse of NoiseMatrix to get an unbiased estimator for
   # the number of times input pairs of bits were seen.
@@ -70,7 +72,7 @@ EstimateBloomCounts2Way <- function(params, obs_counts) {
     inds <- seq(0, (k/4)-1)
     v <- x[-1]
     sapply(inds, function(i){
-      as.vector(t(solve(NoiseMatrix)) %*% v[(i*4 + 1):((i+1)*4)])
+      as.vector(InvNoiseMatrix %*% v[(i*4 + 1):((i+1)*4)])
     })
   })
   
diff --git a/assoctest.sh b/assoctest.sh
index 74e800f5..6fbec00a 100755
--- a/assoctest.sh
+++ b/assoctest.sh
@@ -189,7 +189,7 @@ _run-one-instance() {
     f.close();"
 
   time {
-    tests/analyze_assoc_expt.R --inp $instance_dir/analyze_inp.json
+    tests/compare_assoc.R --inp $instance_dir/analyze_inp.json
   }
 }
 
diff --git a/test.sh b/test.sh
index 37ef0f14..22df91f5 100755
--- a/test.sh
+++ b/test.sh
@@ -112,6 +112,8 @@ r-unit() {
 
   tests/gen_true_values_test.R
 
+  tests/gen_true_values_assoc_test.R
+
   analysis/R/decode_test.R
 
   analysis/test/run_tests.R
diff --git a/tests/analyze_assoc_expt.R b/tests/analyze_assoc_expt.R
deleted file mode 100755
index 86dad21e..00000000
--- a/tests/analyze_assoc_expt.R
+++ /dev/null
@@ -1,548 +0,0 @@
-#!/usr/bin/env Rscript
-#
-# Copyright 2015 Google Inc. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Reads map files, report files, and RAPPOR parameters to run
-# an EM algorithm to estimate joint distribution over two or more variables
-#
-# Usage:
-#       $ ./analyze_assoc_expt.R --inp <input JSON file>
-#
-# Input file: 
-# Outputs:
-
-library("jsonlite")
-library("optparse")
-
-options(stringsAsFactors = FALSE)
-
-if(!interactive()) {
-  option_list <- list(
-    make_option(c("--inp"), default = "analyze_inp.json",
-                help = "JSON file with inputs for analyze_assoc_expt"))
-  opts <- parse_args(OptionParser(option_list = option_list))
-}
-
-source("analysis/R/decode2way.R")
-source("analysis/R/encode.R")
-source("analysis/R/decode.R")
-source("analysis/R/simulation.R")
-source("analysis/R/read_input.R")
-source("analysis/R/association.R")
-source("tests/gen_counts.R")
-
-# Wrapper function to print strings only if verbose flag is passed in
-PrintIfVerbose <- function(string, flag = FALSE) {
-  if(flag == TRUE) {
-    print(string)
-  }
-}
-
-# TV distance = L1 distance / 2 = 1 - sum(min(df1|x, df2|x)) where
-# df1|x / df2|x projects the distribution to the intersection x of the
-# supports of df1 and df2
-TVDistance <- function(df1, df2, statement = "TV DISTANCE") {
-  rowsi <- intersect(rownames(df1), rownames(df2))
-  colsi <- intersect(colnames(df1), colnames(df2))
-  print(statement)
-  1 - sum(mapply(min, 
-                 unlist(as.data.frame(df1[rowsi, colsi]), use.names = FALSE),
-                 unlist(as.data.frame(df2[rowsi, colsi]), use.names = FALSE)))
-}
-
-# Function to combine reports
-# Currently assume 2-way marginals
-CombineReports <- function(reports1, reports2) {
-  # Encoding (var1, var2) \in {(0, 0), (0, 1), (1, 0), (1, 1)}
-  two_bits <- list(c(0, 0, 0, 1), c(0, 1, 0, 0), c(0, 0, 1, 0), c(1, 0, 0, 0))
-  OuterProd <- function(x, y) {
-    as.vector(outer(x, y,
-                    function(z, t) z + 2 * t))
-  }
-  # "report1-major" order
-  creports <- mapply(OuterProd, reports2, reports1,
-                     SIMPLIFY = FALSE)
-  # Collapse counts to bit vector according to two_bits
-  lapply(creports,
-         function(x) as.vector(sapply(x, function(z) two_bits[[z+1]])))
-}
-
-# Given 2 lists of maps, maps1 and maps2, the function
-# combines the maps by cohort and outputs both
-# cohort-organized maps and flattened versions
-CombineMaps <- function(maps1, maps2) {
-  # Combine maps
-  cmap <- mapply(CombineMapsInternal, maps1, maps2)
-  
-  # Flatten map
-  inds <- lapply(cmap, function(x) which(x, arr.ind = TRUE))
-  for (i in seq(1, length(inds))) {
-    inds[[i]][, 1] <- inds[[i]][, 1] + (i-1) * dim(cmap[[1]])[1]
-  }
-  inds <- do.call("rbind", inds)
-  crmap <- sparseMatrix(inds[, 1], inds[, 2], dims = c(
-    nrow(cmap[[1]]) * length(cmap),
-    ncol(cmap[[1]])))
-  colnames(crmap) <- colnames(cmap[[1]])
-  list(cmap = cmap, crmap = crmap)
-}
-
-# Function to combine maps
-# Using map1-major order for both candidates and bits of the report
-# to be consistent with how CombineReports works
-# Currently assume 2-way marginals
-CombineMapsInternal <- function(map1, map2) {
-  # Retrieve set indices and dimensions
-  rows1 <- which(map1, arr.ind = TRUE)[,1]
-  cols1 <- which(map1, arr.ind = TRUE)[,2]
-  length1 <- dim(map1)[[1]]
-  width1 <- dim(map1)[[2]]
-  rows2 <- which(map2, arr.ind = TRUE)[,1]
-  cols2 <- which(map2, arr.ind = TRUE)[,2]
-  length2 <- dim(map2)[[1]]
-  width2 <- dim(map2)[[2]]
-  
-  # Now process map1
-  map1fn <- function(i, j) {
-    i1 <- seq(1, length2) + ((i-1) * length2)
-    j1 <- seq(1, width2) + ((j-1) * width2)
-    expand.grid(i1, j1)  
-  }
-  map1indices <- do.call(rbind,
-                         mapply(map1fn, rows1, cols1, SIMPLIFY = FALSE))
-  map1_big <- sparseMatrix(map1indices[,"Var1"],
-                           map1indices[,"Var2"],
-                           dims = c(length1 * length2,
-                                    width1 * width2))
-  colnames(map1_big) <- t(outer(colnames(map1),
-                              colnames(map2),
-                              function(x, y) paste(x, y, sep = "x")))
-  
-  # Now process map2
-  map2fn <- function(i, j) {
-    i2 <- i + (seq(0, length1 - 1) * length2)
-    j2 <- j + (seq(0, width1 - 1) * width2)
-    expand.grid(i2, j2)
-  }
-  map2indices <- do.call(rbind,
-                         mapply(map2fn, rows2, cols2, SIMPLIFY = FALSE))
-  map2_big <- sparseMatrix(map2indices[,"Var1"],
-                           map2indices[,"Var2"],
-                           dims = c(length1 * length2,
-                                    width1 * width2))
-  colnames(map2_big) <- t(outer(colnames(map1),
-                              colnames(map2),
-                              function(x, y) paste(x, y, sep = "x")))
-  
-  # Now collate two maps with entries in (1000, 0100, 0010, 0001)
-  # (m1&m2, !m1 & m2, m1 & !m2, !(m1 & m2)) respectively
-  findices <- which(map1_big & map2_big, arr.ind = TRUE)
-  # 1000
-  findices[, 1] <- findices[, 1] * 4 - 3
-  # 0100
-  indices_0100 <- which((!map1_big) & map2_big, arr.ind = TRUE)
-  indices_0100[, 1] <- indices_0100[, 1] * 4 - 2
-  findices <- rbind(findices, indices_0100)
-  # 0010
-  indices_0010 <- which(map1_big & (!map2_big), arr.ind = TRUE)
-  indices_0010[, 1] <- indices_0010[, 1] * 4 - 1
-  findices <- rbind(findices, indices_0010)
-  # 0001
-  indices_0001 <- which((!map1_big) & (!map2_big), arr.ind = TRUE)
-  indices_0001[, 1] <- indices_0001[, 1] * 4
-  findices <- rbind(findices, indices_0001)
-  sm <- sparseMatrix(findices[, 1], findices[, 2],
-                     dims = c(4 * length1 * length2,
-                        width1 * width2))
-  colnames(sm) <- colnames(map1_big)
-  sm
-}
-
-GenerateNoiseMatrix <- function(params) {
-  p <- params$p
-  q <- params$q
-  f <- params$f
-  m <- params$m
-  k <- params$k
-  
-  p11 <- q * (1 - f/2) + p * f / 2  # probability of a true 1 reported as 1
-  p01 <- p * (1 - f/2) + q * f / 2  # probability of a true 0 reported as 1
-  p10 <- 1 - p11  # probability of a true 1 reported as 0
-  p00 <- 1 - p01  # probability of a true 0 reported as 0
-  
-  NoiseMatrix <- matrix(rep(0, 16), 4)
-  NoiseMatrix[1,] <- c(p11**2, p11*p10, p10*p11, p10**2)
-  NoiseMatrix[2,] <- c(p11*p01, p11*p00, p10*p01, p10*p00)
-  NoiseMatrix[3,] <- c(p01*p11, p01*p10, p00*p11, p00*p01)
-  NoiseMatrix[4,] <- c(p01**2, p00*p01, p01*p00, p00**2)
-
-  NoiseMatrix
-}
-
-#####################################################################
-##
-## Direct simulation of reports WITHOUT simulated variance
-## 
-## Inputs: inp object (from parsing JSON) with
-##         num - # of reports
-##         params - file containing RAPPOR params
-##         varcandidates - list containing # of candidates for each var
-##         numvars - # of vars (>=2 for association)
-##         extra - # of extra candidates for var 1 
-##         
-##
-## Outputs: Runs simulation of two-way association analysis by directly
-##          simulating the counts of one way and two way marginals
-##
-#####################################################################
-DirectSimulationOfReports <- function(inp, verbose = FALSE) {
-  ptm <- proc.time()
-  params <- ReadParameterFile(inp$params)  
-  strconstant <- c("string", "option")
-  N <- inp$num
-  n1 <- inp$varcandidates[[1]]
-  n2 <- inp$varcandidates[[2]]
-  
-  # Construct unique vals for each variable using strconstant
-  stopifnot(length(strconstant) == inp$numvars)
-  uvals <- lapply(1:inp$numvars,
-                  function(i) {
-                    apply(as.matrix(1:inp$varcandidates[[i]]),
-                          1,
-                          function(z) sprintf("%s%d", strconstant[[i]], z))
-                  })
-  
-  # Add extras if any
-  if(inp$extras > 0) {
-    uvals[[1]] <- c(uvals[[1]], apply(as.matrix(1:inp$extras), 1,
-                                      function(z) sprintf("%s%d", strconstant[[1]], z + n1)))
-  }
-  
-  # Compute map
-  map <- lapply(uvals, function(u) CreateMap(u, params))
-  
-  # Trim maps to real # of candidates
-  # Use extras only for decoding
-  tmap <- lapply(map[[1]]$map, function(i) i[, 1:n1])
-  crmap_trimmed <- CombineMaps(tmap, map[[2]]$map)$crmap
-  
-  # Sample values to compute partition
-  # Zipfian over n1 strings
-  v1_part <- RandomPartition(N, ComputePdf("zipf1.5", n1))
-  # Zipfian over n2 strings for each of variable 1
-  # Distr. are correlated as in assoc_sim.R
-  final_part <- as.vector(sapply(1:n1,
-                                 function(i) {
-                                   v2_part <- RandomPartition(v1_part[[i]],
-                                                              ComputePdf("zipf1.5", n2))
-                                   if (i %% 2 == 0) {v2_part} else {rev(v2_part)}
-                                 }))
-  
-  td <- matrix(final_part/sum(final_part), nrow = n1, ncol = n2, byrow = TRUE)
-  v2_part <- RandomPartition(N, apply(td, 2, sum))
-  ow_parts <- list(v1_part, v2_part)
-  ow_parts[[1]] <- c(ow_parts[[1]], rep(0, inp$extra))
-  
-  # --------------
-  # Generate 1-way counts
-  ow_counts <- lapply(1:2, function(i)
-    GenerateCounts(params, map[[i]]$rmap, ow_parts[[i]], 1))
-  found_strings <- lapply(1:2, function(i)
-    Decode(ow_counts[[i]],
-           map[[i]]$rmap,
-           params, quick = TRUE)$fit[,"string"])
-  # --------------
-  
-  rownames(td) <- uvals[[1]][1:n1]  # Don't take into account extras
-  colnames(td) <- uvals[[2]]
-  PrintIfVerbose("TRUE DISTRIBUTION", verbose)
-  PrintIfVerbose(signif(td, 4), verbose)
-  cohorts <- as.matrix(
-    apply(as.data.frame(final_part), 1,
-          function(count) RandomPartition(count, rep(1, params$m))))
-  expanded <- apply(cohorts, 2, function(vec) rep(vec, each = ((params$k)**2)*4))
-  true_ones <- apply(expanded * crmap_trimmed, 1, sum)
-  
-  NoiseMatrix <- GenerateNoiseMatrix(params)
-  after_noise <- as.vector(sapply(1:(length(true_ones)/4), 
-                                  function(x) 
-                                    t(NoiseMatrix) %*% true_ones[((x-1)*4+1):(x*4)]))
-  counts <- cbind(apply(cohorts, 1, sum),
-                  matrix(after_noise,
-                         nrow = params$m,
-                         ncol = 4 * (params$k**2),
-                         byrow = TRUE))
-  
-  params2 <- params
-  params2$k <- (params$k ** 2) * 4
-  
-  # Combine maps to feed into Decode2Way
-  # Prune first to found_strings from Decode on 1-way counts
-  pruned <- lapply(1:2, function(i)
-    lapply(map[[i]]$map, function(z) z[,found_strings[[i]]]))
-  crmap <- CombineMaps(pruned[[1]], pruned[[2]])$crmap
-  marginal <- Decode2Way(counts, crmap, params2)$fit
-  
-  # Fill in estimated results with rows and cols from td
-  ed <- matrix(0, nrow = (n1+inp$extra), ncol = n2)
-  rownames(ed) <- uvals[[1]]
-  colnames(ed) <- uvals[[2]]
-  for (cols in colnames(td)) {
-    for (rows in rownames(td)) {
-      ed[rows, cols] <- marginal[paste(rows, cols, sep = "x"), "Estimate"]
-    }
-  }
-  ed[is.na(ed)] <- 0
-  time_taken <- proc.time() - ptm
-  
-  PrintIfVerbose("2 WAY RESULTS", verbose)
-  PrintIfVerbose(signif(ed, 4), verbose)
-  PrintIfVerbose(TVDistance(td, ed, "TV DISTANCE 2 WAY ALGORITHM"), verbose)
-  PrintIfVerbose("PROC.TIME", verbose)
-  PrintIfVerbose(time_taken, verbose)
-  chisq_td <- chisq.test(td)[1][[1]][[1]]
-  chisq_ed <- chisq.test(ed)[1][[1]][[1]]
-  if(is.nan(chisq_ed)) {
-    chisq_ed <- 0
-  }
-  if(is.nan(chisq_td)) {
-    chisq_td <- 0
-  }
-  
-  metrics <- list(
-    td_chisq = chisq_td,
-    ed_chisq = chisq_ed,
-    tv = TVDistance(td, ed, ""),
-    time = time_taken[1],
-    dim1 = length(found_strings[[1]]),
-    dim2 = length(found_strings[[2]])
-  )
-  filename <- file.path(inp$outdir, 'metrics.csv')
-  write.csv(metrics, file = filename, row.names = FALSE)
-}
-
-#####################################################################
-##
-## Externally provided counts (gen_assoc_counts.R and sum_assoc_reports.py)
-## new_decode flag allows you to switch between two decode algorithm choices
-## Note: Only for two way associations
-## 
-## Inputs: inp object (from parsing JSON) with
-##    count files (2 way counts, individual marginal counts)
-##    map files (2 variables)
-##    params file with RAPPOR params
-##
-## Outputs: Runs simulation of two-way association analysis reading inputs
-##          from counts, maps, and params file.
-#####################################################################
-ExternalCounts <- function(inp, verbose = FALSE, metrics_filename = "metrics.csv") {
-  ptm <- proc.time()
-  params <- ReadParameterFile(inp$params)
-  # Ensure sufficient maps as required by number of vars
-  # Correct map from ReadMapFile() for assoc analysis
-  stopifnot(inp$numvars == length(inp$maps))
-  map <- lapply(inp$maps, function(o)
-    CorrectMapForAssoc(ReadMapFile(o, params = params),
-                       params = params))
-
-  # (2 way counts, marginal 1 counts, marginal 2 counts)
-  counts <- lapply(1:3, function(i) ReadCountsFile(inp$counts[[i]]))
-  
-  params2 <- params
-  params2$k <- (params$k ** 2) * 4
-  
-  # Prune candidates
-  fit <- lapply(1:2, function(i)
-    Decode(counts[[i + 1]],
-           map[[i]]$rmap,
-           params, quick = FALSE)$fit)
-  
-  found_strings = list(fit[[1]][,"string"], fit[[2]][,"string"])
-
-  if (length(found_strings[[1]]) == 0 || length(found_strings[[2]]) == 0) {
-    PrintIfVerbose("FOUND_STRINGS", verbose)
-    PrintIfVerbose(found_strings, verbose)
-    stop("No strings found in 1-way marginal.")
-  }
-  
-  # Combine maps to feed into Decode2Way
-  # Prune first to found_strings from Decode on 1-way counts
-  pruned <- lapply(1:2, function(i)
-    lapply(map[[i]]$map, function(z) z[,found_strings[[i]], drop = FALSE]))
-  crmap <- CombineMaps(pruned[[1]], pruned[[2]])$crmap
-  marginal <- Decode2Way(counts[[1]], crmap, params2, fit = fit)$fit
-  td <- read.csv(file = inp$truefile, header = TRUE)
-  td <- table(td[,3:4])
-  td <- td / sum(td)
-  ed <- td
-  for (cols in colnames(td)) {
-    for (rows in rownames(td)) {
-      ed[rows, cols] <- marginal[paste(rows, cols, sep = "x"), "Estimate"]
-    }
-  }
-  ed[is.na(ed)] <- 0
-  ed[ed<0] <- 0
-  
-  time_taken <- proc.time() - ptm
-  
-  PrintIfVerbose(TVDistance(td, ed, "TV DISTANCE 2 WAY"), verbose)
-  PrintIfVerbose("PROC.TIME", verbose)
-  PrintIfVerbose(time_taken, verbose)
-  chisq_td <- chisq.test(td)[1][[1]][[1]]
-  chisq_ed <- chisq.test(ed)[1][[1]][[1]]
-  if(is.nan(chisq_td)) {
-    chisq_td <- 0
-  }
-  if(is.nan(chisq_ed)) {
-    chisq_ed <- 0
-  }
-  
-  metrics <- list(
-    td_chisq = chisq_td,
-    ed_chisq = chisq_ed,
-    tv = TVDistance(td, ed, ""),
-    time = time_taken[1],
-    dim1 = length(found_strings[[1]]),
-    dim2 = length(found_strings[[2]])
-  )
-  
-  # Write metrics to metrics_filename (default: metrics.csv)
-  filename <- file.path(inp$outdir, metrics_filename)
-  write.csv(metrics, file = filename, row.names = FALSE)
-}
-
-#####################################################################
-##
-## Externally provided reports
-## EM ALGORITHM
-## TODO: Also support 3 way association
-## 
-## Inputs:
-##    
-## Outputs:
-##
-#####################################################################
-ExternalReportsEM <- function(inp,
-                              verbose = FALSE,
-                              metrics_filename = "metrics.csv") {
-  ptm <- proc.time()
-  params <- ReadParameterFile(inp$params)
-  # Ensure sufficient maps as required by number of vars
-  stopifnot(inp$numvars == length(inp$maps))
-  # Correct map from ReadMapFile() for assoc analysis
-  map <- lapply(inp$maps, function(o)
-    CorrectMapForAssoc(ReadMapFile(o, params = params),
-                       params = params))
-  
-  # Reports must be of the format
-  #     client name, cohort no, rappor bitstring 1, rappor bitstring 2, ...
-  reportsObj <- read.csv(inp$reports,
-                           colClasses = c("character", "integer",
-                                          rep("character", inp$numvars)),
-                           header = TRUE)
-  # Ignore the first column
-  reportsObj <- reportsObj[,-1]
-
-  # Parsing reportsObj
-  # ComputeDistributionEM allows for different sets of cohorts
-  # for each variable. Here, both sets of cohorts are identical
-  co <- as.list(reportsObj[1])[[1]]
-  co <- co + 1  # 1 indexing
-  cohorts <- rep(list(co), inp$numvars)
-  # Parse reports from reportObj cols 2, 3, ...
-  reports <- lapply(1:inp$numvars, function(x) as.list(reportsObj[x + 1]))
-    
-  # Split strings into bit arrays (as required by assoc analysis)
-  reports <- lapply(1:inp$numvars, function(i) {
-    # apply the following function to each of reports[[1]] and reports[[2]]
-    lapply(reports[[i]][[1]], function(x) {
-      # function splits strings and converts them to numeric values
-      # rev needed for endianness
-      rev(as.numeric(strsplit(x, split = "")[[1]]))
-    })
-  })
-    
-  joint_dist <- ComputeDistributionEM(reports, cohorts, map,
-                                      ignore_other = TRUE,
-                                      quick = TRUE,
-                                      params, marginals = NULL,
-                                      estimate_var = FALSE)
-  em <- joint_dist$fit
-  td <- read.csv(file = inp$truefile, header = FALSE)
-  td <- table(td[,3:4])
-  td <- td / sum(td)
-  time_taken <- proc.time() - ptm
-  
-  PrintIfVerbose(TVDistance(td, em, "TV DISTANCE EM"), verbose)
-  PrintIfVerbose("PROC.TIME", verbose)
-  PrintIfVerbose(time_taken, verbose)
-  chisq_td <- chisq.test(td)[1][[1]][[1]]
-  chisq_ed <- chisq.test(em)[1][[1]][[1]]
-  if(is.nan(chisq_td)) {
-    chisq_td <- 0
-  }
-  if(is.nan(chisq_ed)) {
-    chisq_ed <- 0
-  }
-  
-  metrics <- list(
-    td_chisq = chisq_td,
-    ed_chisq = chisq_ed,
-    tv = TVDistance(td, em, ""),
-    time = time_taken[1],
-    dim1 = dim(em)[[1]],
-    dim2 = dim(em)[[2]]
-  )
-  
-  # Write metrics to metrics_filename (default: metrics.csv)
-  filename <- file.path(inp$outdir, metrics_filename)
-  write.csv(metrics, file = filename, row.names = FALSE)
-}
-
-main <- function(opts) {
-  inp <- fromJSON(opts$inp)
-  verbose_flag <- inp$verbose
-  # Choose from a set of experiments to run
-  # direct -> direct simulation of reports (without variances)
-  # external-counts -> externally supplied counts for 2 way and marginals
-  # external-reports -> externally supplied reports 
-
-  if("direct" %in% inp$expt) {
-    PrintIfVerbose("Running Experiment Direct", verbose_flag)
-    DirectSimulationOfReports(inp, verbose = verbose_flag)
-  } 
-  if ("external-counts" %in% inp$expt) {
-    PrintIfVerbose("Running Experiment Ext Counts", verbose_flag)
-    if ("direct" %in% inp$expt) {
-      # external-counts expt is run to compare results
-      ExternalCounts(inp, verbose = verbose_flag, metrics_filename = "metrics_2.csv")
-    } else {
-      ExternalCounts(inp, verbose = verbose_flag)
-    }
-  }
-  if ("external-reports-em" %in% inp$expt) {
-    PrintIfVerbose("Running Experiment Ext Reports", verbose_flag)
-    if (("direct" %in% inp$expt)||("external-counts" %in% inp$expt)) {
-      # external-reports-em expt is run to compare results
-      ExternalReportsEM(inp, verbose = verbose_flag, metrics_filename = "metrics_2.csv")
-    } else {
-      ExternalReportsEM(inp, verbose = verbose_flag)
-    }
-  }
-}
-
-if(!interactive()) {
-  main(opts)
-}
diff --git a/tests/assoctest.html b/tests/assoctest.html
index 0c839c86..e4b23875 100644
--- a/tests/assoctest.html
+++ b/tests/assoctest.html
@@ -25,9 +25,9 @@ <h2>RAPPOR assoctest.sh</h2>
   <table cellspacing="0" cellpadding="5">
     <colgroup>
       <col span="1" class="highlight" />
-      <col span="2" />
+      <col span="4" />
       <col span="6" class="highlight" />
-      <col span="6" />
+      <col span="4" />
     </colgroup>
 
     <thead>
@@ -35,20 +35,22 @@ <h2>RAPPOR assoctest.sh</h2>
         <td>
           Test Case
         </td>
-        <td colspan=2>
+        <td colspan=4>
           Input Params
         </td>
         <td colspan=6>
           RAPPOR Params
         </td>
-        <td colspan=6>
+        <td colspan=4>
           Result Metrics
         </td>
       </tr>
 
       <tr class="explain">
         <td></td>
-        <td colspan=2>
+        <td colspan=4>
+          d1: orig. support(var1)<br />
+          d2: orig. support(var2)<br />
           n: num reports<br/>
           e: num extras<br/>
         </td>
@@ -58,11 +60,9 @@ <h2>RAPPOR assoctest.sh</h2>
           m: cohorts<br/>
           p, q, f: probabilities<br/>
         </td>
-        <td colspan=6>
+        <td colspan=4>
           d1: dimension of var1 solutions. <br />
           d2: dimension of var2 solutions. <br />
-          td_chisq: chisq test on true distr.<br />
-          ed_chisq: chisq test on est. distr.<br />
           tv: tot. var. distance<br/>
           rtime: R runtime<br/>
         </td>
@@ -71,6 +71,8 @@ <h2>RAPPOR assoctest.sh</h2>
       <tr class="subhead">
         <td></td>
 
+        <td>d1</td>
+        <td>d2</td>
         <td>n</td>
         <td>e</td>
 
@@ -83,8 +85,6 @@ <h2>RAPPOR assoctest.sh</h2>
 
         <td>d1</td>
         <td>d2</td>
-        <td>td_chisq</td>
-        <td>ed_chisq</td>
         <td>tv</td>
         <td>rtime</td>
       </tr>
diff --git a/tests/gen_assoc_reports.R b/tests/gen_true_values_assoc.R
similarity index 93%
rename from tests/gen_assoc_reports.R
rename to tests/gen_true_values_assoc.R
index e6adb7e6..779fe398 100755
--- a/tests/gen_assoc_reports.R
+++ b/tests/gen_true_values_assoc.R
@@ -14,14 +14,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# TODO: Rename reports to values (more in line with its usage for histogram
-# RAPPOR)
-
 source('tests/gen_counts.R')
 
 # Usage:
 #
-# $ ./gen_assoc_reports.R 100 20 10000 foo.csv
+# $ ./gen_true_values_assoc.R 100 20 10000 foo.csv
 #
 # Inputs:
 #   size of the distribution's support for var 1
@@ -31,7 +28,7 @@ source('tests/gen_counts.R')
 # Output:
 #   csv file with reports sampled according to the specified distribution. 
 
-GenerateAssocReports <- function(n, N, num_cohorts) {
+GenerateTrueValuesAssoc <- function(n, N, num_cohorts) {
   # Inputs: n, a list of supports for vars 1, 2
   #         N, the number of reports/clients
   #         num_cohorts, the number of cohorts
@@ -83,7 +80,7 @@ main <- function(argv) {
   num_cohorts <- as.integer(argv[[4]])
   out_file <- argv[[5]]
 
-  res <- GenerateAssocReports(n, N, num_cohorts)
+  res <- GenerateTrueValuesAssoc(n, N, num_cohorts)
   # Prepend with str and opt
   reports <- list(sprintf("str%d", res$values[[1]]),
                   sprintf("opt%d", res$values[[2]]))
diff --git a/tests/gen_assoc_reports_test.R b/tests/gen_true_values_assoc_test.R
similarity index 61%
rename from tests/gen_assoc_reports_test.R
rename to tests/gen_true_values_assoc_test.R
index 10f88c51..ebef1e77 100755
--- a/tests/gen_assoc_reports_test.R
+++ b/tests/gen_true_values_assoc_test.R
@@ -4,15 +4,15 @@
 
 source('analysis/R/util.R')  # Log()
 
-source('tests/gen_assoc_reports.R')  # module under test
+source('tests/gen_true_values_assoc.R')  # module under test
 
 library(RUnit)
 
-TestGenerateAssocReports <- function() {
+TestGenerateTrueValuesAssoc <- function() {
   # list for support of var1, var2, 
   # total number of reports
   # num_cohorts
-  res <- GenerateAssocReports(list(20, 5), 1000, 32)
+  res <- GenerateTrueValuesAssoc(list(20, 5), 1000, 32)
   # print(res$values)
 
   # 1000 reports
@@ -25,10 +25,13 @@ TestGenerateAssocReports <- function() {
 
   # Ensure cohorts are filled up
   checkEquals(32, length(unique(res$cohort)))
+
+  # TODO: Add tests to confirm (w.h.p.?) that certain distribution aspects are
+  # as expected (such as the zipfian on marginals)
 }
 
 TestAll <- function(){
-  TestGenerateAssocReports()
+  TestGenerateTrueValuesAssoc()
 }
 
 TestAll()
diff --git a/tests/make_summary_assoc.py b/tests/make_summary_assoc.py
index ad21ea44..f1b0d1ca 100755
--- a/tests/make_summary_assoc.py
+++ b/tests/make_summary_assoc.py
@@ -19,6 +19,8 @@
   <!-- input params -->
   <td></td>
   <td></td>
+  <td></td>
+  <td></td>
 
   <!-- RAPPOR params -->
   <td></td>
@@ -31,8 +33,6 @@
   <!-- Result metrics -->
   <td></td>
   <td></td>
-  <td></td>
-  <td>%(mean_chisqdiff)s</td>
   <td>%(mean_l1d)s</td>
   <td>%(mean_rtime)s</td>
 </tr>
@@ -125,18 +125,14 @@ def MeanOfMeans(dict_of_lists):
 
 
 def ParseSpecFile(spec_filename, empty = False):
-  """Parses the spec (parameters) file.
+  #Parses the spec (parameters) file.
 
-  Returns:
-    An integer and a string. The integer is the number of bogus candidates
-    and the string is parameters in the HTML format.
-  """
   with open(spec_filename) as s:
     spec_row = s.readline().split()
 
-  spec_in_html = ' '.join('<td>%s</td>' % cell for cell in spec_row[3:])
+  spec_in_html = ' '.join('<td>%s</td>' % cell for cell in spec_row[1:])
   if empty == True:
-    spec_in_html = ' '.join('<td></td>' for cell in spec_row[3:])
+    spec_in_html = ' '.join('<td></td>' for cell in spec_row[1:])
 
   return spec_in_html
 
@@ -185,8 +181,6 @@ def ParseMetrics(metrics_file, log_file, italics = False):
   metrics_row_str = [
     '%s' % d1,
     '%s' % d2,
-    '%.3f' % td_chisq,
-    '%.3f' % ed_chisq,
     '%.3f' % l1d,
     str(rtime),
   ]
@@ -249,7 +243,7 @@ def FormatSummaryRow(metrics_lists):
   summary = {
       'name': 'Means',
       'mean_l1d': FormatMeanWithSem(means_with_sem['l1d'], percent=False),
-      'mean_chisqdiff': FormatMeanWithSem(means_with_sem['chisqdiff'], percent=False),
+  #    'mean_chisqdiff': FormatMeanWithSem(means_with_sem['chisqdiff'], percent=False),
       'mean_rtime': FormatMeanWithSem(means_with_sem['rtime']),
   }
   return SUMMARY_ROW % summary
@@ -345,7 +339,7 @@ def main(argv):
 
     # Print summary of test instances
     if(int(test_instance) == num_instances):
-      row_str = ['', '', '', '', 
+      row_str = ['', '',
         '%.3f&plusmn;%.3f' % (Mean(l1d_list), StandardErrorEstimate(l1d_list)),
         '',
       ]
@@ -353,7 +347,7 @@ def main(argv):
               True), ' '.join('<td><b>%s</b></td>' % cell for cell in
                               row_str))
       if (os.path.isfile(metrics_file)):
-        row_str2 = ['', '', '', '', 
+        row_str2 = ['', '',
           '%.3f&plusmn;%.3f' % (Mean(l1d_list2), StandardErrorEstimate(l1d_list2)),
           '',
         ]

From 870ee045e5002b40b98a2dbd3a53f239b9c67bb5 Mon Sep 17 00:00:00 2001
From: Ananth Raghunathan <pseudorandom@google.com>
Date: Thu, 23 Jul 2015 12:41:48 -0700
Subject: [PATCH 55/67] Adding sum_bits_assoc_test and fixing small error in
 assoctest.sh

---
 analysis/tools/sum_bits_assoc.py      |  16 ++--
 analysis/tools/sum_bits_assoc_test.py | 116 ++++++++++++++++++++++++++
 assoctest.sh                          |   2 +-
 3 files changed, 127 insertions(+), 7 deletions(-)
 create mode 100755 analysis/tools/sum_bits_assoc_test.py

diff --git a/analysis/tools/sum_bits_assoc.py b/analysis/tools/sum_bits_assoc.py
index a858d78f..2263b671 100755
--- a/analysis/tools/sum_bits_assoc.py
+++ b/analysis/tools/sum_bits_assoc.py
@@ -31,9 +31,12 @@
 
 def SumBits(params, stdin, f_2way, f_1, f_2):
   csv_in = csv.reader(stdin)
-  csv_out_two_way = csv.writer(open(f_2way, "w"))
-  csv_out_1 = csv.writer(open(f_1, "w"))
-  csv_out_2 = csv.writer(open(f_2, "w"))
+  csv_out_two_way = csv.writer(f_2way)
+  csv_out_1 = csv.writer(f_1)
+  csv_out_2 = csv.writer(f_2)
+#  csv_out_two_way = csv.writer(open(f_2way, "w"))
+#  csv_out_1 = csv.writer(open(f_1, "w"))
+#  csv_out_2 = csv.writer(open(f_2, "w"))
 
   num_cohorts = params.num_cohorts
   num_bloombits = params.num_bloombits
@@ -59,7 +62,6 @@ def SumBits(params, stdin, f_2way, f_1, f_2):
       raise RuntimeError('Error indexing cohort number %d (num_cohorts is %d) \
                          ' % (cohort, num_cohorts))
 
-    # TODO: Extend checking for both reports
     if not len(irr_1) == params.num_bloombits:
       raise RuntimeError(
         "Expected %d bits in report 1, got %r" % 
@@ -118,8 +120,10 @@ def main(argv):
     except rappor.Error as e:
       raise RuntimeError(e)
 
-  SumBits(params, sys.stdin, prefix + "_2way.csv",
-          prefix + "_marg1.csv", prefix + "_marg2.csv")
+  with open(prefix + "_2way.csv", "w") as f_2way:
+    with open(prefix + "_marg1.csv", "w") as f_1:
+      with open(prefix + "_marg2.csv", "w") as f_2:
+        SumBits(params, sys.stdin, f_2way, f_1, f_2)
 
 
 if __name__ == '__main__':
diff --git a/analysis/tools/sum_bits_assoc_test.py b/analysis/tools/sum_bits_assoc_test.py
new file mode 100755
index 00000000..fe37f1ce
--- /dev/null
+++ b/analysis/tools/sum_bits_assoc_test.py
@@ -0,0 +1,116 @@
+#!/usr/bin/python -S
+#
+# Copyright 2015 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+sum_bits_assoc_test.py: Tests for sum_bits_assoc.py
+"""
+
+import cStringIO
+import unittest
+
+import rappor
+import sum_bits_assoc  # module under test
+
+
+# The header doesn't matter
+CSV_IN = """\
+user_id,cohort,irr1,irr2  
+5,1,0011,1010
+5,1,0011,1010
+5,1,0000,0000
+"""
+
+# ###############################
+# EXPECTED_F_2WAY
+#
+# NOTE: bit order is reversed.
+# First row is 65 zeroes
+EXPECTED_F_2WAY = """\
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,\
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0\r
+"""
+
+# Cohort 1
+# Total # of reports
+EXPECTED_F_2WAY = EXPECTED_F_2WAY + "3,"
+
+# Looking at LSBs of both irrs
+# Total # of (11, 01, 10, 00) that appear
+EXPECTED_F_2WAY = EXPECTED_F_2WAY + "0,0,2,1,"
+
+# Report 1-major order. So looking at LSB of irr1 and 2nd LSB of irr2
+EXPECTED_F_2WAY = EXPECTED_F_2WAY + "2,0,0,1,"
+
+# And so on ...
+EXPECTED_F_2WAY = EXPECTED_F_2WAY + "0,0,2,1,"
+EXPECTED_F_2WAY = EXPECTED_F_2WAY + "2,0,0,1,"
+
+# Now moving on to 2nd LSB of irr1
+EXPECTED_F_2WAY = EXPECTED_F_2WAY + ("0,0,2,1,2,0,0,1," * 2)
+
+# Now moving on to 3rd LSB of irr1
+# Note that for 3rd LSB of irr1 and LSB of irr2, there are three 00s
+EXPECTED_F_2WAY = EXPECTED_F_2WAY + ("0,0,0,3,0,2,0,1," * 2)
+# MSB of irr1
+EXPECTED_F_2WAY = EXPECTED_F_2WAY + "0,0,0,3,0,2,0,1," + "0,0,0,3,0,2,0,1"
+
+# end of EXPECTED_F_2WAY
+# ###############################
+
+# NOTE: bit order is reversed.
+EXPECTED_F_1 = """\
+0,0,0,0,0\r
+3,2,2,0,0\r
+"""
+
+# NOTE: bit order is reversed.
+EXPECTED_F_2 = """\
+0,0,0,0,0\r
+3,0,2,0,2\r
+"""
+
+class SumBitsAssocTest(unittest.TestCase):
+
+  def setUp(self):
+    self.params = rappor.Params()
+    self.params.num_bloombits = 4
+    self.params.num_cohorts = 2
+    self.maxDiff = None
+
+  def testSum(self):
+    stdin = cStringIO.StringIO(CSV_IN)
+    f_2way = cStringIO.StringIO()
+    f_1 = cStringIO.StringIO()
+    f_2 = cStringIO.StringIO()
+
+    sum_bits_assoc.SumBits(self.params, stdin, f_2way, f_1, f_2)
+    print f_2way.getvalue()
+    print EXPECTED_F_2WAY
+
+    self.assertMultiLineEqual(EXPECTED_F_1, f_1.getvalue())
+    self.assertMultiLineEqual(EXPECTED_F_2, f_2.getvalue())
+    self.assertMultiLineEqual(EXPECTED_F_2WAY, f_2way.getvalue())
+
+#  def testErrors(self):
+#    stdin = cStringIO.StringIO(TOO_MANY_COLUMNS)
+#    stdout = cStringIO.StringIO()
+#
+#    self.assertRaises(
+#        RuntimeError, sum_bits.SumBits, self.params, stdin, stdout)
+
+
+if __name__ == '__main__':
+  unittest.main()
diff --git a/assoctest.sh b/assoctest.sh
index 6fbec00a..17d02df6 100755
--- a/assoctest.sh
+++ b/assoctest.sh
@@ -114,7 +114,7 @@ _run-one-instance() {
 
   banner "Generating input"
 
-  tests/gen_assoc_reports.R $num_unique_values $num_unique_values2 \
+  tests/gen_true_values_assoc.R $num_unique_values $num_unique_values2 \
                             $num_clients $num_cohorts $instance_dir/case.csv
 
   banner "Running RAPPOR client"

From 964f8a9de7edf48e08895718bc8ff869dac7ba3f Mon Sep 17 00:00:00 2001
From: Ananth Raghunathan <pseudorandom@google.com>
Date: Thu, 23 Jul 2015 12:46:28 -0700
Subject: [PATCH 56/67] Adding sum_bits_assoc_test.py

---
 analysis/tools/sum_bits_assoc_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/analysis/tools/sum_bits_assoc_test.py b/analysis/tools/sum_bits_assoc_test.py
index fe37f1ce..3a5fb208 100755
--- a/analysis/tools/sum_bits_assoc_test.py
+++ b/analysis/tools/sum_bits_assoc_test.py
@@ -65,7 +65,7 @@
 # Note that for 3rd LSB of irr1 and LSB of irr2, there are three 00s
 EXPECTED_F_2WAY = EXPECTED_F_2WAY + ("0,0,0,3,0,2,0,1," * 2)
 # MSB of irr1
-EXPECTED_F_2WAY = EXPECTED_F_2WAY + "0,0,0,3,0,2,0,1," + "0,0,0,3,0,2,0,1"
+EXPECTED_F_2WAY = EXPECTED_F_2WAY + "0,0,0,3,0,2,0,1," + "0,0,0,3,0,2,0,1\r\n"
 
 # end of EXPECTED_F_2WAY
 # ###############################

From a4accc9042a88f05ed6968ce0902d7fdddb338ee Mon Sep 17 00:00:00 2001
From: Ananth Raghunathan <pseudorandom@google.com>
Date: Thu, 23 Jul 2015 13:23:51 -0700
Subject: [PATCH 57/67] Added a couple more tests to sum_bits_assoc_test

---
 analysis/tools/sum_bits_assoc_test.py | 33 +++++++++++++++++++--------
 1 file changed, 24 insertions(+), 9 deletions(-)

diff --git a/analysis/tools/sum_bits_assoc_test.py b/analysis/tools/sum_bits_assoc_test.py
index 3a5fb208..11ac2ff1 100755
--- a/analysis/tools/sum_bits_assoc_test.py
+++ b/analysis/tools/sum_bits_assoc_test.py
@@ -82,6 +82,16 @@
 3,0,2,0,2\r
 """
 
+WRONG_IRR_BITS = """\
+user_id,cohort,irr1,irr2
+cli1,1,00123,11223
+"""
+
+WRONG_COHORT = """\
+user_id,cohort,irr1,irr2
+cli1,3,0011,0001
+"""
+
 class SumBitsAssocTest(unittest.TestCase):
 
   def setUp(self):
@@ -97,19 +107,24 @@ def testSum(self):
     f_2 = cStringIO.StringIO()
 
     sum_bits_assoc.SumBits(self.params, stdin, f_2way, f_1, f_2)
-    print f_2way.getvalue()
-    print EXPECTED_F_2WAY
-
     self.assertMultiLineEqual(EXPECTED_F_1, f_1.getvalue())
     self.assertMultiLineEqual(EXPECTED_F_2, f_2.getvalue())
     self.assertMultiLineEqual(EXPECTED_F_2WAY, f_2way.getvalue())
 
-#  def testErrors(self):
-#    stdin = cStringIO.StringIO(TOO_MANY_COLUMNS)
-#    stdout = cStringIO.StringIO()
-#
-#    self.assertRaises(
-#        RuntimeError, sum_bits.SumBits, self.params, stdin, stdout)
+  def testErrors(self):
+    f_2way = cStringIO.StringIO()
+    f_1 = cStringIO.StringIO()
+    f_2 = cStringIO.StringIO()
+
+    stdin = cStringIO.StringIO(WRONG_IRR_BITS)
+    self.assertRaises(
+        RuntimeError, sum_bits_assoc.SumBits, self.params, stdin,
+        f_2way, f_1, f_2)
+
+    stdin = cStringIO.StringIO(WRONG_COHORT)
+    self.assertRaises(
+        RuntimeError, sum_bits_assoc.SumBits, self.params, stdin,
+        f_2way, f_1, f_2)
 
 
 if __name__ == '__main__':

From e66ffd13f90bbed265b4c38e157dfb79899b317f Mon Sep 17 00:00:00 2001
From: Ananth Raghunathan <pseudorandom@google.com>
Date: Thu, 23 Jul 2015 16:47:08 -0700
Subject: [PATCH 58/67] Adding compare_assoc.R instead of analyze_assoc_expt.R

---
 tests/compare_assoc.R | 548 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 548 insertions(+)
 create mode 100755 tests/compare_assoc.R

diff --git a/tests/compare_assoc.R b/tests/compare_assoc.R
new file mode 100755
index 00000000..86dad21e
--- /dev/null
+++ b/tests/compare_assoc.R
@@ -0,0 +1,548 @@
+#!/usr/bin/env Rscript
+#
+# Copyright 2015 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Reads map files, report files, and RAPPOR parameters to run
+# an EM algorithm to estimate joint distribution over two or more variables
+#
+# Usage:
+#       $ ./compare_assoc.R --inp <input JSON file>
+#
+# Input file: 
+# Outputs:
+
+library("jsonlite")
+library("optparse")
+
+options(stringsAsFactors = FALSE)
+
+if(!interactive()) {
+  option_list <- list(
+    make_option(c("--inp"), default = "analyze_inp.json",
+                help = "JSON file with inputs for analyze_assoc_expt"))
+  opts <- parse_args(OptionParser(option_list = option_list))
+}
+
+source("analysis/R/decode2way.R")
+source("analysis/R/encode.R")
+source("analysis/R/decode.R")
+source("analysis/R/simulation.R")
+source("analysis/R/read_input.R")
+source("analysis/R/association.R")
+source("tests/gen_counts.R")
+
+# Wrapper function: prints string only when flag is exactly TRUE
+PrintIfVerbose <- function(string, flag = FALSE) {
+  if (isTRUE(flag)) {  # isTRUE also copes with a NULL or NA flag
+    print(string)
+  }
+}
+
+# TV distance = L1 distance / 2 = 1 - sum(min(df1|x, df2|x)) where
+# df1|x / df2|x projects the distribution to the intersection x of the
+# supports of df1 and df2. A non-empty statement is printed as a banner.
+TVDistance <- function(df1, df2, statement = "TV DISTANCE") {
+  rowsi <- intersect(rownames(df1), rownames(df2))
+  colsi <- intersect(colnames(df1), colnames(df2))
+  if (nzchar(statement)) print(statement)  # no stray [1] "" for ""
+  1 - sum(mapply(min,
+                 unlist(as.data.frame(df1[rowsi, colsi]), use.names = FALSE),
+                 unlist(as.data.frame(df2[rowsi, colsi]), use.names = FALSE)))
+}
+
+# Combines two lists of reports (one bit vector per entry) into a list of
+# pairwise bit vectors. Currently assumes 2-way marginals.
+CombineReports <- function(reports1, reports2) {
+  # One-hot code for each (var1, var2) in {(0, 0), (0, 1), (1, 0), (1, 1)}
+  one_hot <- list(c(0, 0, 0, 1), c(0, 1, 0, 0), c(0, 0, 1, 0), c(1, 0, 0, 0))
+  # Index into one_hot: bit of report2 plus twice the bit of report1
+  PairIndex <- function(bit2, bit1) bit2 + 2 * bit1
+  Expand <- function(pair_ids) {
+    as.vector(sapply(pair_ids, function(id) one_hot[[id + 1]]))
+  }
+  # "report1-major" order: the bits of report2 vary fastest
+  pair_ids <- mapply(function(r2, r1) as.vector(outer(r2, r1, PairIndex)),
+                     reports2, reports1, SIMPLIFY = FALSE)
+  # Collapse pair indices to a bit vector according to one_hot
+  lapply(pair_ids, Expand)
+}
+
+# Combines two per-cohort lists of maps (maps1, maps2) cohort by cohort.
+# Returns list(cmap = per-cohort combined maps, crmap = one sparse matrix
+# holding the cohort maps stacked row-wise).
+CombineMaps <- function(maps1, maps2) {
+  # One combined map per cohort
+  cmap <- mapply(CombineMapsInternal, maps1, maps2)
+  rows_per_cohort <- dim(cmap[[1]])[1]
+  # Positions of the set entries, rows shifted by each cohort's offset
+  shifted <- lapply(seq(1, length(cmap)), function(i) {
+    pos <- which(cmap[[i]], arr.ind = TRUE)
+    pos[, 1] <- pos[, 1] + (i - 1) * rows_per_cohort
+    pos
+  })
+  stacked <- do.call("rbind", shifted)
+  crmap <- sparseMatrix(stacked[, 1], stacked[, 2],
+                        dims = c(nrow(cmap[[1]]) * length(cmap), ncol(cmap[[1]])))
+  colnames(crmap) <- colnames(cmap[[1]])
+  list(cmap = cmap, crmap = crmap)
+}
+
+# Combines the maps of two variables (one cohort) into a single sparse map
+# Using map1-major order for both candidates and bits of the report
+# to be consistent with how CombineReports works
+# Currently assumes 2-way marginals; columns are named "<cand1>x<cand2>"
+CombineMapsInternal <- function(map1, map2) {
+  # Retrieve set indices and dimensions
+  rows1 <- which(map1, arr.ind = TRUE)[,1]
+  cols1 <- which(map1, arr.ind = TRUE)[,2]
+  length1 <- dim(map1)[[1]]
+  width1 <- dim(map1)[[2]]
+  rows2 <- which(map2, arr.ind = TRUE)[,1]
+  cols2 <- which(map2, arr.ind = TRUE)[,2]
+  length2 <- dim(map2)[[1]]
+  width2 <- dim(map2)[[2]]
+  
+  # Now process map1: a set entry (i, j) fills a length2 x width2 block
+  map1fn <- function(i, j) {
+    i1 <- seq(1, length2) + ((i-1) * length2)
+    j1 <- seq(1, width2) + ((j-1) * width2)
+    expand.grid(i1, j1)  
+  }
+  map1indices <- do.call(rbind,
+                         mapply(map1fn, rows1, cols1, SIMPLIFY = FALSE))
+  map1_big <- sparseMatrix(map1indices[,"Var1"],
+                           map1indices[,"Var2"],
+                           dims = c(length1 * length2,
+                                    width1 * width2))
+  colnames(map1_big) <- t(outer(colnames(map1),
+                              colnames(map2),
+                              function(x, y) paste(x, y, sep = "x")))
+  
+  # Now process map2: a set entry (i, j) repeats at strides length2, width2
+  map2fn <- function(i, j) {
+    i2 <- i + (seq(0, length1 - 1) * length2)
+    j2 <- j + (seq(0, width1 - 1) * width2)
+    expand.grid(i2, j2)
+  }
+  map2indices <- do.call(rbind,
+                         mapply(map2fn, rows2, cols2, SIMPLIFY = FALSE))
+  map2_big <- sparseMatrix(map2indices[,"Var1"],
+                           map2indices[,"Var2"],
+                           dims = c(length1 * length2,
+                                    width1 * width2))
+  colnames(map2_big) <- t(outer(colnames(map1),
+                              colnames(map2),
+                              function(x, y) paste(x, y, sep = "x")))
+  
+  # Now collate two maps with entries in (1000, 0100, 0010, 0001)
+  # (m1 & m2, !m1 & m2, m1 & !m2, !m1 & !m2) respectively
+  findices <- which(map1_big & map2_big, arr.ind = TRUE)
+  # 1000
+  findices[, 1] <- findices[, 1] * 4 - 3
+  # 0100
+  indices_0100 <- which((!map1_big) & map2_big, arr.ind = TRUE)
+  indices_0100[, 1] <- indices_0100[, 1] * 4 - 2
+  findices <- rbind(findices, indices_0100)
+  # 0010
+  indices_0010 <- which(map1_big & (!map2_big), arr.ind = TRUE)
+  indices_0010[, 1] <- indices_0010[, 1] * 4 - 1
+  findices <- rbind(findices, indices_0010)
+  # 0001
+  indices_0001 <- which((!map1_big) & (!map2_big), arr.ind = TRUE)
+  indices_0001[, 1] <- indices_0001[, 1] * 4
+  findices <- rbind(findices, indices_0001)
+  sm <- sparseMatrix(findices[, 1], findices[, 2],
+                     dims = c(4 * length1 * length2,
+                        width1 * width2))
+  colnames(sm) <- colnames(map1_big)
+  sm
+}
+
+GenerateNoiseMatrix <- function(params) {
+  # Returns the 4 x 4 noise matrix for a pair of report bits, built from
+  # params$p, params$q and params$f. Row i holds the probabilities of each
+  # reported pair state given true pair state i, so every row sums to 1.
+  p <- params$p
+  q <- params$q
+  f <- params$f
+  
+  p11 <- q * (1 - f/2) + p * f / 2  # probability of a true 1 reported as 1
+  p01 <- p * (1 - f/2) + q * f / 2  # probability of a true 0 reported as 1
+  p10 <- 1 - p11  # probability of a true 1 reported as 0
+  p00 <- 1 - p01  # probability of a true 0 reported as 0
+  
+  NoiseMatrix <- matrix(rep(0, 16), 4)
+  NoiseMatrix[1,] <- c(p11**2, p11*p10, p10*p11, p10**2)
+  NoiseMatrix[2,] <- c(p11*p01, p11*p00, p10*p01, p10*p00)
+  NoiseMatrix[3,] <- c(p01*p11, p01*p10, p00*p11, p00*p10)  # last: not p00*p01
+  NoiseMatrix[4,] <- c(p01**2, p00*p01, p01*p00, p00**2)
+  NoiseMatrix
+}
+
+#####################################################################
+##
+## Direct simulation of reports WITHOUT simulated variance
+## 
+## Inputs: inp object (from parsing JSON) with
+##         num - # of reports
+##         params - file containing RAPPOR params
+##         varcandidates - list containing # of candidates for each var
+##         numvars - # of vars (must equal 2, the length of strconstant)
+##         extras - # of extra candidates for var 1 (used for decoding only)
+##         outdir - directory that receives metrics.csv
+##
+## Outputs: Runs simulation of two-way association analysis by directly
+##          simulating the counts of one way and two way marginals
+##          and writes the resulting metrics to <outdir>/metrics.csv
+#####################################################################
+DirectSimulationOfReports <- function(inp, verbose = FALSE) {
+  ptm <- proc.time()
+  params <- ReadParameterFile(inp$params)  
+  strconstant <- c("string", "option")
+  N <- inp$num
+  n1 <- inp$varcandidates[[1]]
+  n2 <- inp$varcandidates[[2]]
+  
+  # Construct unique vals for each variable using strconstant
+  stopifnot(length(strconstant) == inp$numvars)
+  uvals <- lapply(1:inp$numvars,
+                  function(i) {
+                    apply(as.matrix(1:inp$varcandidates[[i]]),
+                          1,
+                          function(z) sprintf("%s%d", strconstant[[i]], z))
+                  })
+  
+  # Add extras if any
+  if(inp$extras > 0) {
+    uvals[[1]] <- c(uvals[[1]], apply(as.matrix(1:inp$extras), 1,
+                                      function(z) sprintf("%s%d", strconstant[[1]], z + n1)))
+  }
+  
+  # Compute map
+  map <- lapply(uvals, function(u) CreateMap(u, params))
+  
+  # Trim maps to real # of candidates
+  # Use extras only for decoding (drop = FALSE keeps a matrix when n1 == 1)
+  tmap <- lapply(map[[1]]$map, function(i) i[, 1:n1, drop = FALSE])
+  crmap_trimmed <- CombineMaps(tmap, map[[2]]$map)$crmap
+  
+  # Sample values to compute partition
+  # Zipfian over n1 strings
+  v1_part <- RandomPartition(N, ComputePdf("zipf1.5", n1))
+  # Zipfian over n2 strings for each of variable 1
+  # Distr. are correlated as in assoc_sim.R
+  final_part <- as.vector(sapply(1:n1,
+                                 function(i) {
+                                   v2_part <- RandomPartition(v1_part[[i]],
+                                                              ComputePdf("zipf1.5", n2))
+                                   if (i %% 2 == 0) {v2_part} else {rev(v2_part)}
+                                 }))
+  
+  td <- matrix(final_part/sum(final_part), nrow = n1, ncol = n2, byrow = TRUE)
+  v2_part <- RandomPartition(N, apply(td, 2, sum))
+  ow_parts <- list(v1_part, v2_part)
+  ow_parts[[1]] <- c(ow_parts[[1]], rep(0, inp$extras))
+  
+  # --------------
+  # Generate 1-way counts
+  ow_counts <- lapply(1:2, function(i)
+    GenerateCounts(params, map[[i]]$rmap, ow_parts[[i]], 1))
+  found_strings <- lapply(1:2, function(i)
+    Decode(ow_counts[[i]],
+           map[[i]]$rmap,
+           params, quick = TRUE)$fit[,"string"])
+  # --------------
+  
+  rownames(td) <- uvals[[1]][1:n1]  # Don't take into account extras
+  colnames(td) <- uvals[[2]]
+  PrintIfVerbose("TRUE DISTRIBUTION", verbose)
+  PrintIfVerbose(signif(td, 4), verbose)
+  cohorts <- as.matrix(
+    apply(as.data.frame(final_part), 1,
+          function(count) RandomPartition(count, rep(1, params$m))))
+  expanded <- apply(cohorts, 2, function(vec) rep(vec, each = ((params$k)**2)*4))
+  true_ones <- apply(expanded * crmap_trimmed, 1, sum)
+  
+  NoiseMatrix <- GenerateNoiseMatrix(params)
+  after_noise <- as.vector(sapply(1:(length(true_ones)/4), 
+                                  function(x) 
+                                    t(NoiseMatrix) %*% true_ones[((x-1)*4+1):(x*4)]))
+  counts <- cbind(apply(cohorts, 1, sum),
+                  matrix(after_noise,
+                         nrow = params$m,
+                         ncol = 4 * (params$k**2),
+                         byrow = TRUE))
+  
+  params2 <- params
+  params2$k <- (params$k ** 2) * 4
+  
+  # Combine maps to feed into Decode2Way
+  # Prune first to found_strings from Decode on 1-way counts
+  pruned <- lapply(1:2, function(i)
+    lapply(map[[i]]$map, function(z) z[,found_strings[[i]], drop = FALSE]))
+  crmap <- CombineMaps(pruned[[1]], pruned[[2]])$crmap
+  marginal <- Decode2Way(counts, crmap, params2)$fit
+  
+  # Fill in estimated results with rows and cols from td
+  ed <- matrix(0, nrow = (n1+inp$extras), ncol = n2)
+  rownames(ed) <- uvals[[1]]
+  colnames(ed) <- uvals[[2]]
+  for (cols in colnames(td)) {
+    for (rows in rownames(td)) {
+      ed[rows, cols] <- marginal[paste(rows, cols, sep = "x"), "Estimate"]
+    }
+  }
+  ed[is.na(ed)] <- 0
+  time_taken <- proc.time() - ptm
+  
+  PrintIfVerbose("2 WAY RESULTS", verbose)
+  PrintIfVerbose(signif(ed, 4), verbose)
+  PrintIfVerbose(TVDistance(td, ed, "TV DISTANCE 2 WAY ALGORITHM"), verbose)
+  PrintIfVerbose("PROC.TIME", verbose)
+  PrintIfVerbose(time_taken, verbose)
+  chisq_td <- chisq.test(td)[1][[1]][[1]]
+  chisq_ed <- chisq.test(ed)[1][[1]][[1]]
+  if(is.nan(chisq_ed)) {
+    chisq_ed <- 0
+  }
+  if(is.nan(chisq_td)) {
+    chisq_td <- 0
+  }
+  
+  metrics <- list(
+    td_chisq = chisq_td,
+    ed_chisq = chisq_ed,
+    tv = TVDistance(td, ed, ""),
+    time = time_taken[1],
+    dim1 = length(found_strings[[1]]),
+    dim2 = length(found_strings[[2]])
+  )
+  filename <- file.path(inp$outdir, 'metrics.csv')
+  write.csv(metrics, file = filename, row.names = FALSE)
+}
+
+#####################################################################
+##
+## Externally provided counts (gen_assoc_counts.R and sum_assoc_reports.py)
+## Candidates are first pruned with Decode on the 1-way marginal counts
+## Note: Only for two way associations
+## 
+## Inputs: inp object (from parsing JSON) with
+##    count files (2 way counts, individual marginal counts)
+##    map files (2 variables)
+##    params file with RAPPOR params
+##    truefile with the true values (columns 3 and 4 are tabulated)
+## Outputs: Runs simulation of two-way association analysis reading inputs
+##          from counts, maps, and params file; writes the metrics CSV.
+#####################################################################
+ExternalCounts <- function(inp, verbose = FALSE, metrics_filename = "metrics.csv") {
+  ptm <- proc.time()
+  params <- ReadParameterFile(inp$params)
+  # Ensure sufficient maps as required by number of vars
+  # Correct map from ReadMapFile() for assoc analysis
+  stopifnot(inp$numvars == length(inp$maps))
+  map <- lapply(inp$maps, function(o)
+    CorrectMapForAssoc(ReadMapFile(o, params = params),
+                       params = params))
+
+  # (2 way counts, marginal 1 counts, marginal 2 counts)
+  counts <- lapply(1:3, function(i) ReadCountsFile(inp$counts[[i]]))
+  
+  params2 <- params
+  params2$k <- (params$k ** 2) * 4
+  
+  # Prune candidates
+  fit <- lapply(1:2, function(i)
+    Decode(counts[[i + 1]],
+           map[[i]]$rmap,
+           params, quick = FALSE)$fit)
+  
+  found_strings = list(fit[[1]][,"string"], fit[[2]][,"string"])
+
+  if (length(found_strings[[1]]) == 0 || length(found_strings[[2]]) == 0) {
+    PrintIfVerbose("FOUND_STRINGS", verbose)
+    PrintIfVerbose(found_strings, verbose)
+    stop("No strings found in 1-way marginal.")
+  }
+  
+  # Combine maps to feed into Decode2Way
+  # Prune first to found_strings from Decode on 1-way counts
+  pruned <- lapply(1:2, function(i)
+    lapply(map[[i]]$map, function(z) z[,found_strings[[i]], drop = FALSE]))
+  crmap <- CombineMaps(pruned[[1]], pruned[[2]])$crmap
+  marginal <- Decode2Way(counts[[1]], crmap, params2, fit = fit)$fit
+  td <- read.csv(file = inp$truefile, header = TRUE)
+  td <- table(td[,3:4])
+  td <- td / sum(td)
+  ed <- td
+  for (cols in colnames(td)) {
+    for (rows in rownames(td)) {
+      ed[rows, cols] <- marginal[paste(rows, cols, sep = "x"), "Estimate"]
+    }
+  }
+  ed[is.na(ed)] <- 0
+  ed[ed<0] <- 0
+  
+  time_taken <- proc.time() - ptm
+  
+  PrintIfVerbose(TVDistance(td, ed, "TV DISTANCE 2 WAY"), verbose)
+  PrintIfVerbose("PROC.TIME", verbose)
+  PrintIfVerbose(time_taken, verbose)
+  chisq_td <- chisq.test(td)[1][[1]][[1]]
+  chisq_ed <- chisq.test(ed)[1][[1]][[1]]
+  if(is.nan(chisq_td)) {
+    chisq_td <- 0
+  }
+  if(is.nan(chisq_ed)) {
+    chisq_ed <- 0
+  }
+  
+  metrics <- list(
+    td_chisq = chisq_td,
+    ed_chisq = chisq_ed,
+    tv = TVDistance(td, ed, ""),
+    time = time_taken[1],
+    dim1 = length(found_strings[[1]]),
+    dim2 = length(found_strings[[2]])
+  )
+  
+  # Write metrics to metrics_filename (default: metrics.csv)
+  filename <- file.path(inp$outdir, metrics_filename)
+  write.csv(metrics, file = filename, row.names = FALSE)
+}
+
+#####################################################################
+##
+## Externally provided reports
+## EM ALGORITHM
+## TODO: Also support 3 way association
+## 
+## Inputs: inp object (from parsing JSON) with params, maps, numvars,
+##    reports, truefile and outdir; verbose flag; metrics_filename
+## Outputs: Writes chi-squared, TV distance, timing and dimension metrics
+##    of the EM estimate to <outdir>/<metrics_filename>
+#####################################################################
+ExternalReportsEM <- function(inp,
+                              verbose = FALSE,
+                              metrics_filename = "metrics.csv") {
+  ptm <- proc.time()
+  params <- ReadParameterFile(inp$params)
+  # Ensure sufficient maps as required by number of vars
+  stopifnot(inp$numvars == length(inp$maps))
+  # Correct map from ReadMapFile() for assoc analysis
+  map <- lapply(inp$maps, function(o)
+    CorrectMapForAssoc(ReadMapFile(o, params = params),
+                       params = params))
+  
+  # Reports must be of the format
+  #     client name, cohort no, rappor bitstring 1, rappor bitstring 2, ...
+  reportsObj <- read.csv(inp$reports,
+                           colClasses = c("character", "integer",
+                                          rep("character", inp$numvars)),
+                           header = TRUE)
+  # Ignore the first column
+  reportsObj <- reportsObj[,-1]
+
+  # Parsing reportsObj
+  # ComputeDistributionEM allows for different sets of cohorts
+  # for each variable. Here, both sets of cohorts are identical
+  co <- as.list(reportsObj[1])[[1]]
+  co <- co + 1  # 1 indexing
+  cohorts <- rep(list(co), inp$numvars)
+  # Parse reports from reportObj cols 2, 3, ...
+  reports <- lapply(1:inp$numvars, function(x) as.list(reportsObj[x + 1]))
+    
+  # Split strings into bit arrays (as required by assoc analysis)
+  reports <- lapply(1:inp$numvars, function(i) {
+    # apply the following function to each of reports[[1]] and reports[[2]]
+    lapply(reports[[i]][[1]], function(x) {
+      # function splits strings and converts them to numeric values
+      # rev needed for endianness
+      rev(as.numeric(strsplit(x, split = "")[[1]]))
+    })
+  })
+    
+  joint_dist <- ComputeDistributionEM(reports, cohorts, map,
+                                      ignore_other = TRUE,
+                                      quick = TRUE,
+                                      params, marginals = NULL,
+                                      estimate_var = FALSE)
+  em <- joint_dist$fit
+  td <- read.csv(file = inp$truefile, header = TRUE)  # as in ExternalCounts
+  td <- table(td[,3:4])
+  td <- td / sum(td)
+  time_taken <- proc.time() - ptm
+  
+  PrintIfVerbose(TVDistance(td, em, "TV DISTANCE EM"), verbose)
+  PrintIfVerbose("PROC.TIME", verbose)
+  PrintIfVerbose(time_taken, verbose)
+  chisq_td <- chisq.test(td)[1][[1]][[1]]
+  chisq_ed <- chisq.test(em)[1][[1]][[1]]
+  if(is.nan(chisq_td)) {
+    chisq_td <- 0
+  }
+  if(is.nan(chisq_ed)) {
+    chisq_ed <- 0
+  }
+  
+  metrics <- list(
+    td_chisq = chisq_td,
+    ed_chisq = chisq_ed,
+    tv = TVDistance(td, em, ""),
+    time = time_taken[1],
+    dim1 = dim(em)[[1]],
+    dim2 = dim(em)[[2]]
+  )
+  
+  # Write metrics to metrics_filename (default: metrics.csv)
+  filename <- file.path(inp$outdir, metrics_filename)
+  write.csv(metrics, file = filename, row.names = FALSE)
+}
+
+main <- function(opts) {
+  inp <- fromJSON(opts$inp)
+  verbose_flag <- inp$verbose
+  # Choose from a set of experiments to run
+  # direct -> direct simulation of reports (without variances)
+  # external-counts -> externally supplied counts for 2 way and marginals
+  # external-reports-em -> externally supplied reports, decoded with EM
+  run_direct <- "direct" %in% inp$expt
+  run_counts <- "external-counts" %in% inp$expt
+  # The first experiment writes metrics.csv and the n-th (n > 1) writes
+  # metrics_<n>.csv, so runs made for comparison never overwrite each other.
+  MetricsFile <- function(num_earlier) {
+    if (num_earlier == 0) return("metrics.csv")
+    sprintf("metrics_%d.csv", num_earlier + 1)
+  }
+
+  if (run_direct) {
+    # DirectSimulationOfReports always writes metrics.csv
+    PrintIfVerbose("Running Experiment Direct", verbose_flag)
+    DirectSimulationOfReports(inp, verbose = verbose_flag)
+  }
+  if (run_counts) {
+    PrintIfVerbose("Running Experiment Ext Counts", verbose_flag)
+    ExternalCounts(inp, verbose = verbose_flag,
+                   metrics_filename = MetricsFile(run_direct))
+  }
+  if ("external-reports-em" %in% inp$expt) {
+    PrintIfVerbose("Running Experiment Ext Reports", verbose_flag)
+    ExternalReportsEM(inp, verbose = verbose_flag,
+                      metrics_filename = MetricsFile(run_direct + run_counts))
+  }
+}
+
+if(!interactive()) {
+  main(opts)
+}

From 75120b98390e43869f6338aa1bcda1fc8d090d17 Mon Sep 17 00:00:00 2001
From: Ananth Raghunathan <pseudorandom@google.com>
Date: Fri, 24 Jul 2015 11:23:56 -0700
Subject: [PATCH 59/67] Code review changes

- threw fitdistribution experimental code into separate function that is now
  only called by a flag passed to FitDistribution
- flag added to assoctest.sh to run comparisons to EM
- added package jsonlite to setup
- further documentation added in sum_bits_assoc
---
 analysis/R/decode2way.R               | 41 ++++--------
 analysis/tools/sum_bits_assoc.py      | 24 ++++++++
 analysis/tools/sum_bits_assoc_test.py |  2 +-
 assoctest.sh                          | 89 ++++++++++++++++-----------
 setup.sh                              |  2 +-
 tests/compare_assoc.R                 |  2 +-
 6 files changed, 91 insertions(+), 69 deletions(-)

diff --git a/analysis/R/decode2way.R b/analysis/R/decode2way.R
index ce52d341..e8b546fa 100644
--- a/analysis/R/decode2way.R
+++ b/analysis/R/decode2way.R
@@ -95,7 +95,8 @@ EstimateBloomCounts2Way <- function(params, obs_counts) {
 # Implements lsei
 FitDistribution2Way <- function(estimates_stds, map,
                                 fit = NULL,
-                                quiet = FALSE) {
+                                quiet = FALSE,
+                                add_constraints = FALSE) {
   X <- map
   Y <- as.vector(t(estimates_stds$estimates))
   m <- dim(X)[1]
@@ -103,25 +104,21 @@ FitDistribution2Way <- function(estimates_stds, map,
   
   G <- rbind2(Diagonal(n), rep(-1, n))
   H <- c(rep(0, n), -1)
-  lsei(A = X, B = Y, G = G, H = H, type = 2)$X
+  if (add_constraints == TRUE) {
+    res <- AddConstraints(fit, X, Y, m, n, G, H)
+    lsei(A = res$X, B = res$Y, G = res$G, H = res$H, type = 2)$X
+  } else {
+    lsei(A = X, B = Y, G = G, H = H, type = 2)$X
+  }
 }
 
-FitDistribution2WayAdditionalConstraints <- function(estimates_stds, map, fit) {
+AddConstraints <- function(fit, X, Y, m, n, G, H) {
   # Experimental code
   # Computes the same output as FitDistribution by 
   # additionally throwing in constraints corresponding to
   # 1-way marginals
   # Requires non-NULL fit as input (with "proportion" containing marginal info)
 
-  X <- as.matrix(map)
-  Y <- as.vector(t(estimates_stds$estimates))
-  m <- dim(X)[1]
-  n <- dim(X)[2]
-  wt <- 10000 #  weight to marginal constraints
-  
-  G <- rbind2(Diagonal(n), rep(-1, n))
-  H <- c(rep(0, n), -1)
-  
   # Adding marginals constraints to X and Y
   fstrs <- lapply(fit, function(x) x[,"string"]) #  found strings
   
@@ -143,24 +140,8 @@ FitDistribution2WayAdditionalConstraints <- function(estimates_stds, map, fit) {
     vec[indices] <- wt
     X <- rbind2(X, vec)
   }
-  
-  lsei(A = X, B = Y, G = G, H = H, type = 2)$X
-  
-  # Random projection params
-  #   size <- 10 * n
-  #   density <- 0.05
-  #   rproj <- matrix(0, size, m)
-  #   rproj[sample(length(rproj), size = density * length(rproj))] <- rnorm(density * length(rproj))
-  #   # rproj <- matrix(rnorm(10*n*m), 10*n, m)
-  #   Xproj <- rproj %*% X
-  #   Yproj <- as.vector(rproj %*% Y)
-  #   mproj <- dim(Xproj)[1]
-  #   nproj <- dim(Xproj)[2]
-  #   
-  #   G <- rbind2(Diagonal(nproj), rep(-1, nproj))
-  #   H <- c(rep(0, nproj), -1)
-  #   lsei(A = Xproj, B = Yproj, G = G, H = H, type = 2)$X
-}
+  list(X = X, Y = Y, G = G, H = H)
+} 
 
 Decode2Way <- function(counts, map, params, fit = NULL) {
   k <- params$k
diff --git a/analysis/tools/sum_bits_assoc.py b/analysis/tools/sum_bits_assoc.py
index 2263b671..8e01d669 100755
--- a/analysis/tools/sum_bits_assoc.py
+++ b/analysis/tools/sum_bits_assoc.py
@@ -21,6 +21,30 @@
 Output counts of bloom filter bits set for each variable (1-way totals)
 and counts of pairwise bits set (2-way totals) into files with suffixes
 _marg1.csv, _marg2.csv, _2way.csv respectively.
+
+The file formats for each of the files are as follows:
+_marg1.csv, _marg2.csv
+Each row corresponds to a cohort with:
+num reports, total count for bit 1, total count for bit 2, ...
+
+_2way.csv
+Each row corresponds to a cohort
+The first entry corresponds to total number of reports in that cohort
+The next set of values indicate 2 way counts grouped 4 elements at a time:
+  the first 4 refer to information about bit 1 of irr1 and bit 1 of irr2
+  the next 4 refer to information about bit 1 of irr1 and bit 2 of irr2
+  ...
+  the next 4 refer to information about bit 1 of irr1 and bit k of irr2
+  the next 4 refer to information about bit 2 of irr1 and bit 1 of irr2
+  (pairwise information about tuples is stored in a "1st report"-major order)
+  ...
+  the last 4 refer to information about bit k of irr1 and bit k of irr2
+
+  for each 4-tuple, the values represent the counts for the pair of bits from
+  irr1 and irr2 having the value:
+  11, 01, 10, and 00, respectively.
+
+  See sum_bits_assoc_test.py for an example
 """
 
 import csv
diff --git a/analysis/tools/sum_bits_assoc_test.py b/analysis/tools/sum_bits_assoc_test.py
index 11ac2ff1..e5ebb467 100755
--- a/analysis/tools/sum_bits_assoc_test.py
+++ b/analysis/tools/sum_bits_assoc_test.py
@@ -37,7 +37,7 @@
 # EXPECTED_F_2WAY
 #
 # NOTE: bit order is reversed.
-# First row is 65 zeroes
+# First row is 65 zeroes because there are no reports with cohort 0
 EXPECTED_F_2WAY = """\
 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,\
 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0\r
diff --git a/assoctest.sh b/assoctest.sh
index 17d02df6..06f0c7d4 100755
--- a/assoctest.sh
+++ b/assoctest.sh
@@ -8,18 +8,25 @@
 # At the end, it will print an HTML summary.
 #
 # Three main functions are
-#    run [[<pattern> [<num>]] - run tests matching <pattern> in
-#                               parallel, each <num> times.
+#    run [[<pattern> [<num> [<compare>]]] - run tests matching <pattern> in
+#                               parallel, each <num> times, additionally
+#                               running the EM algorithm if <compare> = T
 #
-#    run-seq [<pattern> [<num>]] - ditto, except that tests are run sequentially
+#    run-seq [<pattern> [<num> [<compare>]]] - ditto, except that tests are run 
+#                                              sequentially
 #
-#    run-all [<num>]             - run all tests, in parallel, each <num> times
+#    run-all [<num> [<compare>]]             - run all tests, in parallel,
+#                                              each <num> times
+#
+# Note: Patterns always start with a-.
 #
 # Examples:
-# $ ./assoctest.sh run-seq tiny-8x16-     # Sequential run, matches 2 cases
-# $ ./assoctest.sh run-seq tiny-8x16- 3   # Sequential, each test is run three
-#                                           times
-# $ ./assoctest.sh run-all                # Run all tests once
+# $ ./assoctest.sh run-seq a-toy      # Sequential run, matches 2 cases
+# $ ./assoctest.sh run-seq a-fizz 3   # Sequential, each test is run three
+#                                       times
+# $ ./assoctest.sh run-all            # Run all tests once
+# $ ./assoctest.sh run-all 5 T        # Run all tests five times with EM
+#                                       comparisons
 #
 # The <pattern> argument is a regex in 'grep -E' format. (Detail: Don't
 # use $ in the pattern, since it matches the whole spec line and not just the
@@ -27,7 +34,6 @@
 #
 # fast_counts param inherited from regtest.sh, but currently not used
 
-
 set -o nounset
 set -o pipefail
 set -o errexit
@@ -107,7 +113,7 @@ _run-one-instance() {
 
   read -r case_name num_unique_values num_unique_values2 \
     num_clients num_extras \
-    num_bits num_hashes num_cohorts p q f < $case_dir/spec.txt
+    num_bits num_hashes num_cohorts p q f compare < $case_dir/spec.txt
 
   local instance_dir=$ASSOCTEST_DIR/$test_case/$test_instance
   mkdir --verbose -p $instance_dir
@@ -165,29 +171,34 @@ _run-one-instance() {
   # substantial) map file. Timing below is more inclusive.
   TIMEFORMAT='Running analyze.R took %R seconds'
 
-  # Setting up JSON file with python
-  python -c "import json; \
-    f = file('$instance_dir/analyze_inp.json', 'w'); \
-    inp = dict(); \
-    inp['maps'] = ['$case_dir/case_map1.csv',\
-                   '$case_dir/case_map2.csv']; \
-    inp['reports'] = '$instance_dir/case_reports.csv'; \
-    inp['truefile'] = '$instance_dir/case.csv'; \
-    inp['outdir'] = '$out_dir'; \
-    inp['params'] = '$case_dir/case_params.csv'; \
-    inp['newalg'] = 'false'; \
-    inp['numvars'] = 2; \
-    inp['num'] = $num_clients; \
-    inp['extras'] = $num_extras; \
-    inp['varcandidates'] = [$num_unique_values, $num_unique_values2]; \
-    inp['verbose'] = 'true'; \
-    inp['counts'] = ['$instance_dir/case_2way.csv',\
-                     '$instance_dir/case_marg1.csv',\
-                     '$instance_dir/case_marg2.csv']; \
-    inp['expt'] = ['external-counts']; \
-    json.dump(inp, f); \
-    f.close();"
-
+  # Setting up JSON file
+  json_file="{\
+    \"maps\":           [\"$case_dir/case_map1.csv\",\
+                       \"$case_dir/case_map2.csv\"],\
+    \"reports\":        \"$instance_dir/case_reports.csv\",\
+    \"truefile\":       \"$instance_dir/case.csv\",\
+    \"outdir\":         \"$out_dir\",\
+    \"params\":         \"$case_dir/case_params.csv\",\
+    \"newalg\":         \"false\",\
+    \"numvars\":        2,\
+    \"num\":            $num_clients,\
+    \"extras\":         $num_extras,\
+    \"varcandidates\":  [$num_unique_values, $num_unique_values2],\
+    \"verbose\":        \"true\",\
+    \"counts\":         [\"$instance_dir/case_2way.csv\",\
+                        \"$instance_dir/case_marg1.csv\",\
+                        \"$instance_dir/case_marg2.csv\"],"
+
+  # Adding EM comparison depending on $compare flag
+  if test $compare = F; then
+    json_file=$json_file"\"expt\": [\"external-counts\"]"
+  else 
+    json_file=$json_file"\"expt\": [\"external-counts\", \
+      \"external-reports-em\"]"
+  fi
+  json_file=$json_file"}"
+  echo $json_file > $instance_dir/analyze_inp.json
+  
   time {
     tests/compare_assoc.R --inp $instance_dir/analyze_inp.json
   }
@@ -254,12 +265,15 @@ _setup-test-instances() {
 #   instances: A number of times each test case is run
 #   parallel: Whether the tests are run in parallel (T/F)
 #   fast_counts: Whether counts are sampled directly (T/F)
+#   compare: Whether the tests run comparisons between EM and Marginal
+#   algorithms or not
 #
 _run-tests() {
   local spec_regex=$1  # grep -E format on the spec
   local instances=$2
   local parallel=$3
   local fast_counts=$4
+  local $compare=$5
 
   rm -r -f --verbose $ASSOCTEST_DIR
 
@@ -270,6 +284,7 @@ _run-tests() {
   echo $instances
   echo $parallel
   echo $fast_counts
+  echo $compare
 
   local func
   local processors=1
@@ -290,7 +305,7 @@ _run-tests() {
   fi
 
   local cases_list=$ASSOCTEST_DIR/test-cases.txt
-  tests/assoctest_spec.py | grep -E $spec_regex > $cases_list
+  tests/assoctest_spec.py | grep -E $spec_regex | sed "s/$/ $compare/" > $cases_list
 
   # Generate parameters for all test cases.
   cat $cases_list \
@@ -314,18 +329,20 @@ _run-tests() {
 run-seq() {
   local spec_regex=${1:-'^a-'}  # grep -E format on the spec
   local instances=${2:-1}
+  local compare=${3:-F}
 
-  _run-tests $spec_regex $instances F T
+  _run-tests $spec_regex $instances F T $compare
 }
 
 # Run tests in parallel
 run-all() {
   local instances=${1:-1}
+  local compare=${2:-F}
 
   log "Running all tests. Can take a while."
   # a- for assoc tests
   # F for sequential
-  _run-tests '^a-' $instances T T
+  _run-tests '^a-' $instances T T $compare
 }
 
 "$@"
diff --git a/setup.sh b/setup.sh
index 90b6537f..fbaaff41 100755
--- a/setup.sh
+++ b/setup.sh
@@ -30,7 +30,7 @@ r-packages() {
   # glmnet, limSolve: solvers for decode.R
   # RJSONIO: for analysis_tool.R
   sudo R -e \
-    'install.packages(c("glmnet", "optparse", "limSolve", "RUnit", "abind", "RJSONIO"), repos="http://cran.rstudio.com/")'
+    'install.packages(c("glmnet", "optparse", "limSolve", "RUnit", "abind", "RJSONIO", "jsonlite"), repos="http://cran.rstudio.com/")'
 }
 
 # R 3.0.2 on Trusty is out of date with CRAN, so we need this workaround.
diff --git a/tests/compare_assoc.R b/tests/compare_assoc.R
index 86dad21e..b0b3a718 100755
--- a/tests/compare_assoc.R
+++ b/tests/compare_assoc.R
@@ -18,7 +18,7 @@
 # an EM algorithm to estimate joint distribution over two or more variables
 #
 # Usage:
-#       $ ./analyze_assoc_expt.R --inp <input JSON file>
+#       $ ./compare_assoc.R --inp <input JSON file>
 #
 # Input file: 
 # Outputs:

From 92590b815c5795066c517042f7f4c19b343b120d Mon Sep 17 00:00:00 2001
From: Ananth Raghunathan <pseudorandom@google.com>
Date: Fri, 24 Jul 2015 13:03:58 -0700
Subject: [PATCH 60/67] Remove display of compare flag in results.

---
 tests/make_summary_assoc.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/make_summary_assoc.py b/tests/make_summary_assoc.py
index f1b0d1ca..0f1af04d 100755
--- a/tests/make_summary_assoc.py
+++ b/tests/make_summary_assoc.py
@@ -130,9 +130,9 @@ def ParseSpecFile(spec_filename, empty = False):
   with open(spec_filename) as s:
     spec_row = s.readline().split()
 
-  spec_in_html = ' '.join('<td>%s</td>' % cell for cell in spec_row[1:])
+  spec_in_html = ' '.join('<td>%s</td>' % cell for cell in spec_row[1:len(spec_row)-1])
   if empty == True:
-    spec_in_html = ' '.join('<td></td>' for cell in spec_row[1:])
+    spec_in_html = ' '.join('<td></td>' for cell in spec_row[1:len(spec_row)-1])
 
   return spec_in_html
 

From 21080617e215906ce8779ebdc18cdf71c15df3a2 Mon Sep 17 00:00:00 2001
From: Ananth Raghunathan <pseudorandom@google.com>
Date: Mon, 27 Jul 2015 11:16:01 -0700
Subject: [PATCH 61/67] Reconciled with old decode.R for assoc pruning.

---
 analysis/R/decode.R | 85 ++++++++++++++++++++++++---------------------
 1 file changed, 46 insertions(+), 39 deletions(-)

diff --git a/analysis/R/decode.R b/analysis/R/decode.R
index ba9eb9c6..fe314cd9 100644
--- a/analysis/R/decode.R
+++ b/analysis/R/decode.R
@@ -97,26 +97,22 @@ FitLasso <- function(X, Y, intercept = TRUE) {
   #    a vector of size ncol(X) of coefficients.
 
   # TODO(mironov): Test cv.glmnet instead of glmnet
-
-  # Cap the number of non-zero coefficients to 500 or 80% of the number of
-  # constraints, whichever is less. The 500 cap is for performance reasons, 80%
-  # is to avoid overfitting.
-  cap <- min(500, nrow(X) * .8, ncol(X))
-
-  # TODO: take care of corner case when ncol(X) == 1
-  # currently glmnet() fails
-  mod <- glmnet(X, Y, standardize = FALSE, intercept = intercept,
-                lower.limits = 0,  # outputs are non-negative
-                pmax = cap)
-
-  coefs <- coef(mod)
-  coefs <- coefs[-1, , drop = FALSE]  # drop the intercept
-  l1cap <- sum(colSums(coefs) <= 1.0)  # find all columns with L1 norm <= 1
-  if(l1cap > 0)
-    distr <- coefs[, l1cap]  # return the last set of coefficients with L1 <= 1
-  else
-    distr <- setNames(rep(0, ncol(X)), colnames(X))
-  distr
+  mod <- try(glmnet(X, Y, standardize = FALSE, intercept = intercept,
+                    lower.limits = 0,  # outputs are non-negative
+                    # Cap the number of non-zero coefficients to 500 or
+                    # 80% of the length of Y, whichever is less. The 500 cap
+                    # is for performance reasons, 80% is to avoid overfitting.
+                    pmax = min(500, length(Y) * .8)),
+             silent = TRUE)
+
+  # If fitting fails, return all-zero coefficients named by colnames(X).
+  if (class(mod)[1] == "try-error") {
+    coefs <- setNames(rep(0, ncol(X)), colnames(X))
+  } else {
+    coefs <- coef(mod)
+    coefs <- coefs[-1, ncol(coefs), drop = FALSE]  # coefs[1] is the intercept
+  }
+  coefs
 }
 
 PerformInference <- function(X, Y, N, mod, params, alpha, correction) {
@@ -227,13 +223,30 @@ FitDistribution <- function(estimates_stds, map, quiet = FALSE) {
   #   according to this vector approximates estimates
 
   S <- ncol(map)  # total number of candidates
-  lasso <- FitLasso(map, as.vector(t(estimates_stds$estimates)))
-  
-  if(!quiet)
-    cat("LASSO selected ", sum(lasso > 0), " non-zero coefficients.\n")
 
-  names(lasso) <- colnames(map)
-  lasso
+  support_coefs <- 1:S
+
+  if (S > length(estimates_stds$estimates) * .8) {
+    # the system is close to being underdetermined
+    lasso <- FitLasso(map, as.vector(t(estimates_stds$estimates)))
+
+    # Select non-zero coefficients.
+    support_coefs <- which(lasso > 0)
+
+    if(!quiet)
+      cat("LASSO selected ", length(support_coefs), " non-zero coefficients.\n")
+  }
+
+  coefs <- setNames(rep(0, S), colnames(map))
+
+  if(length(support_coefs) > 0) {  # LASSO may return an empty list
+    constrained_coefs <- ConstrainedLinModel(map[, support_coefs, drop = FALSE],
+                                             estimates_stds)
+
+    coefs[support_coefs] <- constrained_coefs
+  }
+
+  coefs
 }
 
 Resample <- function(e) {
@@ -247,7 +260,7 @@ Resample <- function(e) {
   list(estimates = estimates, stds = stds)
 }
 
-Decode <- function(counts, map, params, alpha = 0.05, quick = FALSE,
+Decode <- function(counts, map, params, alpha = 0.05,
                    correction = c("Bonferroni"), quiet = FALSE, ...) {
   k <- params$k
   p <- params$p
@@ -273,11 +286,10 @@ Decode <- function(counts, map, params, alpha = 0.05, quick = FALSE,
          stds = es$stds[filter_cohorts, , drop = FALSE])
 
   coefs_all <- vector()
+
   # Run the fitting procedure several times (5 seems to be sufficient and not
   # too many) to estimate standard deviation of the output.
-  if(quick) {num_reps <- 2} else {num_reps <- 5}
-  for(r in 1:num_reps)
-  {
+  for(r in 1:5) {
     if(r > 1)
       e <- Resample(estimates_stds_filtered)
     else
@@ -287,23 +299,16 @@ Decode <- function(counts, map, params, alpha = 0.05, quick = FALSE,
                        FitDistribution(e, map[filter_bits, , drop = FALSE],
                                        quiet))
   }
-  
-  FitDistribution(e, map[filter_bits, , drop = FALSE], quiet)
+
   coefs_ssd <- N * apply(coefs_all, 2, sd)  # compute sample standard deviations
   coefs_ave <- N * apply(coefs_all, 2, mean)
 
   # Only select coefficients more than two standard deviations from 0. May
   # inflate empirical SD of the estimates.
-  reported <- which(coefs_ave > 1E-6 + 1 * coefs_ssd)
+  reported <- which(coefs_ave > 1E-6 + 2 * coefs_ssd)
 
   mod <- list(coefs = coefs_ave[reported], stds = coefs_ssd[reported])
 
-#   Old code  ...
-#     coefs_all <- FitDistribution(estimates_stds_filtered,
-#                                         map[filter_bits, , drop = FALSE])
-#     reported <- which(coefs_all > 1E-6)
-#     mod <- list(coefs = coefs_all[reported], stds = rep(0, length(reported)))
-
   if (correction == "Bonferroni") {
     alpha <- alpha / S
   }
@@ -333,8 +338,10 @@ Decode <- function(counts, map, params, alpha = 0.05, quick = FALSE,
   # Clamp estimated proportion.  pmin/max: vectorized min and max
   fit$prop_low_95 <- pmax(low_95, 0.0)
   fit$prop_high_95 <- pmin(high_95, 1.0)
+
   fit <- fit[, c("string", "estimate", "std_error", "proportion",
                  "prop_std_error", "prop_low_95", "prop_high_95")]
+
   allocated_mass <- sum(fit$proportion)
   num_detected <- nrow(fit)
 

From 22fa769365ece1ceec9497b0926cdf616ea598b8 Mon Sep 17 00:00:00 2001
From: Ananth Raghunathan <pseudorandom@google.com>
Date: Mon, 27 Jul 2015 11:19:08 -0700
Subject: [PATCH 62/67] Fixed expected_f_2way in sum bits assoc test

---
 analysis/tools/sum_bits_assoc_test.py | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/analysis/tools/sum_bits_assoc_test.py b/analysis/tools/sum_bits_assoc_test.py
index e5ebb467..e19b9fed 100755
--- a/analysis/tools/sum_bits_assoc_test.py
+++ b/analysis/tools/sum_bits_assoc_test.py
@@ -38,34 +38,37 @@
 #
 # NOTE: bit order is reversed.
 # First row is 65 zeroes because there are no reports with cohort 0
-EXPECTED_F_2WAY = """\
+expected_f_2way = """\
 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,\
 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0\r
 """
 
 # Cohort 1
 # Total # of reports
-EXPECTED_F_2WAY = EXPECTED_F_2WAY + "3,"
+expected_f_2way = expected_f_2way + "3,"
 
 # Looking at LSBs of both irrs
 # Total # of (11, 01, 10, 00) that appear
-EXPECTED_F_2WAY = EXPECTED_F_2WAY + "0,0,2,1,"
+expected_f_2way = expected_f_2way + "0,0,2,1,"
 
 # Report 1-major order. So looking at LSB of irr1 and 2nd LSB of irr2
-EXPECTED_F_2WAY = EXPECTED_F_2WAY + "2,0,0,1,"
+expected_f_2way = expected_f_2way + "2,0,0,1,"
 
 # And so on ...
-EXPECTED_F_2WAY = EXPECTED_F_2WAY + "0,0,2,1,"
-EXPECTED_F_2WAY = EXPECTED_F_2WAY + "2,0,0,1,"
+expected_f_2way = expected_f_2way + "0,0,2,1,"
+expected_f_2way = expected_f_2way + "2,0,0,1,"
 
 # Now moving on to 2nd LSB of irr1
-EXPECTED_F_2WAY = EXPECTED_F_2WAY + ("0,0,2,1,2,0,0,1," * 2)
+expected_f_2way = expected_f_2way + ("0,0,2,1,2,0,0,1," * 2)
 
 # Now moving on to 3rd LSB of irr1
 # Note that for 3rd LSB of irr1 and LSB of irr2, there are three 00s
-EXPECTED_F_2WAY = EXPECTED_F_2WAY + ("0,0,0,3,0,2,0,1," * 2)
+expected_f_2way = expected_f_2way + ("0,0,0,3,0,2,0,1," * 2)
 # MSB of irr1
-EXPECTED_F_2WAY = EXPECTED_F_2WAY + "0,0,0,3,0,2,0,1," + "0,0,0,3,0,2,0,1\r\n"
+expected_f_2way = expected_f_2way + "0,0,0,3,0,2,0,1," + "0,0,0,3,0,2,0,1\r\n"
+
+# EXPECTED_F_2WAY is a constant
+EXPECTED_F_2WAY = expected_f_2way
 
 # end of EXPECTED_F_2WAY
 # ###############################

From 217417204c27e946038cb0f62ac2edb2e6a4d450 Mon Sep 17 00:00:00 2001
From: Ananth Raghunathan <pseudorandom@google.com>
Date: Mon, 27 Jul 2015 14:02:59 -0700
Subject: [PATCH 63/67] Wrapper for running quick analysis.

---
 analysis/R/assoc.R       | 171 +++++++++++++++++++++++++++++++++++++++
 analysis/R/association.R |  19 ++---
 quick_assoc.sh           |  75 +++++++++++++++++
 3 files changed, 256 insertions(+), 9 deletions(-)
 create mode 100755 analysis/R/assoc.R
 create mode 100755 quick_assoc.sh

diff --git a/analysis/R/assoc.R b/analysis/R/assoc.R
new file mode 100755
index 00000000..a7dc63d6
--- /dev/null
+++ b/analysis/R/assoc.R
@@ -0,0 +1,171 @@
+#!/usr/bin/env Rscript
+#
+# Copyright 2015 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Reads map files, report files, and RAPPOR parameters to run
+# an EM algorithm to estimate joint distribution over two or more variables
+#
+# Usage:
+#       $ ./assoc.R --inp <JSON file>
+#
+# Input: JSON file with the following fields
+#        "maps" for map files of each var; "numvars" for the number of vars
+#        "reports" for a list of reports
+#        "counts" for 2 way marginal counts, individual marginal counts
+#                 respectively
+#        "params" for params file with RAPPOR params
+#        "csv_out" for a file name into which results will be written
+#                 as comma separated values
+#        "time" to also print timings; "also_em" to also run the EM algorithm
+# Output: A table with joint distribution to stdout and csv file with results
+
+library("jsonlite")
+library("optparse")
+
+options(stringsAsFactors = FALSE)
+
+if(!interactive()) {
+  option_list <- list(
+    make_option(c("--inp"), default = "inp.json",
+                help = "JSON file with inputs for assoc.R"))
+  opts <- parse_args(OptionParser(option_list = option_list))
+}
+
+source("analysis/R/decode2way.R")
+source("analysis/R/encode.R")
+source("analysis/R/decode.R")
+source("analysis/R/simulation.R")
+source("analysis/R/read_input.R")
+source("analysis/R/association.R")
+source("tests/gen_counts.R")
+source("tests/compare_assoc.R")  # For CombineMaps; it should be moved elsewhere
+
+TwoWayAlg <- function(inp) {
+  # Prints the joint distribution of two variables estimated from 2-way counts.
+  # inp: parsed JSON input; reads $params, $maps, $numvars, $counts, $time.
+  ptm <- proc.time()
+  params <- ReadParameterFile(inp$params)
+  # Ensure sufficient maps as required by number of vars
+  stopifnot(inp$numvars == length(inp$maps))
+  # Correct map from ReadMapFile() for assoc analysis
+  map <- lapply(inp$maps, function(o)
+    CorrectMapForAssoc(ReadMapFile(o, params = params),
+                       params = params))
+
+  # (2 way counts, marginal 1 counts, marginal 2 counts)
+  counts <- lapply(1:3, function(i) ReadCountsFile(inp$counts[[i]]))
+
+  # TODO: account for different parameters across different variables
+  params2 <- params
+  params2$k <- (params$k ** 2) * 4
+
+  # Prune candidates
+  fit <- lapply(1:2, function(i)
+    Decode(counts[[i + 1]],
+           map[[i]]$rmap,
+           params, quick = FALSE)$fit)
+  found_strings <- list(fit[[1]][,"string"], fit[[2]][,"string"])
+  if (length(found_strings[[1]]) == 0 || length(found_strings[[2]]) == 0) {
+    stop("No strings found in 1-way marginal.")
+  }
+
+  # Combine maps to feed into Decode2Way
+  # Prune first to found_strings from Decode on 1-way counts
+  pruned <- lapply(1:2, function(i)
+    lapply(map[[i]]$map, function(z) z[,found_strings[[i]], drop = FALSE]))
+  crmap <- CombineMaps(pruned[[1]], pruned[[2]])$crmap
+  marginal <- Decode2Way(counts[[1]], crmap, params2, fit = fit)$fit
+
+  # Reconstruct 2-way table from marginals
+  ed <- matrix(0, nrow = length(found_strings[[1]]), ncol = length(found_strings[[2]]))
+  colnames(ed) <- found_strings[[2]]
+  rownames(ed) <- found_strings[[1]]
+  for (cols in found_strings[[2]]) {
+    for (rows in found_strings[[1]]) {
+      ed[rows, cols] <- marginal[paste(rows, cols, sep = "x"), "Estimate"]
+    }
+  }
+  ed[is.na(ed)] <- 0
+  ed[ed<0] <- 0
+
+  time_taken <- proc.time() - ptm
+  print("Two Way Algorithm Results")
+  print(ed[order(-rowSums(ed)), order(-colSums(ed)), drop = FALSE])
+  # isTRUE() treats a missing "time" field as FALSE instead of raising an error
+  if (isTRUE(inp$time == TRUE)) print(time_taken)
+}
+
+EMAlg <- function(inp) {
+  ptm <- proc.time()
+  params <- ReadParameterFile(inp$params)
+  # Ensure sufficient maps as required by number of vars
+  stopifnot(inp$numvars == length(inp$maps))
+  # Correct map from ReadMapFile() for assoc analysis
+  map <- lapply(inp$maps, function(o)
+    CorrectMapForAssoc(ReadMapFile(o, params = params),
+                       params = params))
+  
+  # Reports must be of the format
+  #     client name, cohort no, rappor bitstring 1, rappor bitstring 2, ...
+  reportsObj <- read.csv(inp$reports,
+                         colClasses = c("character", "integer",
+                                        rep("character", inp$numvars)),
+                         header = TRUE)
+  # Ignore the first column
+  reportsObj <- reportsObj[,-1]
+  
+  # Parsing reportsObj
+  # ComputeDistributionEM allows for different sets of cohorts
+  # for each variable. Here, both sets of cohorts are identical
+  co <- as.list(reportsObj[1])[[1]]
+  co <- co + 1  # 1 indexing
+  cohorts <- rep(list(co), inp$numvars)
+  # Parse reports from reportObj cols 2, 3, ...
+  reports <- lapply(1:inp$numvars, function(x) as.list(reportsObj[x + 1]))
+  
+  # Split strings into bit arrays (as required by assoc analysis)
+  reports <- lapply(1:inp$numvars, function(i) {
+    # apply the following function to each of reports[[1]] and reports[[2]]
+    lapply(reports[[i]][[1]], function(x) {
+      # function splits strings and converts them to numeric values
+      # rev needed for endianness
+      rev(as.numeric(strsplit(x, split = "")[[1]]))
+    })
+  })
+  
+  joint_dist <- ComputeDistributionEM(reports, cohorts, map,
+                                      ignore_other = TRUE,
+                                      quick = TRUE,
+                                      params, marginals = NULL,
+                                      estimate_var = FALSE,
+                                      verbose = inp$time)
+  em <- joint_dist$fit
+  time_taken <- proc.time() - ptm
+  print("EM Algorithm Results")
+  print(em[order(-rowSums(em)), order(-colSums(em))])
+  if(inp$time == TRUE)
+    print(time_taken)
+}
+
+main <- function(opts) {
+  # Runs TwoWayAlg on the JSON file opts$inp, then EMAlg if "also_em" is true.
+  inp <- fromJSON(opts$inp)
+  TwoWayAlg(inp)
+  if (isTRUE(inp$also_em == TRUE)) EMAlg(inp)  # missing "also_em" means no EM
+}
+
+if(!interactive()) {
+  main(opts)
+}
\ No newline at end of file
diff --git a/analysis/R/association.R b/analysis/R/association.R
index f2d6f59c..eb561267 100644
--- a/analysis/R/association.R
+++ b/analysis/R/association.R
@@ -219,8 +219,8 @@ EM <- function(cond_prob, starting_pij = NULL, estimate_var = FALSE,
       pij[[i + 1]] <- UpdatePij(pij[[i]], cond_prob)
       dif <- max(abs(pij[[i + 1]] - pij[[i]]))
       if (i == 1) {
-        print("ONE ITERATION")
-        print(proc.time() - ptm_iter)
+        PrintIfVerbose("ONE ITERATION", verbose)
+        PrintIfVerbose(proc.time() - ptm_iter, verbose)
       }
       if (dif < epsilon) {
         break
@@ -292,7 +292,8 @@ ComputeDistributionEM <- function(reports, report_cohorts,
                                   maps, ignore_other = FALSE,
                                   params, quick = FALSE,
                                   marginals = NULL,
-                                  estimate_var = FALSE) {
+                                  estimate_var = FALSE,
+                                  verbose = FALSE) {
   # Computes the distribution of num_variables variables, where
   #     num_variables is chosen by the client, using the EM algorithm.
   #
@@ -334,8 +335,8 @@ ComputeDistributionEM <- function(reports, report_cohorts,
       variable_counts <- ComputeCounts(variable_report, variable_cohort, params)
       marginal <- Decode(variable_counts, map$rmap, params, quick,
                          quiet = TRUE)$fit
-      print("TIME IN MARGINALS")
-      print(proc.time() - ptm2)
+      PrintIfVerbose("TIME IN MARGINALS", verbose)
+      PrintIfVerbose(proc.time() - ptm2, verbose)
       if (nrow(marginal) == 0) {
         return (NULL)
       }
@@ -373,16 +374,16 @@ ComputeDistributionEM <- function(reports, report_cohorts,
     # Update the joint conditional distribution of all variables
     joint_conditional <- UpdateJointConditional(cond_report_dist,
                                               joint_conditional)
-    print("TIME IN COND_REPORT_DIST")
-    print(proc.time()-ptm)
+    PrintIfVerbose("TIME IN COND_REPORT_DIST", verbose)
+    PrintIfVerbose(proc.time()-ptm, verbose)
   }
 
   ptm <- proc.time()
   # Run expectation maximization to find joint distribution
   em <- EM(joint_conditional, epsilon = 10 ^ -6, verbose = FALSE,
            estimate_var = estimate_var)
-  print("TIME IN EM")
-  print(proc.time() - ptm)
+  PrintIfVerbose("TIME IN EM", verbose)
+  PrintIfVerbose(proc.time() - ptm, verbose)
   dimnames(em$est) <- found_strings
 
   # Return results in a usable format
diff --git a/quick_assoc.sh b/quick_assoc.sh
new file mode 100755
index 00000000..024e321e
--- /dev/null
+++ b/quick_assoc.sh
@@ -0,0 +1,75 @@
+#!/bin/bash
+#
+# Quick script to wrap assoc.R
+#
+# Usage:
+#   ./quick_assoc.sh <dir name> [<EM also? T/F>]
+#
+# For directory name $dir, quick_assoc.sh expects the following files:
+#   $dir/map1.csv -- map files
+#   $dir/map2.csv
+#   $dir/reports.csv -- these are the raw reports
+#   $dir/params.csv -- parameters file
+#
+# At the end, it will output results of the Two Way Algorithm and EM algorithm
+# (if EM also is set to T) to stdout
+#
+# Examples:
+# $ ./quick_assoc.sh . T
+
+readonly THIS_DIR=$(dirname $0)
+readonly REPO_ROOT=$THIS_DIR
+readonly CLIENT_DIR=$REPO_ROOT/client/python
+readonly MAP_SUFFIX=map
+readonly COUNT_SUFFIX=count
+
+# All the Python tools need this
+export PYTHONPATH=$CLIENT_DIR
+
+_run-input() {
+  # Sum the reports in directory $1, write inp.json and run assoc.R on it.
+  # $1: directory with the input files; $2: T to also run the EM algorithm.
+  analysis/tools/sum_bits_assoc.py \
+    "$1/params.csv" \
+    "$1/$COUNT_SUFFIX" \
+    < "$1/reports.csv"
+
+  # Output format of the 'time' block at the end of this function, which
+  # wraps the whole assoc.R invocation.
+  TIMEFORMAT='Running assoc.R took %R seconds'
+
+  # Setting up JSON file inp.json in current directory. All three count files
+  # are looked up under the "$1/$COUNT_SUFFIX" prefix given to the tool above.
+  json_file="{\
+    \"time\":           false,\
+    \"maps\":           [\"$1/${MAP_SUFFIX}1.csv\",\
+                       \"$1/${MAP_SUFFIX}2.csv\"],\
+    \"reports\":        \"$1/reports.csv\",\
+    \"params\":         \"$1/params.csv\",\
+    \"numvars\":        2,\
+    \"verbose\":        \"false\",\
+    \"counts\":         [\"$1/${COUNT_SUFFIX}_2way.csv\",\
+                        \"$1/${COUNT_SUFFIX}_marg1.csv\",\
+                        \"$1/${COUNT_SUFFIX}_marg2.csv\"],"
+
+  # Adding EM comparison depending on flag
+  if test "$2" = T; then
+    json_file=$json_file"\"also_em\": true"
+  else
+    json_file=$json_file"\"also_em\": false"
+  fi
+  json_file=$json_file"}"
+  echo $json_file > inp.json
+
+  time {
+    analysis/R/assoc.R --inp inp.json
+  }
+}
+
+main() {
+  local dir=${1:?"usage: $0 <dir name> [<EM also? T/F>]"}  # required input dir
+  local also_em=${2:-F}                                    # T also runs EM
+  _run-input "$dir" "$also_em"
+}
+
+main "$@"

From bda727535befb5e9e2b7294f3ef5a8b16d295611 Mon Sep 17 00:00:00 2001
From: Ananth Raghunathan <pseudorandom@google.com>
Date: Mon, 27 Jul 2015 14:29:53 -0700
Subject: [PATCH 64/67] Clean up in assoctest.sh

---
 assoctest.sh | 23 -----------------------
 1 file changed, 23 deletions(-)

diff --git a/assoctest.sh b/assoctest.sh
index 06f0c7d4..42e1799f 100755
--- a/assoctest.sh
+++ b/assoctest.sh
@@ -140,29 +140,6 @@ _run-one-instance() {
     < $instance_dir/case_reports.csv
 
 
-  # Setting up JSON file containing assoc_sim inputs with python
-  # Currently unused as true values and RAPPOR'd reports are generated
-  # running gen_assoc_reports.R, rappor_assoc_sim.py, and sum_bits_assoc.py
-  python -c "import json; \
-    f = file('$instance_dir/assoc_inp.json', 'w'); \
-    inp = dict(); \
-    inp['params'] = '$case_dir/case_params.csv'; \
-    inp['reports'] = '$instance_dir/reports.csv'; \
-    inp['true'] = '$instance_dir/truedist.csv'; \
-    inp['map'] = '$instance_dir/map'; \
-    inp['num'] = $num_clients; \
-    inp['extras'] = 0; \
-    inp['distr'] = 'zipf2'; \
-    inp['prefix'] = './'; \
-    inp['vars'] = 2; \
-    inp['varcandidates'] = [$num_unique_values, $num_unique_values2]; \
-    json.dump(inp, f); \
-    f.close();"
-
-  # Currently unused as true values and RAPPOR'd reports are generated
-  # running gen_assoc_reports.R, rappor_assoc_sim.py, and sum_bits_assoc.py
-  # tests/assoc_sim_expt.R --inp $instance_dir/assoc_inp.json
-
   local out_dir=${instance_dir}_report
   mkdir --verbose -p $out_dir
 

From 5e665da877b2be017f3b50691205a3a9c4def289 Mon Sep 17 00:00:00 2001
From: Ananth Raghunathan <pseudorandom@google.com>
Date: Tue, 4 Aug 2015 21:00:11 -0700
Subject: [PATCH 65/67] Modifications to work with basic associations.

---
 analysis/R/assoc.R       | 16 +++++++++++++++-
 analysis/R/association.R | 14 +++++++-------
 analysis/R/decode.R      | 11 ++++++++---
 3 files changed, 30 insertions(+), 11 deletions(-)

diff --git a/analysis/R/assoc.R b/analysis/R/assoc.R
index a7dc63d6..db2af246 100755
--- a/analysis/R/assoc.R
+++ b/analysis/R/assoc.R
@@ -114,9 +114,20 @@ EMAlg <- function(inp) {
   stopifnot(inp$numvars == length(inp$maps))
   # Correct map from ReadMapFile() for assoc analysis
   map <- lapply(inp$maps, function(o)
-    CorrectMapForAssoc(ReadMapFile(o, params = params),
+    CorrectMapForAssoc(LoadMapFile(o, params = params),
                        params = params))
   
+  # For BASIC only
+  m1 <- lapply(1:params$m, function(z) {
+    m <- sparseMatrix(c(1), c(2), dims = c(1, 2))
+    colnames(m) <- c("FALSE", "TRUE")
+    m
+  })
+  m2 <- sparseMatrix(1:params$m, rep(2, params$m))
+  colnames(m2) <- colnames(m1[[1]])
+  map[[2]]$map <- m1
+  map[[2]]$rmap <- m2
+  
   # Reports must be of the format
   #     client name, cohort no, rappor bitstring 1, rappor bitstring 2, ...
   reportsObj <- read.csv(inp$reports,
@@ -126,6 +137,9 @@ EMAlg <- function(inp) {
   # Ignore the first column
   reportsObj <- reportsObj[,-1]
   
+  params = list(params, params)
+  params[[2]]$k = 1
+  
   # Parsing reportsObj
   # ComputeDistributionEM allows for different sets of cohorts
   # for each variable. Here, both sets of cohorts are identical
diff --git a/analysis/R/association.R b/analysis/R/association.R
index eb561267..56a95749 100644
--- a/analysis/R/association.R
+++ b/analysis/R/association.R
@@ -332,8 +332,8 @@ ComputeDistributionEM <- function(reports, report_cohorts,
     variable_counts <- NULL
     if (is.null(marginals)) {
       ptm2 <- proc.time()
-      variable_counts <- ComputeCounts(variable_report, variable_cohort, params)
-      marginal <- Decode(variable_counts, map$rmap, params, quick,
+      variable_counts <- ComputeCounts(variable_report, variable_cohort, params[[j]])
+      marginal <- Decode(variable_counts, map$rmap, params[[j]], quick,
                          quiet = TRUE)$fit
       PrintIfVerbose("TIME IN MARGINALS", verbose)
       PrintIfVerbose(proc.time() - ptm2, verbose)
@@ -346,26 +346,26 @@ ComputeDistributionEM <- function(reports, report_cohorts,
     found_strings[[j]] <- marginal$string
 
     if (ignore_other) {
-      prob_other <- vector(mode = "list", length = params$m)
+      prob_other <- vector(mode = "list", length = params[[j]]$m)
     } else {
       if (is.null(variable_counts)) {
         variable_counts <- ComputeCounts(variable_report, variable_cohort,
-                                         params)
+                                         params[[j]])
       }
       prob_other <- GetOtherProbs(variable_counts, map$map, marginal,
-                                  params)
+                                  params[[j]])
       found_strings[[j]] <- c(found_strings[[j]], "Other")
     }
     
     GetCondProb(variable_report[[1]], candidate_strings = rownames(marginal),
-                params = params, map$map[[variable_cohort[1]]], prob_other[[variable_cohort[1]]])
+                params = params[[j]], map$map[[variable_cohort[1]]], prob_other[[variable_cohort[1]]])
 
     # Get the joint conditional distribution
     cond_report_dist <- lapply(seq(length(variable_report)), function(i) {
       idx <- variable_cohort[i]
       rep <- GetCondProb(variable_report[[i]],
                          candidate_strings = rownames(marginal),
-                         params = params,
+                         params = params[[j]],
                          map$map[[idx]],
                          prob_other[[idx]])
       rep
diff --git a/analysis/R/decode.R b/analysis/R/decode.R
index fe314cd9..626274e2 100644
--- a/analysis/R/decode.R
+++ b/analysis/R/decode.R
@@ -74,9 +74,14 @@ EstimateBloomCounts <- function(params, obs_counts) {
 
   # Transform counts from absolute values to fractional, removing bias due to
   #      variability of reporting between cohorts.
-  ests <- apply(ests, 1, function(x) x / obs_counts[,1])
-  stds <- apply(variances^.5, 1, function(x) x / obs_counts[,1])
-
+  if (ncol(obs_counts) == 2) {
+    ests <- apply(t(ests), 1, function(x) x / obs_counts[,1])
+    stds <- apply(t(variances^.5), 1, function(x) x / obs_counts[,1])
+  } else {
+    ests <- apply((ests), 1, function(x) x / obs_counts[,1])
+    stds <- apply((variances^.5), 1, function(x) x / obs_counts[,1])
+  }
+  
   # Some estimates may be set to infinity, e.g. if f=1. We want to
   #     account for this possibility, and set the corresponding counts
   #     to 0.

From 45052b97c750196b988e648fc603c26f3f863b0d Mon Sep 17 00:00:00 2001
From: Ananth Raghunathan <pseudorandom@google.com>
Date: Thu, 6 Aug 2015 10:41:45 -0700
Subject: [PATCH 66/67] Rigging old EM code to work with Basic assoc.

---
 analysis/R/assoc.R       |  2 +-
 analysis/R/association.R | 12 ++++++++----
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/analysis/R/assoc.R b/analysis/R/assoc.R
index db2af246..662ab1f6 100755
--- a/analysis/R/assoc.R
+++ b/analysis/R/assoc.R
@@ -160,7 +160,7 @@ EMAlg <- function(inp) {
   })
   
   joint_dist <- ComputeDistributionEM(reports, cohorts, map,
-                                      ignore_other = TRUE,
+                                      ignore_other = FALSE,
                                       quick = TRUE,
                                       params, marginals = NULL,
                                       estimate_var = FALSE,
diff --git a/analysis/R/association.R b/analysis/R/association.R
index 56a95749..328d3292 100644
--- a/analysis/R/association.R
+++ b/analysis/R/association.R
@@ -44,7 +44,7 @@ GetOtherProbs <- function(counts, map, marginal, params) {
   # Counts to remove from each cohort.
   top_counts <- ceiling(marginal$proportion * N / params$m)
   sum_top <- sum(top_counts)
-  candidate_map <- lapply(map, function(x) x[, candidate_strings])
+  candidate_map <- lapply(map, function(x) x[, candidate_strings, drop = FALSE])
 
   # Counts set by known strings without noise considerations.
   if (length(marginal) > 0) {
@@ -63,6 +63,10 @@ GetOtherProbs <- function(counts, map, marginal, params) {
   pstar <- (1 - f / 2) * p + (f / 2) * q
   top_counts_cohort <- (sum_top - top_counts_cohort) * pstar +
       top_counts_cohort * qstar
+  
+  # Adjustment for basic rappor
+  if(nrow(top_counts_cohort) == 1) 
+    top_counts_cohort <- t(top_counts_cohort)
   top_counts_cohort <- cbind(sum_top, top_counts_cohort)
 
   # Counts set by the "other" category.
@@ -72,6 +76,9 @@ GetOtherProbs <- function(counts, map, marginal, params) {
   props_other[props_other > 1] <- 1
   props_other[is.nan(props_other)] <- 0
   props_other[is.infinite(props_other)] <- 0
+  # Adjustment for basic rappor
+  if(is.null(nrow(props_other)))
+    props_other <- t(props_other)
   as.list(as.data.frame(props_other))
 }
 
@@ -356,9 +363,6 @@ ComputeDistributionEM <- function(reports, report_cohorts,
                                   params[[j]])
       found_strings[[j]] <- c(found_strings[[j]], "Other")
     }
-    
-    GetCondProb(variable_report[[1]], candidate_strings = rownames(marginal),
-                params = params[[j]], map$map[[variable_cohort[1]]], prob_other[[variable_cohort[1]]])
 
     # Get the joint conditional distribution
     cond_report_dist <- lapply(seq(length(variable_report)), function(i) {

From bde82f4cf454760d087a61bd69d82bf64e44c384 Mon Sep 17 00:00:00 2001
From: Ananth Raghunathan <pseudorandom@google.com>
Date: Tue, 8 Sep 2015 17:52:07 -0700
Subject: [PATCH 67/67] params causes a bug

---
 tests/compare_assoc.R | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/compare_assoc.R b/tests/compare_assoc.R
index b0b3a718..a8105662 100755
--- a/tests/compare_assoc.R
+++ b/tests/compare_assoc.R
@@ -473,7 +473,8 @@ ExternalReportsEM <- function(inp,
       rev(as.numeric(strsplit(x, split = "")[[1]]))
     })
   })
-    
+  
+  params = list(params, params)
   joint_dist <- ComputeDistributionEM(reports, cohorts, map,
                                       ignore_other = TRUE,
                                       quick = TRUE,