From 6e0e7c639896d032854306c019da9bcb7afb5e03 Mon Sep 17 00:00:00 2001 From: Ilya Mironov Date: Mon, 13 Jul 2015 14:43:55 -0700 Subject: [PATCH 01/14] Revert "Merge branch 'master' of github.com:google/rappor into drop-QP" This reverts commit 408ed08240d1401c639085d7bb64c720676316ea, reversing changes made to 5ab1f4f15775a8e2c2ed25560780778517fd604a. --- analysis/R/analysis_lib.R | 90 ++++++++ analysis/R/analysis_tool.R | 207 +++++++++++++++--- regtest.sh | 6 +- setup.sh | 3 +- test.sh | 2 +- tests/{compare_dist.R => analyze.R} | 15 +- tests/{compare_dist_test.R => analyze_test.R} | 2 +- 7 files changed, 279 insertions(+), 46 deletions(-) create mode 100644 analysis/R/analysis_lib.R rename tests/{compare_dist.R => analyze.R} (95%) rename tests/{compare_dist_test.R => analyze_test.R} (97%) diff --git a/analysis/R/analysis_lib.R b/analysis/R/analysis_lib.R new file mode 100644 index 00000000..ddb2c0a0 --- /dev/null +++ b/analysis/R/analysis_lib.R @@ -0,0 +1,90 @@ +# Copyright 2014 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +GetFN <- function(name) { + # Helper function to strip extension from the filename. + strsplit(basename(name), ".", fixed = TRUE)[[1]][1] +} + +ValidateInput <- function(params, counts, map) { + val <- "valid" + if (is.null(counts)) { + val <- "No counts file found. Skipping" + return(val) + } + + if (nrow(map) != (params$m * params$k)) { + val <- paste("Map does not match the counts file!", + "mk = ", params$m * params$k, + "nrow(map):", nrow(map), + collapse = " ") + } + + if ((ncol(counts) - 1) != params$k) { + val <- paste("Dimensions of counts file do not match:", + "m =", params$m, "counts rows: ", nrow(counts), + "k =", params$k, "counts cols: ", ncol(counts) - 1, + collapse = " ") + } + + # numerically correct comparison + if(isTRUE(all.equal((1 - params$f) * (params$p - params$q), 0))) + stop("Information is lost. Cannot decode.") + + val +} + +AnalyzeRAPPOR <- function(params, counts, map, correction, alpha, + experiment_name = "", map_name = "", config_name = "", + date = NULL, date_num = NULL, ...) { + val <- ValidateInput(params, counts, map) + if (val != "valid") { + cat(val, "\n") + return(NULL) + } + + cat("Sample Size: ", sum(counts[, 1]), "\n", + "Number of cohorts: ", nrow(counts), "\n", sep = "") + + fit <- Decode(counts, map, params, correction = correction, + alpha = alpha, ...) 
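+  # Decode() returns a list: fit$fit holds the per-string estimates, while
+  # fit$summary and fit$privacy are (parameter, value) tables; selected rows
+  # are flattened into scalar columns below.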
+
+  res <- fit$fit
+
+  if (nrow(fit$fit) > 0) {
+    res$rank <- 1:nrow(fit$fit)
+    res$detected <- fit$summary[2, 2]
+    res$sample_size <- fit$summary[3, 2]
+    res$detected_prop <- fit$summary[4, 2]
+    res$explained_var <- fit$summary[5, 2]
+    res$missing_var <- fit$summary[6, 2]
+
+    res$exp_e_1 <- fit$privacy[3, 2]
+    res$exp_e_inf <- fit$privacy[5, 2]
+    res$detection_freq <- fit$privacy[7, 2]
+    res$correction <- correction
+    res$alpha <- alpha
+
+    res$experiment <- experiment_name
+    res$map <- map_name
+    res$config <- config_name
+    res$date <- date
+    res$date_num <- date_num
+  }
+  else
+    print("INSUFFICIENT DATA FOR MEANINGFUL ANSWER.")
+
+  res
+}
diff --git a/analysis/R/analysis_tool.R b/analysis/R/analysis_tool.R
index f87b7500..b0acf2df 100755
--- a/analysis/R/analysis_tool.R
+++ b/analysis/R/analysis_tool.R
@@ -1,11 +1,23 @@
 #!/usr/bin/env Rscript
 #
-# Command line tool to decode a RAPPOR data set. It is a simple wrapper for
-# Decode() in decode.R.
+# IMPORTANT: This script should be deleted when the analysis server is ready.
+#
+# Generates the daily Chrome RAPPOR feed. All experiments found in the
+# experiment_config flag are processed for each day.
+#
+# If neither start_date nor end_date is specified, the feed runs on
+# yesterday's data by default. Otherwise, a date range can be specified
+# to run a backfill.
+#
+# num_days indicates how many trailing days to aggregate over. If num_days =
+# 7, a 7-day trailing analysis (combining data from the most recent 7 days)
+# is performed for each date (by default, only for yesterday). Use 7 days
+# for weekly and 28 days for monthly analyses.

 library(optparse)
 library(RJSONIO)

+source("analysis/R/analysis_lib.R")
 source("analysis/R/read_input.R")
 source("analysis/R/decode.R")
 source("analysis/R/util.R")
@@ -17,15 +29,30 @@ options(stringsAsFactors = FALSE)

 if (!interactive()) {
   option_list <- list(
     # Flags.
+    make_option("--start_date", default="", help="First date to process. Format: %Y/%m/%d"),
+    make_option("--end_date", default="", help="Last date to process"),
+    make_option("--num_days", default=1, help="Number of trailing days to process"),
+
+    make_option("--experiment_config", default="chrome_rappor_experiments.csv",
+                help="Experiment config file"),
     make_option("--map", default="MA", help="Map file"),
     make_option("--counts", default="CO", help="Counts file"),
     # TODO: Rename this to --params
     make_option("--config", default="", help="Config file"),
-    make_option("--output_dir", default="./", help="Output directory"),
     make_option("--correction", default="FDR", help="Correction method"),
-    make_option("--alpha", default=.05, help="Alpha level")
+    make_option("--alpha", default=.05, help="Alpha level"),
+
+    # Input and output directories.
+    make_option("--counts_dir", default="/cns/pa-d/home/chrome-rappor/daily",
+                help="CNS directory where counts files are located"),
+    make_option("--map_dir", default="/cns/pa-d/home/chrome-rappor/map",
+                help="CNS map directory"),
+    make_option("--config_dir", default="/cns/pa-d/home/chrome-rappor/config",
+                help="CNS configuration files directory"),
+    make_option("--release_dir", default="/cns/pa-d/home/chrome-rappor/release",
+                help="Output directory")
   )
   # NOTE: This API is bad; if you add positional_arguments, the return value changes!
   opts <- parse_args(OptionParser(option_list = option_list))
@@ -42,35 +69,7 @@ AdjustCounts <- function(counts, params) {
   })
 }

-ValidateInput <- function(params, counts, map) {
-  val <- "valid"
-  if (is.null(counts)) {
-    val <- "No counts file found.
Skipping"
-    return(val)
-  }
-
-  if (nrow(map) != (params$m * params$k)) {
-    val <- paste("Map does not match the counts file!",
-                 "mk = ", params$m * params$k,
-                 "nrow(map):", nrow(map),
-                 collapse = " ")
-  }
-
-  if ((ncol(counts) - 1) != params$k) {
-    val <- paste("Dimensions of counts file do not match:",
-                 "m =", params$m, "counts rows: ", nrow(counts),
-                 "k =", params$k, "counts cols: ", ncol(counts) - 1,
-                 collapse = " ")
-  }
-
-  # numerically correct comparison
-  if(isTRUE(all.equal((1 - params$f) * (params$p - params$q), 0)))
-    stop("Information is lost. Cannot decode.")
-
-  val
-}
-
-main <- function(opts) {
+RunOne <- function(opts) {
   # Run a single model if all inputs are specified.
   params <- ReadParameterFile(opts$config)
   counts <- ReadCountsFile(opts$counts)
@@ -126,6 +125,148 @@ main <- function(opts) {
   Log('DONE')
 }

+# Run multiple models. There is a CSV experiments config file, and we invoke
+# AnalyzeRAPPOR once for each row in it.
+RunMany <- function(opts) {
+
+  # If the date is not specified, run yesterday's analyses only.
+  if (opts$start_date == "" && opts$end_date == "") {
+    start_date <- Sys.Date() - 1
+    end_date <- start_date
+  } else {
+    start_date <- as.Date(opts$start_date)
+    end_date <- as.Date(opts$end_date)
+    if (end_date < start_date) {
+      stop("End date must be greater than or equal to start date!")
+    }
+  }
+  dates <- as.character(seq(start_date, end_date, 1))
+
+  # List of experiments to analyze.
+  config_path = file.path(opts$config_dir, opts$experiment_config)
+  Log('Reading experiment config %s', config_path)
+  experiments <- read.csv(config_path,
+                          header = FALSE, as.is = TRUE,
+                          colClasses = "character", comment.char = "#")
+
+  for (date in dates) {
+    Log("Date: %s", date)
+    date <- as.Date(date)
+    year <- format(date, "%Y")
+    month <- format(date, "%m")
+    day <- format(date, "%d")
+
+    # Create an output directory.
+    output_dir <- file.path(opts$release_dir, year, month, day)
+    dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)
+
+    base_filename <- "chrome_rappor_experiments_"
+    output_filename <-
+      switch(as.character(opts$num_days),
+             "1" = paste0(base_filename, date, ".csv"),
+             "7" = paste0("weekly_", base_filename, date, ".csv"),
+             "28" = paste0("monthly_", base_filename, date, ".csv"),
+             paste0(opts$num_days, "_", base_filename, date, ".csv"))
+
+    # Delete any existing files with the same name.
+    unlink(file.path(output_dir, output_filename))
+
+    res <- vector("list", nrow(experiments))
+    for (i in 1:nrow(experiments)) {
+      cat(paste0("Experiment ", i, " (of ",
+                 nrow(experiments), "): ", experiments[i, 2], "\n"))
+
+      # Process a line in the experiments file.
+      experiment_name <- experiments[i, 2]
+      map_file <- experiments[i, 3]
+      config_file <- experiments[i, 4]
+
+      # Read in input files specified in the experiments file.
+      params_path = file.path(opts$config_dir, config_file)
+      Log('Reading params %s', params_path)
+      config <- ReadParameterFile(params_path)
+
+      map_path <- file.path(opts$map_dir, map_file)
+      Log('Loading map %s', map_path)
+      LoadMapFile(map_path)  # Loads the "map" object.
+
+      # Read one or more counts files.
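+      # Counts for each trailing date are passed through AdjustCounts() and
+      # then summed elementwise (Reduce below), so the decoding step sees a
+      # single aggregated counts matrix covering the whole num_days window.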
+ counts_file <- paste0(experiments[i, 1], "_counts.csv") + trailing_dates <- as.character(seq(date - opts$num_days + 1, date, 1)) + counts_list <- list() + for (j in 1:length(trailing_dates)) { + counts_path = file.path(opts$counts_dir, trailing_dates[j], counts_file) + Log("Reading counts %s", counts_path) + + counts_j <- ReadCountsFile(file.path(opts$counts_dir, + trailing_dates[j], counts_file)) + if (!is.null(counts_j)) { + counts_list[[j]] <- AdjustCounts(counts_j, config) + } + } + counts <- Reduce("+", counts_list) # Turn list into matrix + + # Perform the analysis. + + Log("CONFIG") + str(config) + cat('\n') + + Log("COUNTS") + str(counts) + cat('\n') + + Log("MAP") + str(map$map) + cat('\n') + + exp_res <- AnalyzeRAPPOR(config, counts, map$map, opts$correction, opts$alpha, + experiment_name = experiment_name, + map_name = map_file, + config_name = config_file, + date = as.character(date), + date_num = as.numeric(format(date, "%Y%m%d"))) + + if (!is.null(exp_res)) { + res[[i]] <- exp_res + cat("Discovered:", nrow(exp_res), "\n\n") + } else { + cat("Discovered: 0\n\n") + } + } + + # Write out a single column IO file for each date. + output <- do.call("rbind", res) + if (!is.null(output) && nrow(output) > 0) { + path = file.path(output_dir, output_filename) + write.csv(output, path) + Log('Wrote %s', path) + } + + Log("RES") + str(res) + cat('\n') + + Log("OUTPUT") + str(output) + cat('\n') + + Log("sum(proportion)") + print(sum(output$proportion)) + + Log("sum(estimate)") + print(sum(output$estimate)) + } +} + +main = function(opts) { + if (opts$counts != "" && opts$map != "" && opts$config != "") { + RunOne(opts) + } else { + RunMany(opts) + } +} + if (!interactive()) { main(opts) } diff --git a/regtest.sh b/regtest.sh index ee813eeb..2430edb1 100755 --- a/regtest.sh +++ b/regtest.sh @@ -206,11 +206,11 @@ _run-one-instance() { # Currently, the summary file shows and aggregates timing of the inference # engine, which excludes R's loading time and reading of the (possibly # substantial) map file. Timing below is more inclusive. - TIMEFORMAT='Running compare_dist.R took %R seconds' + TIMEFORMAT='Running analyze.R took %R seconds' time { # Input prefix, output dir - tests/compare_dist.R -t "Test case: $test_case (instance $test_instance)" \ - "$case_dir/case" "$instance_dir/case" $out_dir + tests/analyze.R -t "Test case: $test_case (instance $test_instance)" \ + "$case_dir/case" "$instance_dir/case" $out_dir } } diff --git a/setup.sh b/setup.sh index 48b77633..729c486f 100755 --- a/setup.sh +++ b/setup.sh @@ -18,10 +18,9 @@ native-packages() { # - build-essential for gcc compilers, invoked while installing R packages. # - gfortran Fortran compiler needed for glmnet. # - libblas-dev needed for limSolve. - # - python-dev is for building the fastrand extension # # NOTE: we get R 3.0.2 on Trusty. 
-  sudo apt-get install build-essential gfortran libblas-dev r-base python-dev
+  sudo apt-get install build-essential gfortran libblas-dev r-base
 }

 r-packages() {
diff --git a/test.sh b/test.sh
index 3b5506f2..aa701816 100755
--- a/test.sh
+++ b/test.sh
@@ -106,7 +106,7 @@ r-unit() {
   set -o xtrace  # show tests we're running

   # This one needs to be run from the root dir
-  tests/compare_dist_test.R
+  tests/analyze_test.R

   tests/gen_counts_test.R

diff --git a/tests/compare_dist.R b/tests/analyze.R
similarity index 95%
rename from tests/compare_dist.R
rename to tests/analyze.R
index 27b80973..9f079405 100755
--- a/tests/compare_dist.R
+++ b/tests/analyze.R
@@ -47,6 +47,7 @@ if (library(Cairo, quietly = TRUE, logical.return = TRUE)) {
   cat('Using png\n')
 }

+source("analysis/R/analysis_lib.R")
 source("analysis/R/read_input.R")
 source("analysis/R/decode.R")
 source("analysis/R/util.R")
@@ -80,23 +81,25 @@ RunRappor <- function(prefix_case, prefix_instance, ctx) {
   m <- paste0(prefix_case, '_map.csv')
   map <- ReadMapFile(m)  # Switch to LoadMapFile if you want to cache the result

-  # Main decode.R API
+
   timing <- system.time({
-    res <- Decode(counts, map$map, ctx$params)
+    # Calls AnalyzeRAPPOR to run the analysis code
+    rappor <- AnalyzeRAPPOR(ctx$params, counts, map$map, "FDR", 0.05,
+                            date="01/01/01", date_num="100001")
   })

   # The line is searched for, and elapsed time is extracted, by make_summary.py.
   # Should the formatting or wording change, make_summary must be updated too.
   Log("Inference took %.3f seconds", timing[["elapsed"]])

-  if (is.null(res)) {
+  if (is.null(rappor)) {
     stop("RAPPOR analysis failed.")
   }

-  Log("Decoded results:")
-  str(res$fit)
+  Log("Analysis Results:")
+  str(rappor)

-  res$fit
+  rappor
 }

 LoadActual <- function(prefix_instance) {
diff --git a/tests/compare_dist_test.R b/tests/analyze_test.R
similarity index 97%
rename from tests/compare_dist_test.R
rename to tests/analyze_test.R
index e67c95f1..5d3a09fa 100755
--- a/tests/compare_dist_test.R
+++ b/tests/analyze_test.R
@@ -16,7 +16,7 @@

 library(RUnit)

-source('tests/compare_dist.R')
+source('tests/analyze.R')

 TestProcessAll <- function() {
   ctx <- new.env()

From 4f0bb85dcf3fcb0c5395c9bc767408918ee51466 Mon Sep 17 00:00:00 2001
From: Ilya Mironov
Date: Mon, 13 Jul 2015 16:10:49 -0700
Subject: [PATCH 02/14] FitLasso to handle the case of a single variable correctly.

---
 analysis/R/decode.R | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/analysis/R/decode.R b/analysis/R/decode.R
index 479c8a2c..a081ac51 100644
--- a/analysis/R/decode.R
+++ b/analysis/R/decode.R
@@ -102,16 +102,23 @@ FitLasso <- function(X, Y, intercept = TRUE) {
   # is to avoid overfitting.
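+  # Note: glmnet's pmax argument limits how many variables may ever become
+  # non-zero along the regularization path, so this cap bounds the model size.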
cap <- min(500, nrow(X) * .8, ncol(X)) - mod <- glmnet(X, Y, standardize = FALSE, intercept = intercept, + if (ncol(X) == 1) + XX <- cbind2(X, rep(0, nrow(X))) # add a dummy variable since glmnet can't handle a single-column matrix + else + XX <- X + + mod <- glmnet(XX, Y, standardize = FALSE, intercept = intercept, lower.limits = 0, # outputs are non-negative pmax = cap) coefs <- coef(mod) coefs <- coefs[-1, , drop = FALSE] # drop the intercept l1cap <- sum(colSums(coefs) <= 1.0) # find all columns with L1 norm <= 1 - if(l1cap > 0) + if (l1cap > 0) { distr <- coefs[, l1cap] # return the last set of coefficients with L1 <= 1 - else + if (ncol(X) == 1) + distr <- distr[1] # dropping the dummy variable + } else distr <- setNames(rep(0, ncol(X)), colnames(X)) distr } From 11ae8b3e45f0b742c9f6fafcdd440ef852b884cb Mon Sep 17 00:00:00 2001 From: Ilya Mironov Date: Mon, 13 Jul 2015 16:44:32 -0700 Subject: [PATCH 03/14] Merging drop-QP and master. --- analysis/R/analysis_tool.R | 2 -- analysis/R/decode.R | 47 +++++++++++++------------------------- 2 files changed, 16 insertions(+), 33 deletions(-) diff --git a/analysis/R/analysis_tool.R b/analysis/R/analysis_tool.R index 3afc67c3..f87b7500 100755 --- a/analysis/R/analysis_tool.R +++ b/analysis/R/analysis_tool.R @@ -10,8 +10,6 @@ source("analysis/R/read_input.R") source("analysis/R/decode.R") source("analysis/R/util.R") -source("analysis/R/alternative.R") - options(stringsAsFactors = FALSE) # Do command line parsing first to catch errors. Loading libraries in R is diff --git a/analysis/R/decode.R b/analysis/R/decode.R index 1c6fd248..fde54ae2 100644 --- a/analysis/R/decode.R +++ b/analysis/R/decode.R @@ -218,42 +218,27 @@ ComputePrivacyGuarantees <- function(params, alpha, N) { } FitDistribution <- function(estimates_stds, map, quiet = FALSE) { - # Find a distribution over rows of map that approximates estimates_stds best - # - # Input: - # estimates_stds: a list of two m x k matrices, one for estimates, another - # for standard errors - # map : an (m * k) x S boolean matrix - # - # Output: - # a float vector of length S, so that a distribution over map's rows sampled - # according to this vector approximates estimates - - S <- ncol(map) # total number of candidates + # Find a distribution over rows of map that approximates estimates_stds best + # + # Input: + # estimates_stds: a list of two m x k matrices, one for estimates, another + # for standard errors + # map : an (m * k) x S boolean matrix + # + # Output: + # a float vector of length S, so that a distribution over map's rows sampled + # according to this vector approximates estimates - support_coefs <- 1:S + S <- ncol(map) # total number of candidates - if (S > length(estimates_stds$estimates) * .8) { - # the system is close to being underdetermined - lasso <- FitLasso(map, as.vector(t(estimates_stds$estimates))) + lasso <- FitLasso(map, as.vector(t(estimates_stds$estimates))) - # Select non-zero coefficients. 
-    support_coefs <- which(lasso > 0)
+  if(!quiet)
+    cat("LASSO selected ", sum(lasso > 0), " non-zero coefficients.\n")

-    if(!quiet)
-      cat("LASSO selected ", length(support_coefs), " non-zero coefficients.\n")
-  }
-
-  coefs <- setNames(rep(0, S), colnames(map))
-
-  if(length(support_coefs) > 0) {  # LASSO may return an empty list
-    constrained_coefs <- ConstrainedLinModel(map[, support_coefs, drop = FALSE],
-                                             estimates_stds)
-
-    coefs[support_coefs] <- constrained_coefs
-  }
+  names(lasso) <- colnames(map)

-  coefs
+  lasso
 }

 Resample <- function(e) {

From 4d5214bc6a33855ea0c43a0a16090348a1b1febc Mon Sep 17 00:00:00 2001
From: Ilya Mironov
Date: Mon, 13 Jul 2015 16:51:20 -0700
Subject: [PATCH 04/14] Removing alternative.R

---
 tests/compare_dist.R | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tests/compare_dist.R b/tests/compare_dist.R
index 7762482b..27b80973 100755
--- a/tests/compare_dist.R
+++ b/tests/compare_dist.R
@@ -51,8 +51,6 @@ source("analysis/R/read_input.R")
 source("analysis/R/decode.R")
 source("analysis/R/util.R")

-source("analysis/R/alternative.R")  # temporary
-
 LoadContext <- function(prefix_case) {
   # Creates the context, filling it with privacy parameters
   # Arg:

From 04d9769f357f7e42d9aff4f6fd1115bfe89410ae Mon Sep 17 00:00:00 2001
From: Ilya Mironov
Date: Mon, 13 Jul 2015 17:14:31 -0700
Subject: [PATCH 05/14] Restoring gen_counts_test.R

---
 tests/gen_counts_test.R | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/tests/gen_counts_test.R b/tests/gen_counts_test.R
index 49ad3be5..e91de68e 100755
--- a/tests/gen_counts_test.R
+++ b/tests/gen_counts_test.R
@@ -51,7 +51,7 @@ TestGenerateCounts <- function() {
   noise1 <- list(p = .5, q = .5, f = 0)  # truly random IRRs
   counts1 <- GenerateCounts(c(report_params, noise1), map, partition, v)

-  for(i in 2:4)
+  for(i in 2:5)
     for(j in 1:2)
       pvalues <- c(pvalues, chisq.test(c(counts1[j,1] - counts1[j,i],
                                          counts1[j,i]),
@@ -64,14 +64,13 @@

   counts2 <- counts2 / v

-  for(i in 2:4)
+  for(i in 2:5)
     for(j in 1:2)
       pvalues <- c(pvalues, chisq.test(c(counts2[j,1] - counts2[j,i],
                                          counts2[j,i]),
                                        p = c(.5, .5))$p.value)

-  checkTrue(min(pvalues) > 1E-9 && max(pvalues) < 1 - 1E-9,
-            "Chi-squared test failed")
+  checkTrue(min(pvalues) > 1E-9, "Chi-squared test failed")
 }

 TestRandomPartition <- function() {
@@ -97,14 +96,14 @@ TestRandomPartition <- function() {
   p5 <- RandomPartition(total = 1000000, c(1, 2, 3, 4))
   p.value <- chisq.test(p5, p = c(.1, .2, .3, .4))$p.value

-  # Apply the chi squared test and fail if p.value is too high or too low.
-  # Probability of failure is 2 * 1E-9, which should never happen.
-  checkTrue((p.value > 1E-9) && (p.value < 1 - 1E-9))
+  # Apply the chi squared test and fail if p.value is too low.
+  # Probability of failure is 1E-9, which should never happen.
+  checkTrue(p.value > 1E-9)
 }

-TestAll <- function(){
+CheckAll <- function(){
   TestRandomPartition()
   TestGenerateCounts()
 }

-TestAll()
\ No newline at end of file
+CheckAll()
\ No newline at end of file

From 4f3aa3a664b99fb261db91d04a7305b0f2821612 Mon Sep 17 00:00:00 2001
From: Ilya Mironov
Date: Tue, 14 Jul 2015 13:49:49 -0700
Subject: [PATCH 06/14] Fixing reviewer's comments. Functionality of TestDecode is restored.
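
In particular, CheckDecodeAveAndStds in decode_test.R now accumulates the
per-run estimates and stds explicitly: the previous version called lapply()
purely for its side effects, and since R arguments are copy-on-modify, the
matrices returned by MatrixVectorMerge() were silently discarded. A minimal
sketch of the pitfall (xs and m are hypothetical names):

    m <- matrix(nrow = 0, ncol = 0)
    lapply(xs, function(x) MatrixVectorMerge(m, x))  # results are thrown away
    for (x in xs) m <- MatrixVectorMerge(m, x)       # accumulates correctly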
--- analysis/R/decode.R | 40 ++++++++++++++++++++-------------------- analysis/R/decode_test.R | 10 ++++++---- 2 files changed, 26 insertions(+), 24 deletions(-) diff --git a/analysis/R/decode.R b/analysis/R/decode.R index fde54ae2..72680118 100644 --- a/analysis/R/decode.R +++ b/analysis/R/decode.R @@ -218,27 +218,27 @@ ComputePrivacyGuarantees <- function(params, alpha, N) { } FitDistribution <- function(estimates_stds, map, quiet = FALSE) { - # Find a distribution over rows of map that approximates estimates_stds best - # - # Input: - # estimates_stds: a list of two m x k matrices, one for estimates, another - # for standard errors - # map : an (m * k) x S boolean matrix - # - # Output: - # a float vector of length S, so that a distribution over map's rows sampled - # according to this vector approximates estimates + # Find a distribution over rows of map that approximates estimates_stds best + # + # Input: + # estimates_stds: a list of two m x k matrices, one for estimates, another + # for standard errors + # map : an (m * k) x S boolean matrix + # + # Output: + # a float vector of length S, so that a distribution over map's rows sampled + # according to this vector approximates estimates - S <- ncol(map) # total number of candidates + S <- ncol(map) # total number of candidates - lasso <- FitLasso(map, as.vector(t(estimates_stds$estimates))) + lasso <- FitLasso(map, as.vector(t(estimates_stds$estimates))) - if(!quiet) - cat("LASSO selected ", sum(lasso > 0), " non-zero coefficients.\n") + if (!quiet) + cat("LASSO selected ", sum(lasso > 0), " non-zero coefficients.\n") - names(lasso) <- colnames(map) + names(lasso) <- colnames(map) - lasso + lasso } Resample <- function(e) { @@ -279,10 +279,10 @@ Decode <- function(counts, map, params, alpha = 0.05, coefs_all <- vector() - # Run the fitting procedure several times (5 seems to be sufficient and not - # too many) to estimate standard deviation of the output. + # Run the fitting procedure several times (5 seems to be the minimum that + # makes sense) to estimate standard deviation of the output. for(r in 1:10) { - if(r > 1) + if (r > 1) e <- Resample(estimates_stds_filtered) else e <- estimates_stds_filtered @@ -295,7 +295,7 @@ Decode <- function(counts, map, params, alpha = 0.05, coefs_ssd <- N * apply(coefs_all, 2, sd) # compute sample standard deviations coefs_ave <- N * apply(coefs_all, 2, mean) - # Only select coefficients more than two standard deviations from 0. May + # Only select coefficients more than one standard deviation from 0. May # inflate empirical SD of the estimates. reported <- which(coefs_ave > 1E-6 + 1 * coefs_ssd) diff --git a/analysis/R/decode_test.R b/analysis/R/decode_test.R index 71db7aa5..c4fbdfcf 100755 --- a/analysis/R/decode_test.R +++ b/analysis/R/decode_test.R @@ -205,10 +205,12 @@ CheckDecodeAveAndStds <- function(...) { results <- RunMultipleTests(...) estimates <- matrix(nrow = 0, ncol = 0) - lapply(results, function(r) MatrixVectorMerge(estimates, r$estimates)) - stds <- matrix(nrow = 0, ncol = 0) - lapply(results, function(r) MatrixVectorMerge(stds, r$stds)) + + for(i in 1:length(results)) { + estimates <- MatrixVectorMerge(estimates, results[[i]]$estimates) + stds <- MatrixVectorMerge(stds, results[[i]]$stds) + } empirical_stds <- apply(estimates, 2, sd, na.rm = TRUE) estimated_stds <- apply(stds, 2, mean, na.rm = TRUE) @@ -217,7 +219,7 @@ CheckDecodeAveAndStds <- function(...) 
{
   checkTrue(any(estimated_stds > empirical_stds / 2),
             "Our estimate for the standard deviation is too low")

-  checkTrue(any(estimated_stds < empirical_stds * 3),
+  checkTrue(any(estimated_stds < empirical_stds * 2),
             "Our estimate for the standard deviation is too high")
 }
 }

From 6b38d612ed674194f0f811981463fe7b421a73b7 Mon Sep 17 00:00:00 2001
From: Ilya Mironov
Date: Tue, 21 Jul 2015 13:09:19 -0700
Subject: [PATCH 07/14] Adjusting the parameter of the Lasso algorithm, and fixing elapsed time reporting.

---
 analysis/R/decode.R   | 3 ++-
 tests/make_summary.py | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/analysis/R/decode.R b/analysis/R/decode.R
index 72680118..29488448 100644
--- a/analysis/R/decode.R
+++ b/analysis/R/decode.R
@@ -106,8 +106,9 @@
   else
     XX <- X

-  mod <- glmnet(XX, Y, standardize = FALSE, intercept = intercept,
+  mod <- glmnet(XX, Y, standardize = TRUE, intercept = intercept,
                 lower.limits = 0,  # outputs are non-negative
+                lambda.min.ratio = 1E-4,
                 pmax = cap)

   coefs <- coef(mod)
diff --git a/tests/make_summary.py b/tests/make_summary.py
index f8299a04..b55559c6 100755
--- a/tests/make_summary.py
+++ b/tests/make_summary.py
@@ -160,7 +160,7 @@ def ExtractTime(log_filename):
   with open(log_filename) as log:
     log_str = log.read()
   # Matching a line output by analyze.R.
-  match = re.search(r'Running analyze.R took ([0-9.]+) seconds', log_str)
+  match = re.search(r'Inference took ([0-9.]+) seconds', log_str)
   if match:
     return float(match.group(1))
   return None

From 0c76391e41e149349834801b29001af5e013e5b8 Mon Sep 17 00:00:00 2001
From: Ilya Mironov
Date: Tue, 21 Jul 2015 20:50:10 +0000
Subject: [PATCH 08/14] Requiring that the estimated count be at least two SD away from 0.

---
 analysis/R/decode.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/analysis/R/decode.R b/analysis/R/decode.R
index 29488448..558082b8 100644
--- a/analysis/R/decode.R
+++ b/analysis/R/decode.R
@@ -296,9 +296,9 @@ Decode <- function(counts, map, params, alpha = 0.05,
   coefs_ssd <- N * apply(coefs_all, 2, sd)  # compute sample standard deviations
   coefs_ave <- N * apply(coefs_all, 2, mean)

-  # Only select coefficients more than one standard deviation from 0. May
+  # Only select coefficients more than two standard deviations from 0. May
   # inflate empirical SD of the estimates.
-  reported <- which(coefs_ave > 1E-6 + 1 * coefs_ssd)
+  reported <- which(coefs_ave > 1E-6 + 2 * coefs_ssd)

   mod <- list(coefs = coefs_ave[reported], stds = coefs_ssd[reported])

From f4b722268d40149359c27d1305bb669edf47f930 Mon Sep 17 00:00:00 2001
From: Ilya Mironov
Date: Wed, 22 Jul 2015 12:06:10 -0700
Subject: [PATCH 10/14] Deleting obsolete code stemming from PerformInference calls, shaving off tens of seconds from each run.

---
 analysis/R/decode.R | 39 +++++++--------------------------------
 1 file changed, 7 insertions(+), 32 deletions(-)

diff --git a/analysis/R/decode.R b/analysis/R/decode.R
index 3da7b016..e0002e57 100644
--- a/analysis/R/decode.R
+++ b/analysis/R/decode.R
@@ -302,15 +302,11 @@ Decode <- function(counts, map, params, alpha = 0.05,

   mod <- list(coefs = coefs_ave[reported], stds = coefs_ssd[reported])

-  if (correction == "Bonferroni") {
-    alpha <- alpha / S
-  }
+  fit <- data.frame(string = colnames(map[, reported]),
+                    Estimate = matrix(mod$coefs, ncol = 1),
+                    SD = mod$stds,
+                    stringsAsFactors = FALSE)

-  inf <- PerformInference(map[filter_bits, reported, drop = FALSE],
-                          as.vector(t(estimates_stds_filtered$estimates)),
-                          N, mod, params, alpha,
-                          correction)
-  fit <- inf$fit
   # If this is a basic RAPPOR instance, just use the counts for the estimate
   # (Check if the map is diagonal to tell if this is basic RAPPOR.)
   if (sum(map) == sum(diag(map))) {
@@ -338,38 +334,17 @@ Decode <- function(counts, map, params, alpha = 0.05,
   allocated_mass <- sum(fit$proportion)
   num_detected <- nrow(fit)

-  ss <- round(inf$SS, digits = 3)
-  explained_var <- ss[[1]]
-  missing_var <- ss[[2]]
-  noise_var <- ss[[3]]
-
-  noise_std_dev <- round(inf$resid_sigma, digits = 3)
-
-  # Compute summary of the fit.
-  parameters <-
-    c("Candidate strings", "Detected strings",
-      "Sample size (N)", "Discovered Prop (out of N)",
-      "Explained Variance", "Missing Variance", "Noise Variance",
-      "Theoretical Noise Std. Dev.")
-  values <- c(S, num_detected, N, allocated_mass,
-              explained_var, missing_var, noise_var, noise_std_dev)
-
-  res_summary <- data.frame(parameters = parameters, values = values)
-
   privacy <- ComputePrivacyGuarantees(params, alpha, N)
   params <- data.frame(parameters = c("k", "h", "m", "p", "q", "f", "N", "alpha"),
                        values = c(k, h, m, p, q, f, N, alpha))

-  # This is a list of decode stats in a better format than 'summary'.
-  # TODO: Delete summary.
metrics <- list(sample_size = N, allocated_mass = allocated_mass, - num_detected = num_detected, - explained_var = explained_var, - missing_var = missing_var) + num_detected = num_detected + ) - list(fit = fit, summary = res_summary, privacy = privacy, params = params, + list(fit = fit, privacy = privacy, params = params, lasso = NULL, ests = as.vector(t(estimates_stds_filtered$estimates)), counts = counts[, -1], resid = NULL, metrics = metrics) } From e27ccc0ea8d5663f07526888b0b1cd8cb0fcdce6 Mon Sep 17 00:00:00 2001 From: Ilya Mironov Date: Thu, 13 Aug 2015 13:58:30 -0700 Subject: [PATCH 11/14] Another drop = FALSE error --- analysis/R/decode.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/analysis/R/decode.R b/analysis/R/decode.R index 52e231a7..ec09e54c 100644 --- a/analysis/R/decode.R +++ b/analysis/R/decode.R @@ -302,7 +302,7 @@ Decode <- function(counts, map, params, alpha = 0.05, mod <- list(coefs = coefs_ave[reported], stds = coefs_ssd[reported]) - fit <- data.frame(string = colnames(map[, reported]), + fit <- data.frame(string = colnames(map[, reported, drop = FALSE]), Estimate = matrix(mod$coefs, ncol = 1), SD = mod$stds, stringsAsFactors = FALSE) From fd62d970930117e6c79ed445e2da8122c476c61c Mon Sep 17 00:00:00 2001 From: Ilya Mironov Date: Wed, 9 Sep 2015 11:55:03 -0700 Subject: [PATCH 12/14] Dropping reference to alternative.R --- analysis/R/decode.R | 2 -- bin/decode_dist.R | 2 -- 2 files changed, 4 deletions(-) diff --git a/analysis/R/decode.R b/analysis/R/decode.R index 1b663a12..190e0634 100644 --- a/analysis/R/decode.R +++ b/analysis/R/decode.R @@ -23,8 +23,6 @@ source.rappor <- function(rel_path) { source(abs_path) } -source.rappor('analysis/R/alternative.R') - EstimateBloomCounts <- function(params, obs_counts) { # Estimates the number of times each bit in each cohort was set in original # Bloom filters. diff --git a/bin/decode_dist.R b/bin/decode_dist.R index f61e155b..6ed38b18 100755 --- a/bin/decode_dist.R +++ b/bin/decode_dist.R @@ -16,8 +16,6 @@ source.rappor("analysis/R/read_input.R") source.rappor("analysis/R/decode.R") source.rappor("analysis/R/util.R") -source.rappor("analysis/R/alternative.R") - options(stringsAsFactors = FALSE) # Do command line parsing first to catch errors. Loading libraries in R is From 7e4d00be535846e0b0ce57e2d8503e256695fc6a Mon Sep 17 00:00:00 2001 From: Ilya Mironov Date: Thu, 10 Sep 2015 18:46:55 -0700 Subject: [PATCH 13/14] Changing the cutoff from two SD to one for outputting a candidate string. --- analysis/R/decode.R | 37 ++++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/analysis/R/decode.R b/analysis/R/decode.R index 190e0634..1fa40a3b 100644 --- a/analysis/R/decode.R +++ b/analysis/R/decode.R @@ -102,10 +102,10 @@ FitLasso <- function(X, Y, intercept = TRUE) { # TODO(mironov): Test cv.glmnet instead of glmnet - # Cap the number of non-zero coefficients to 500 or 80% of the number of - # constraints, whichever is less. The 500 cap is for performance reasons, 80% + # Cap the number of non-zero coefficients to 1,000 or 80% of the number of + # constraints, whichever is less. The 1,000 cap is for performance reasons, 80% # is to avoid overfitting. 
-  cap <- min(500, nrow(X) * .8, ncol(X))
+  cap <- min(1000, nrow(X) * .8, ncol(X))

   if (ncol(X) == 1)
     XX <- cbind2(X, rep(0, nrow(X)))  # add a dummy variable since glmnet can't handle a single-column matrix
@@ -224,21 +224,18 @@ ComputePrivacyGuarantees <- function(params, alpha, N) {
   privacy
 }

-FitDistribution <- function(estimates_stds, map, quiet = FALSE) {
+FitDistribution <- function(estimates, map, quiet = FALSE) {
   # Find a distribution over rows of map that approximates estimates_stds best
   #
   # Input:
-  #   estimates_stds: a list of two m x k matrices, one for estimates, another
-  #                   for standard errors
-  #   map : an (m * k) x S boolean matrix
+  #   estimates: an m x k matrix of estimates
+  #   map : an (m * k) x S boolean matrix
   #
   # Output:
-  #   a float vector of length S, so that a distribution over map's rows sampled
-  #   according to this vector approximates estimates
+  #   an S-long vector of non-negative floats, so that a distribution over
+  #   the map's rows sampled according to this vector approximates estimates

-  S <- ncol(map)  # total number of candidates
-
-  lasso <- FitLasso(map, as.vector(t(estimates_stds$estimates)))
+  lasso <- FitLasso(map, estimates)

   if (!quiet)
     cat("LASSO selected ", sum(lasso > 0), " non-zero coefficients.\n")
@@ -259,7 +256,7 @@ Resample <- function(e) {
   list(estimates = estimates, stds = stds)
 }

-Decode <- function(counts, map, params, alpha = 0.05,
+Decode <- function(counts, map, params, alpha = 0.05, iterations = 20,
                    correction = c("Bonferroni"), quiet = FALSE, ...) {
   k <- params$k
   p <- params$p
@@ -288,23 +285,29 @@

   # Run the fitting procedure several times (5 seems to be the minimum that
   # makes sense) to estimate standard deviation of the output.
-  for(r in 1:10) {
+  groups <- sample(1:iterations, length(filter_bits), replace=T)
+
+  for(r in 1:iterations) {
     if (r > 1)
       e <- Resample(estimates_stds_filtered)
     else
       e <- estimates_stds_filtered

+    e_vec <- as.vector(t(e$estimates))
+    subset <- filter_bits[groups != r]
+
     coefs_all <- rbind(coefs_all,
-                       FitDistribution(e, map[filter_bits, , drop = FALSE],
+                       FitDistribution(e_vec[subset],
+                                       map[subset, , drop = FALSE],
                                        quiet))
   }

   coefs_ssd <- N * apply(coefs_all, 2, sd)  # compute sample standard deviations
   coefs_ave <- N * apply(coefs_all, 2, mean)

-  # Only select coefficients more than two standard deviations from 0. May
+  # Only select coefficients more than one standard deviation from 0. May
   # inflate empirical SD of the estimates.
-  reported <- which(coefs_ave > 1E-6 + 2 * coefs_ssd)
+  reported <- which(coefs_ave > 1E-6 + 1 * coefs_ssd)

   mod <- list(coefs = coefs_ave[reported], stds = coefs_ssd[reported])

From 60c64ea194917df5d5879c19a48e5ae5fd8da26f Mon Sep 17 00:00:00 2001
From: Ilya Mironov
Date: Fri, 11 Sep 2015 21:16:26 +0000
Subject: [PATCH 14/14] Changing the number of iterations from 20 to 10.

---
 analysis/R/decode.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/analysis/R/decode.R b/analysis/R/decode.R
index 1fa40a3b..7d651145 100644
--- a/analysis/R/decode.R
+++ b/analysis/R/decode.R
@@ -256,7 +256,7 @@ Resample <- function(e) {
   list(estimates = estimates, stds = stds)
 }

-Decode <- function(counts, map, params, alpha = 0.05, iterations = 20,
+Decode <- function(counts, map, params, alpha = 0.05, iterations = 10,
                    correction = c("Bonferroni"), quiet = FALSE, ...) {
   k <- params$k
   p <- params$p