From 6e0e7c639896d032854306c019da9bcb7afb5e03 Mon Sep 17 00:00:00 2001 From: Ilya Mironov Date: Mon, 13 Jul 2015 14:43:55 -0700 Subject: [PATCH 01/14] Revert "Merge branch 'master' of github.com:google/rappor into drop-QP" This reverts commit 408ed08240d1401c639085d7bb64c720676316ea, reversing changes made to 5ab1f4f15775a8e2c2ed25560780778517fd604a. --- analysis/R/analysis_lib.R | 90 ++++++++ analysis/R/analysis_tool.R | 207 +++++++++++++++--- regtest.sh | 6 +- setup.sh | 3 +- test.sh | 2 +- tests/{compare_dist.R => analyze.R} | 15 +- tests/{compare_dist_test.R => analyze_test.R} | 2 +- 7 files changed, 279 insertions(+), 46 deletions(-) create mode 100644 analysis/R/analysis_lib.R rename tests/{compare_dist.R => analyze.R} (95%) rename tests/{compare_dist_test.R => analyze_test.R} (97%) diff --git a/analysis/R/analysis_lib.R b/analysis/R/analysis_lib.R new file mode 100644 index 00000000..ddb2c0a0 --- /dev/null +++ b/analysis/R/analysis_lib.R @@ -0,0 +1,90 @@ +# Copyright 2014 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +GetFN <- function(name) { + # Helper function to strip extension from the filename. + strsplit(basename(name), ".", fixed = TRUE)[[1]][1] +} + +ValidateInput <- function(params, counts, map) { + val <- "valid" + if (is.null(counts)) { + val <- "No counts file found. Skipping" + return(val) + } + + if (nrow(map) != (params$m * params$k)) { + val <- paste("Map does not match the counts file!", + "mk = ", params$m * params$k, + "nrow(map):", nrow(map), + collapse = " ") + } + + if ((ncol(counts) - 1) != params$k) { + val <- paste("Dimensions of counts file do not match:", + "m =", params$m, "counts rows: ", nrow(counts), + "k =", params$k, "counts cols: ", ncol(counts) - 1, + collapse = " ") + } + + # numerically correct comparison + if(isTRUE(all.equal((1 - params$f) * (params$p - params$q), 0))) + stop("Information is lost. Cannot decode.") + + val +} + +AnalyzeRAPPOR <- function(params, counts, map, correction, alpha, + experiment_name = "", map_name = "", config_name = "", + date = NULL, date_num = NULL, ...) { + val <- ValidateInput(params, counts, map) + if (val != "valid") { + cat(val, "\n") + return(NULL) + } + + cat("Sample Size: ", sum(counts[, 1]), "\n", + "Number of cohorts: ", nrow(counts), "\n", sep = "") + + fit <- Decode(counts, map, params, correction = correction, + alpha = alpha, ...) 
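+  # Decode() returns a list: fit$fit holds the per-string estimates, while
+  # fit$summary and fit$privacy are (parameter, value) tables; selected rows
+  # are flattened into scalar columns below.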
+
+  res <- fit$fit
+
+  if (nrow(fit$fit) > 0) {
+    res$rank <- 1:nrow(fit$fit)
+    res$detected <- fit$summary[2, 2]
+    res$sample_size <- fit$summary[3, 2]
+    res$detected_prop <- fit$summary[4, 2]
+    res$explained_var <- fit$summary[5, 2]
+    res$missing_var <- fit$summary[6, 2]
+
+    res$exp_e_1 <- fit$privacy[3, 2]
+    res$exp_e_inf <- fit$privacy[5, 2]
+    res$detection_freq <- fit$privacy[7, 2]
+    res$correction <- correction
+    res$alpha <- alpha
+
+    res$experiment <- experiment_name
+    res$map <- map_name
+    res$config <- config_name
+    res$date <- date
+    res$date_num <- date_num
+  }
+  else
+    print("INSUFFICIENT DATA FOR MEANINGFUL ANSWER.")
+
+  res
+}
diff --git a/analysis/R/analysis_tool.R b/analysis/R/analysis_tool.R
index f87b7500..b0acf2df 100755
--- a/analysis/R/analysis_tool.R
+++ b/analysis/R/analysis_tool.R
@@ -1,11 +1,23 @@
 #!/usr/bin/env Rscript
 #
-# Command line tool to decode a RAPPOR data set. It is a simple wrapper for
-# Decode() in decode.R.
+# IMPORTANT: This script should be deleted when the analysis server is ready.
+#
+# Generates the daily Chrome RAPPOR feed. All experiments found in the
+# experiment_config flag are processed for each day.
+#
+# If neither start_date nor end_date is specified, the feed runs on
+# yesterday's data by default. Otherwise, a date range can be specified
+# to run a backfill.
+#
+# num_days indicates how many trailing days to aggregate over. If num_days =
+# 7, a 7-day trailing analysis (combining data from the most recent 7 days)
+# is performed for each date (by default, only for yesterday). Use 7 days
+# for weekly and 28 days for monthly analyses.

 library(optparse)
 library(RJSONIO)

+source("analysis/R/analysis_lib.R")
 source("analysis/R/read_input.R")
 source("analysis/R/decode.R")
 source("analysis/R/util.R")
@@ -17,15 +29,30 @@ options(stringsAsFactors = FALSE)

 if (!interactive()) {
   option_list <- list(
     # Flags.
+    make_option("--start_date", default="", help="First date to process. Format: %Y/%m/%d"),
+    make_option("--end_date", default="", help="Last date to process"),
+    make_option("--num_days", default=1, help="Number of trailing days to process"),
+
+    make_option("--experiment_config", default="chrome_rappor_experiments.csv",
+                help="Experiment config file"),
     make_option("--map", default="MA", help="Map file"),
     make_option("--counts", default="CO", help="Counts file"),
     # TODO: Rename this to --params
     make_option("--config", default="", help="Config file"),
-    make_option("--output_dir", default="./", help="Output directory"),
     make_option("--correction", default="FDR", help="Correction method"),
-    make_option("--alpha", default=.05, help="Alpha level")
+    make_option("--alpha", default=.05, help="Alpha level"),
+
+    # Input and output directories.
+    make_option("--counts_dir", default="/cns/pa-d/home/chrome-rappor/daily",
+                help="CNS directory where counts files are located"),
+    make_option("--map_dir", default="/cns/pa-d/home/chrome-rappor/map",
+                help="CNS map directory"),
+    make_option("--config_dir", default="/cns/pa-d/home/chrome-rappor/config",
+                help="CNS configuration files directory"),
+    make_option("--release_dir", default="/cns/pa-d/home/chrome-rappor/release",
+                help="Output directory")
   )
   # NOTE: This API is bad; if you add positional_arguments, the return value changes!
   opts <- parse_args(OptionParser(option_list = option_list))
@@ -42,35 +69,7 @@ AdjustCounts <- function(counts, params) {
   })
 }

-ValidateInput <- function(params, counts, map) {
-  val <- "valid"
-  if (is.null(counts)) {
-    val <- "No counts file found.
Skipping"
-    return(val)
-  }
-
-  if (nrow(map) != (params$m * params$k)) {
-    val <- paste("Map does not match the counts file!",
-                 "mk = ", params$m * params$k,
-                 "nrow(map):", nrow(map),
-                 collapse = " ")
-  }
-
-  if ((ncol(counts) - 1) != params$k) {
-    val <- paste("Dimensions of counts file do not match:",
-                 "m =", params$m, "counts rows: ", nrow(counts),
-                 "k =", params$k, "counts cols: ", ncol(counts) - 1,
-                 collapse = " ")
-  }
-
-  # numerically correct comparison
-  if(isTRUE(all.equal((1 - params$f) * (params$p - params$q), 0)))
-    stop("Information is lost. Cannot decode.")
-
-  val
-}
-
-main <- function(opts) {
+RunOne <- function(opts) {
   # Run a single model if all inputs are specified.
   params <- ReadParameterFile(opts$config)
   counts <- ReadCountsFile(opts$counts)
@@ -126,6 +125,148 @@ main <- function(opts) {
   Log('DONE')
 }

+# Run multiple models. There is a CSV experiments config file, and we invoke
+# AnalyzeRAPPOR once for each row in it.
+RunMany <- function(opts) {
+
+  # If the date is not specified, run yesterday's analyses only.
+  if (opts$start_date == "" && opts$end_date == "") {
+    start_date <- Sys.Date() - 1
+    end_date <- start_date
+  } else {
+    start_date <- as.Date(opts$start_date)
+    end_date <- as.Date(opts$end_date)
+    if (end_date < start_date) {
+      stop("End date must be greater than or equal to start date!")
+    }
+  }
+  dates <- as.character(seq(start_date, end_date, 1))
+
+  # List of experiments to analyze.
+  config_path = file.path(opts$config_dir, opts$experiment_config)
+  Log('Reading experiment config %s', config_path)
+  experiments <- read.csv(config_path,
+                          header = FALSE, as.is = TRUE,
+                          colClasses = "character", comment.char = "#")
+
+  for (date in dates) {
+    Log("Date: %s", date)
+    date <- as.Date(date)
+    year <- format(date, "%Y")
+    month <- format(date, "%m")
+    day <- format(date, "%d")
+
+    # Create an output directory.
+    output_dir <- file.path(opts$release_dir, year, month, day)
+    dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)
+
+    base_filename <- "chrome_rappor_experiments_"
+    output_filename <-
+      switch(as.character(opts$num_days),
+             "1" = paste0(base_filename, date, ".csv"),
+             "7" = paste0("weekly_", base_filename, date, ".csv"),
+             "28" = paste0("monthly_", base_filename, date, ".csv"),
+             paste0(opts$num_days, "_", base_filename, date, ".csv"))
+
+    # Delete any existing files with the same name.
+    unlink(file.path(output_dir, output_filename))
+
+    res <- vector("list", nrow(experiments))
+    for (i in 1:nrow(experiments)) {
+      cat(paste0("Experiment ", i, " (of ",
+                 nrow(experiments), "): ", experiments[i, 2], "\n"))
+
+      # Process a line in the experiments file.
+      experiment_name <- experiments[i, 2]
+      map_file <- experiments[i, 3]
+      config_file <- experiments[i, 4]
+
+      # Read in input files specified in the experiments file.
+      params_path = file.path(opts$config_dir, config_file)
+      Log('Reading params %s', params_path)
+      config <- ReadParameterFile(params_path)
+
+      map_path <- file.path(opts$map_dir, map_file)
+      Log('Loading map %s', map_path)
+      LoadMapFile(map_path)  # Loads the "map" object.
+
+      # Read one or more counts files.
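+      # Counts for each trailing date are passed through AdjustCounts() and
+      # then summed elementwise (Reduce below), so the decoding step sees a
+      # single aggregated counts matrix covering the whole num_days window.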
+ counts_file <- paste0(experiments[i, 1], "_counts.csv") + trailing_dates <- as.character(seq(date - opts$num_days + 1, date, 1)) + counts_list <- list() + for (j in 1:length(trailing_dates)) { + counts_path = file.path(opts$counts_dir, trailing_dates[j], counts_file) + Log("Reading counts %s", counts_path) + + counts_j <- ReadCountsFile(file.path(opts$counts_dir, + trailing_dates[j], counts_file)) + if (!is.null(counts_j)) { + counts_list[[j]] <- AdjustCounts(counts_j, config) + } + } + counts <- Reduce("+", counts_list) # Turn list into matrix + + # Perform the analysis. + + Log("CONFIG") + str(config) + cat('\n') + + Log("COUNTS") + str(counts) + cat('\n') + + Log("MAP") + str(map$map) + cat('\n') + + exp_res <- AnalyzeRAPPOR(config, counts, map$map, opts$correction, opts$alpha, + experiment_name = experiment_name, + map_name = map_file, + config_name = config_file, + date = as.character(date), + date_num = as.numeric(format(date, "%Y%m%d"))) + + if (!is.null(exp_res)) { + res[[i]] <- exp_res + cat("Discovered:", nrow(exp_res), "\n\n") + } else { + cat("Discovered: 0\n\n") + } + } + + # Write out a single column IO file for each date. + output <- do.call("rbind", res) + if (!is.null(output) && nrow(output) > 0) { + path = file.path(output_dir, output_filename) + write.csv(output, path) + Log('Wrote %s', path) + } + + Log("RES") + str(res) + cat('\n') + + Log("OUTPUT") + str(output) + cat('\n') + + Log("sum(proportion)") + print(sum(output$proportion)) + + Log("sum(estimate)") + print(sum(output$estimate)) + } +} + +main = function(opts) { + if (opts$counts != "" && opts$map != "" && opts$config != "") { + RunOne(opts) + } else { + RunMany(opts) + } +} + if (!interactive()) { main(opts) } diff --git a/regtest.sh b/regtest.sh index ee813eeb..2430edb1 100755 --- a/regtest.sh +++ b/regtest.sh @@ -206,11 +206,11 @@ _run-one-instance() { # Currently, the summary file shows and aggregates timing of the inference # engine, which excludes R's loading time and reading of the (possibly # substantial) map file. Timing below is more inclusive. - TIMEFORMAT='Running compare_dist.R took %R seconds' + TIMEFORMAT='Running analyze.R took %R seconds' time { # Input prefix, output dir - tests/compare_dist.R -t "Test case: $test_case (instance $test_instance)" \ - "$case_dir/case" "$instance_dir/case" $out_dir + tests/analyze.R -t "Test case: $test_case (instance $test_instance)" \ + "$case_dir/case" "$instance_dir/case" $out_dir } } diff --git a/setup.sh b/setup.sh index 48b77633..729c486f 100755 --- a/setup.sh +++ b/setup.sh @@ -18,10 +18,9 @@ native-packages() { # - build-essential for gcc compilers, invoked while installing R packages. # - gfortran Fortran compiler needed for glmnet. # - libblas-dev needed for limSolve. - # - python-dev is for building the fastrand extension # # NOTE: we get R 3.0.2 on Trusty. 
-  sudo apt-get install build-essential gfortran libblas-dev r-base python-dev
+  sudo apt-get install build-essential gfortran libblas-dev r-base
 }

 r-packages() {
diff --git a/test.sh b/test.sh
index 3b5506f2..aa701816 100755
--- a/test.sh
+++ b/test.sh
@@ -106,7 +106,7 @@ r-unit() {
   set -o xtrace  # show tests we're running

   # This one needs to be run from the root dir
-  tests/compare_dist_test.R
+  tests/analyze_test.R

   tests/gen_counts_test.R

diff --git a/tests/compare_dist.R b/tests/analyze.R
similarity index 95%
rename from tests/compare_dist.R
rename to tests/analyze.R
index 27b80973..9f079405 100755
--- a/tests/compare_dist.R
+++ b/tests/analyze.R
@@ -47,6 +47,7 @@ if (library(Cairo, quietly = TRUE, logical.return = TRUE)) {
   cat('Using png\n')
 }

+source("analysis/R/analysis_lib.R")
 source("analysis/R/read_input.R")
 source("analysis/R/decode.R")
 source("analysis/R/util.R")
@@ -80,23 +81,25 @@ RunRappor <- function(prefix_case, prefix_instance, ctx) {
   m <- paste0(prefix_case, '_map.csv')
   map <- ReadMapFile(m)  # Switch to LoadMapFile if you want to cache the result

-  # Main decode.R API
+
   timing <- system.time({
-    res <- Decode(counts, map$map, ctx$params)
+    # Calls AnalyzeRAPPOR to run the analysis code
+    rappor <- AnalyzeRAPPOR(ctx$params, counts, map$map, "FDR", 0.05,
+                            date="01/01/01", date_num="100001")
   })

   # The line is searched for, and elapsed time is extracted, by make_summary.py.
   # Should the formatting or wording change, make_summary must be updated too.
   Log("Inference took %.3f seconds", timing[["elapsed"]])

-  if (is.null(res)) {
+  if (is.null(rappor)) {
     stop("RAPPOR analysis failed.")
   }

-  Log("Decoded results:")
-  str(res$fit)
+  Log("Analysis Results:")
+  str(rappor)

-  res$fit
+  rappor
 }

 LoadActual <- function(prefix_instance) {
diff --git a/tests/compare_dist_test.R b/tests/analyze_test.R
similarity index 97%
rename from tests/compare_dist_test.R
rename to tests/analyze_test.R
index e67c95f1..5d3a09fa 100755
--- a/tests/compare_dist_test.R
+++ b/tests/analyze_test.R
@@ -16,7 +16,7 @@

 library(RUnit)

-source('tests/compare_dist.R')
+source('tests/analyze.R')

 TestProcessAll <- function() {
   ctx <- new.env()

From 4f0bb85dcf3fcb0c5395c9bc767408918ee51466 Mon Sep 17 00:00:00 2001
From: Ilya Mironov
Date: Mon, 13 Jul 2015 16:10:49 -0700
Subject: [PATCH 02/14] FitLasso to handle the case of a single variable correctly.

---
 analysis/R/decode.R | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/analysis/R/decode.R b/analysis/R/decode.R
index 479c8a2c..a081ac51 100644
--- a/analysis/R/decode.R
+++ b/analysis/R/decode.R
@@ -102,16 +102,23 @@ FitLasso <- function(X, Y, intercept = TRUE) {
   # is to avoid overfitting.
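+  # Note: glmnet's pmax argument limits how many variables may ever become
+  # non-zero along the regularization path, so this cap bounds the model size.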
cap <- min(500, nrow(X) * .8, ncol(X)) - mod <- glmnet(X, Y, standardize = FALSE, intercept = intercept, + if (ncol(X) == 1) + XX <- cbind2(X, rep(0, nrow(X))) # add a dummy variable since glmnet can't handle a single-column matrix + else + XX <- X + + mod <- glmnet(XX, Y, standardize = FALSE, intercept = intercept, lower.limits = 0, # outputs are non-negative pmax = cap) coefs <- coef(mod) coefs <- coefs[-1, , drop = FALSE] # drop the intercept l1cap <- sum(colSums(coefs) <= 1.0) # find all columns with L1 norm <= 1 - if(l1cap > 0) + if (l1cap > 0) { distr <- coefs[, l1cap] # return the last set of coefficients with L1 <= 1 - else + if (ncol(X) == 1) + distr <- distr[1] # dropping the dummy variable + } else distr <- setNames(rep(0, ncol(X)), colnames(X)) distr } From 11ae8b3e45f0b742c9f6fafcdd440ef852b884cb Mon Sep 17 00:00:00 2001 From: Ilya Mironov Date: Mon, 13 Jul 2015 16:44:32 -0700 Subject: [PATCH 03/14] Merging drop-QP and master. --- analysis/R/analysis_tool.R | 2 -- analysis/R/decode.R | 47 +++++++++++++------------------------- 2 files changed, 16 insertions(+), 33 deletions(-) diff --git a/analysis/R/analysis_tool.R b/analysis/R/analysis_tool.R index 3afc67c3..f87b7500 100755 --- a/analysis/R/analysis_tool.R +++ b/analysis/R/analysis_tool.R @@ -10,8 +10,6 @@ source("analysis/R/read_input.R") source("analysis/R/decode.R") source("analysis/R/util.R") -source("analysis/R/alternative.R") - options(stringsAsFactors = FALSE) # Do command line parsing first to catch errors. Loading libraries in R is diff --git a/analysis/R/decode.R b/analysis/R/decode.R index 1c6fd248..fde54ae2 100644 --- a/analysis/R/decode.R +++ b/analysis/R/decode.R @@ -218,42 +218,27 @@ ComputePrivacyGuarantees <- function(params, alpha, N) { } FitDistribution <- function(estimates_stds, map, quiet = FALSE) { - # Find a distribution over rows of map that approximates estimates_stds best - # - # Input: - # estimates_stds: a list of two m x k matrices, one for estimates, another - # for standard errors - # map : an (m * k) x S boolean matrix - # - # Output: - # a float vector of length S, so that a distribution over map's rows sampled - # according to this vector approximates estimates - - S <- ncol(map) # total number of candidates + # Find a distribution over rows of map that approximates estimates_stds best + # + # Input: + # estimates_stds: a list of two m x k matrices, one for estimates, another + # for standard errors + # map : an (m * k) x S boolean matrix + # + # Output: + # a float vector of length S, so that a distribution over map's rows sampled + # according to this vector approximates estimates - support_coefs <- 1:S + S <- ncol(map) # total number of candidates - if (S > length(estimates_stds$estimates) * .8) { - # the system is close to being underdetermined - lasso <- FitLasso(map, as.vector(t(estimates_stds$estimates))) + lasso <- FitLasso(map, as.vector(t(estimates_stds$estimates))) - # Select non-zero coefficients. 
-    support_coefs <- which(lasso > 0)
+  if(!quiet)
+    cat("LASSO selected ", sum(lasso > 0), " non-zero coefficients.\n")

-    if(!quiet)
-      cat("LASSO selected ", length(support_coefs), " non-zero coefficients.\n")
-  }
-
-  coefs <- setNames(rep(0, S), colnames(map))
-
-  if(length(support_coefs) > 0) {  # LASSO may return an empty list
-    constrained_coefs <- ConstrainedLinModel(map[, support_coefs, drop = FALSE],
-                                             estimates_stds)
-
-    coefs[support_coefs] <- constrained_coefs
-  }
+  names(lasso) <- colnames(map)

-  coefs
+  lasso
 }

 Resample <- function(e) {

From 4d5214bc6a33855ea0c43a0a16090348a1b1febc Mon Sep 17 00:00:00 2001
From: Ilya Mironov
Date: Mon, 13 Jul 2015 16:51:20 -0700
Subject: [PATCH 04/14] Removing alternative.R

---
 tests/compare_dist.R | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tests/compare_dist.R b/tests/compare_dist.R
index 7762482b..27b80973 100755
--- a/tests/compare_dist.R
+++ b/tests/compare_dist.R
@@ -51,8 +51,6 @@ source("analysis/R/read_input.R")
 source("analysis/R/decode.R")
 source("analysis/R/util.R")

-source("analysis/R/alternative.R")  # temporary
-
 LoadContext <- function(prefix_case) {
   # Creates the context, filling it with privacy parameters
   # Arg:

From 04d9769f357f7e42d9aff4f6fd1115bfe89410ae Mon Sep 17 00:00:00 2001
From: Ilya Mironov
Date: Mon, 13 Jul 2015 17:14:31 -0700
Subject: [PATCH 05/14] Restoring gen_counts_test.R

---
 tests/gen_counts_test.R | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/tests/gen_counts_test.R b/tests/gen_counts_test.R
index 49ad3be5..e91de68e 100755
--- a/tests/gen_counts_test.R
+++ b/tests/gen_counts_test.R
@@ -51,7 +51,7 @@ TestGenerateCounts <- function() {
   noise1 <- list(p = .5, q = .5, f = 0)  # truly random IRRs
   counts1 <- GenerateCounts(c(report_params, noise1), map, partition, v)

-  for(i in 2:4)
+  for(i in 2:5)
     for(j in 1:2)
       pvalues <- c(pvalues, chisq.test(c(counts1[j,1] - counts1[j,i],
                                          counts1[j,i]),
@@ -64,14 +64,13 @@

   counts2 <- counts2 / v

-  for(i in 2:4)
+  for(i in 2:5)
     for(j in 1:2)
       pvalues <- c(pvalues, chisq.test(c(counts2[j,1] - counts2[j,i],
                                          counts2[j,i]),
                                        p = c(.5, .5))$p.value)

-  checkTrue(min(pvalues) > 1E-9 && max(pvalues) < 1 - 1E-9,
-            "Chi-squared test failed")
+  checkTrue(min(pvalues) > 1E-9, "Chi-squared test failed")
 }

 TestRandomPartition <- function() {
@@ -97,14 +96,14 @@ TestRandomPartition <- function() {
   p5 <- RandomPartition(total = 1000000, c(1, 2, 3, 4))
   p.value <- chisq.test(p5, p = c(.1, .2, .3, .4))$p.value

-  # Apply the chi squared test and fail if p.value is too high or too low.
-  # Probability of failure is 2 * 1E-9, which should never happen.
-  checkTrue((p.value > 1E-9) && (p.value < 1 - 1E-9))
+  # Apply the chi squared test and fail if p.value is too low.
+  # Probability of failure is 1E-9, which should never happen.
+  checkTrue(p.value > 1E-9)
 }

-TestAll <- function(){
+CheckAll <- function(){
   TestRandomPartition()
   TestGenerateCounts()
 }

-TestAll()
\ No newline at end of file
+CheckAll()
\ No newline at end of file

From 4f3aa3a664b99fb261db91d04a7305b0f2821612 Mon Sep 17 00:00:00 2001
From: Ilya Mironov
Date: Tue, 14 Jul 2015 13:49:49 -0700
Subject: [PATCH 06/14] Fixing reviewer's comments. Functionality of TestDecode is restored.
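
In particular, CheckDecodeAveAndStds in decode_test.R now accumulates the
per-run estimates and stds explicitly: the previous version called lapply()
purely for its side effects, and since R arguments are copy-on-modify, the
matrices returned by MatrixVectorMerge() were silently discarded. A minimal
sketch of the pitfall (xs and m are hypothetical names):

    m <- matrix(nrow = 0, ncol = 0)
    lapply(xs, function(x) MatrixVectorMerge(m, x))  # results are thrown away
    for (x in xs) m <- MatrixVectorMerge(m, x)       # accumulates correctly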
--- analysis/R/decode.R | 40 ++++++++++++++++++++-------------------- analysis/R/decode_test.R | 10 ++++++---- 2 files changed, 26 insertions(+), 24 deletions(-) diff --git a/analysis/R/decode.R b/analysis/R/decode.R index fde54ae2..72680118 100644 --- a/analysis/R/decode.R +++ b/analysis/R/decode.R @@ -218,27 +218,27 @@ ComputePrivacyGuarantees <- function(params, alpha, N) { } FitDistribution <- function(estimates_stds, map, quiet = FALSE) { - # Find a distribution over rows of map that approximates estimates_stds best - # - # Input: - # estimates_stds: a list of two m x k matrices, one for estimates, another - # for standard errors - # map : an (m * k) x S boolean matrix - # - # Output: - # a float vector of length S, so that a distribution over map's rows sampled - # according to this vector approximates estimates + # Find a distribution over rows of map that approximates estimates_stds best + # + # Input: + # estimates_stds: a list of two m x k matrices, one for estimates, another + # for standard errors + # map : an (m * k) x S boolean matrix + # + # Output: + # a float vector of length S, so that a distribution over map's rows sampled + # according to this vector approximates estimates - S <- ncol(map) # total number of candidates + S <- ncol(map) # total number of candidates - lasso <- FitLasso(map, as.vector(t(estimates_stds$estimates))) + lasso <- FitLasso(map, as.vector(t(estimates_stds$estimates))) - if(!quiet) - cat("LASSO selected ", sum(lasso > 0), " non-zero coefficients.\n") + if (!quiet) + cat("LASSO selected ", sum(lasso > 0), " non-zero coefficients.\n") - names(lasso) <- colnames(map) + names(lasso) <- colnames(map) - lasso + lasso } Resample <- function(e) { @@ -279,10 +279,10 @@ Decode <- function(counts, map, params, alpha = 0.05, coefs_all <- vector() - # Run the fitting procedure several times (5 seems to be sufficient and not - # too many) to estimate standard deviation of the output. + # Run the fitting procedure several times (5 seems to be the minimum that + # makes sense) to estimate standard deviation of the output. for(r in 1:10) { - if(r > 1) + if (r > 1) e <- Resample(estimates_stds_filtered) else e <- estimates_stds_filtered @@ -295,7 +295,7 @@ Decode <- function(counts, map, params, alpha = 0.05, coefs_ssd <- N * apply(coefs_all, 2, sd) # compute sample standard deviations coefs_ave <- N * apply(coefs_all, 2, mean) - # Only select coefficients more than two standard deviations from 0. May + # Only select coefficients more than one standard deviation from 0. May # inflate empirical SD of the estimates. reported <- which(coefs_ave > 1E-6 + 1 * coefs_ssd) diff --git a/analysis/R/decode_test.R b/analysis/R/decode_test.R index 71db7aa5..c4fbdfcf 100755 --- a/analysis/R/decode_test.R +++ b/analysis/R/decode_test.R @@ -205,10 +205,12 @@ CheckDecodeAveAndStds <- function(...) { results <- RunMultipleTests(...) estimates <- matrix(nrow = 0, ncol = 0) - lapply(results, function(r) MatrixVectorMerge(estimates, r$estimates)) - stds <- matrix(nrow = 0, ncol = 0) - lapply(results, function(r) MatrixVectorMerge(stds, r$stds)) + + for(i in 1:length(results)) { + estimates <- MatrixVectorMerge(estimates, results[[i]]$estimates) + stds <- MatrixVectorMerge(stds, results[[i]]$stds) + } empirical_stds <- apply(estimates, 2, sd, na.rm = TRUE) estimated_stds <- apply(stds, 2, mean, na.rm = TRUE) @@ -217,7 +219,7 @@ CheckDecodeAveAndStds <- function(...) 
{
   checkTrue(any(estimated_stds > empirical_stds / 2),
             "Our estimate for the standard deviation is too low")

-  checkTrue(any(estimated_stds < empirical_stds * 3),
+  checkTrue(any(estimated_stds < empirical_stds * 2),
             "Our estimate for the standard deviation is too high")
 }
 }

From 6b38d612ed674194f0f811981463fe7b421a73b7 Mon Sep 17 00:00:00 2001
From: Ilya Mironov
Date: Tue, 21 Jul 2015 13:09:19 -0700
Subject: [PATCH 07/14] Adjusting the parameter of the Lasso algorithm, and fixing elapsed time reporting.

---
 analysis/R/decode.R   | 3 ++-
 tests/make_summary.py | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/analysis/R/decode.R b/analysis/R/decode.R
index 72680118..29488448 100644
--- a/analysis/R/decode.R
+++ b/analysis/R/decode.R
@@ -106,8 +106,9 @@
   else
     XX <- X

-  mod <- glmnet(XX, Y, standardize = FALSE, intercept = intercept,
+  mod <- glmnet(XX, Y, standardize = TRUE, intercept = intercept,
                 lower.limits = 0,  # outputs are non-negative
+                lambda.min.ratio = 1E-4,
                 pmax = cap)

   coefs <- coef(mod)
diff --git a/tests/make_summary.py b/tests/make_summary.py
index f8299a04..b55559c6 100755
--- a/tests/make_summary.py
+++ b/tests/make_summary.py
@@ -160,7 +160,7 @@ def ExtractTime(log_filename):
   with open(log_filename) as log:
     log_str = log.read()
   # Matching a line output by analyze.R.
-  match = re.search(r'Running analyze.R took ([0-9.]+) seconds', log_str)
+  match = re.search(r'Inference took ([0-9.]+) seconds', log_str)
   if match:
     return float(match.group(1))
   return None

From 0c76391e41e149349834801b29001af5e013e5b8 Mon Sep 17 00:00:00 2001
From: Ilya Mironov
Date: Tue, 21 Jul 2015 20:50:10 +0000
Subject: [PATCH 08/14] Requiring that the estimated count be at least two SD away from 0.

---
 analysis/R/decode.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/analysis/R/decode.R b/analysis/R/decode.R
index 29488448..558082b8 100644
--- a/analysis/R/decode.R
+++ b/analysis/R/decode.R
@@ -296,9 +296,9 @@ Decode <- function(counts, map, params, alpha = 0.05,
   coefs_ssd <- N * apply(coefs_all, 2, sd)  # compute sample standard deviations
   coefs_ave <- N * apply(coefs_all, 2, mean)

-  # Only select coefficients more than one standard deviation from 0. May
+  # Only select coefficients more than two standard deviations from 0. May
   # inflate empirical SD of the estimates.
-  reported <- which(coefs_ave > 1E-6 + 1 * coefs_ssd)
+  reported <- which(coefs_ave > 1E-6 + 2 * coefs_ssd)

   mod <- list(coefs = coefs_ave[reported], stds = coefs_ssd[reported])

From f4b722268d40149359c27d1305bb669edf47f930 Mon Sep 17 00:00:00 2001
From: Ilya Mironov
Date: Wed, 22 Jul 2015 12:06:10 -0700
Subject: [PATCH 10/14] Deleting obsolete code stemming from PerformInference calls, shaving off tens of seconds from each run.

---
 analysis/R/decode.R | 39 +++++++--------------------------------
 1 file changed, 7 insertions(+), 32 deletions(-)

diff --git a/analysis/R/decode.R b/analysis/R/decode.R
index 3da7b016..e0002e57 100644
--- a/analysis/R/decode.R
+++ b/analysis/R/decode.R
@@ -302,15 +302,11 @@ Decode <- function(counts, map, params, alpha = 0.05,

   mod <- list(coefs = coefs_ave[reported], stds = coefs_ssd[reported])

-  if (correction == "Bonferroni") {
-    alpha <- alpha / S
-  }
+  fit <- data.frame(string = colnames(map[, reported]),
+                    Estimate = matrix(mod$coefs, ncol = 1),
+                    SD = mod$stds,
+                    stringsAsFactors = FALSE)

-  inf <- PerformInference(map[filter_bits, reported, drop = FALSE],
-                          as.vector(t(estimates_stds_filtered$estimates)),
-                          N, mod, params, alpha,
-                          correction)
-  fit <- inf$fit
   # If this is a basic RAPPOR instance, just use the counts for the estimate
   # (Check if the map is diagonal to tell if this is basic RAPPOR.)
   if (sum(map) == sum(diag(map))) {
@@ -338,38 +334,17 @@ Decode <- function(counts, map, params, alpha = 0.05,
   allocated_mass <- sum(fit$proportion)
   num_detected <- nrow(fit)

-  ss <- round(inf$SS, digits = 3)
-  explained_var <- ss[[1]]
-  missing_var <- ss[[2]]
-  noise_var <- ss[[3]]
-
-  noise_std_dev <- round(inf$resid_sigma, digits = 3)
-
-  # Compute summary of the fit.
-  parameters <-
-    c("Candidate strings", "Detected strings",
-      "Sample size (N)", "Discovered Prop (out of N)",
-      "Explained Variance", "Missing Variance", "Noise Variance",
-      "Theoretical Noise Std. Dev.")
-  values <- c(S, num_detected, N, allocated_mass,
-              explained_var, missing_var, noise_var, noise_std_dev)
-
-  res_summary <- data.frame(parameters = parameters, values = values)
-
   privacy <- ComputePrivacyGuarantees(params, alpha, N)
   params <- data.frame(parameters = c("k", "h", "m", "p", "q", "f", "N", "alpha"),
                        values = c(k, h, m, p, q, f, N, alpha))

-  # This is a list of decode stats in a better format than 'summary'.
-  # TODO: Delete summary.
metrics <- list(sample_size = N, allocated_mass = allocated_mass, - num_detected = num_detected, - explained_var = explained_var, - missing_var = missing_var) + num_detected = num_detected + ) - list(fit = fit, summary = res_summary, privacy = privacy, params = params, + list(fit = fit, privacy = privacy, params = params, lasso = NULL, ests = as.vector(t(estimates_stds_filtered$estimates)), counts = counts[, -1], resid = NULL, metrics = metrics) } From e27ccc0ea8d5663f07526888b0b1cd8cb0fcdce6 Mon Sep 17 00:00:00 2001 From: Ilya Mironov Date: Thu, 13 Aug 2015 13:58:30 -0700 Subject: [PATCH 11/14] Another drop = FALSE error --- analysis/R/decode.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/analysis/R/decode.R b/analysis/R/decode.R index 52e231a7..ec09e54c 100644 --- a/analysis/R/decode.R +++ b/analysis/R/decode.R @@ -302,7 +302,7 @@ Decode <- function(counts, map, params, alpha = 0.05, mod <- list(coefs = coefs_ave[reported], stds = coefs_ssd[reported]) - fit <- data.frame(string = colnames(map[, reported]), + fit <- data.frame(string = colnames(map[, reported, drop = FALSE]), Estimate = matrix(mod$coefs, ncol = 1), SD = mod$stds, stringsAsFactors = FALSE) From fd62d970930117e6c79ed445e2da8122c476c61c Mon Sep 17 00:00:00 2001 From: Ilya Mironov Date: Wed, 9 Sep 2015 11:55:03 -0700 Subject: [PATCH 12/14] Dropping reference to alternative.R --- analysis/R/decode.R | 2 -- bin/decode_dist.R | 2 -- 2 files changed, 4 deletions(-) diff --git a/analysis/R/decode.R b/analysis/R/decode.R index 1b663a12..190e0634 100644 --- a/analysis/R/decode.R +++ b/analysis/R/decode.R @@ -23,8 +23,6 @@ source.rappor <- function(rel_path) { source(abs_path) } -source.rappor('analysis/R/alternative.R') - EstimateBloomCounts <- function(params, obs_counts) { # Estimates the number of times each bit in each cohort was set in original # Bloom filters. diff --git a/bin/decode_dist.R b/bin/decode_dist.R index f61e155b..6ed38b18 100755 --- a/bin/decode_dist.R +++ b/bin/decode_dist.R @@ -16,8 +16,6 @@ source.rappor("analysis/R/read_input.R") source.rappor("analysis/R/decode.R") source.rappor("analysis/R/util.R") -source.rappor("analysis/R/alternative.R") - options(stringsAsFactors = FALSE) # Do command line parsing first to catch errors. Loading libraries in R is From 7e4d00be535846e0b0ce57e2d8503e256695fc6a Mon Sep 17 00:00:00 2001 From: Ilya Mironov Date: Thu, 10 Sep 2015 18:46:55 -0700 Subject: [PATCH 13/14] Changing the cutoff from two SD to one for outputting a candidate string. --- analysis/R/decode.R | 37 ++++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/analysis/R/decode.R b/analysis/R/decode.R index 190e0634..1fa40a3b 100644 --- a/analysis/R/decode.R +++ b/analysis/R/decode.R @@ -102,10 +102,10 @@ FitLasso <- function(X, Y, intercept = TRUE) { # TODO(mironov): Test cv.glmnet instead of glmnet - # Cap the number of non-zero coefficients to 500 or 80% of the number of - # constraints, whichever is less. The 500 cap is for performance reasons, 80% + # Cap the number of non-zero coefficients to 1,000 or 80% of the number of + # constraints, whichever is less. The 1,000 cap is for performance reasons, 80% # is to avoid overfitting. 
-  cap <- min(500, nrow(X) * .8, ncol(X))
+  cap <- min(1000, nrow(X) * .8, ncol(X))

   if (ncol(X) == 1)
     XX <- cbind2(X, rep(0, nrow(X)))  # add a dummy variable since glmnet can't handle a single-column matrix
@@ -224,21 +224,18 @@ ComputePrivacyGuarantees <- function(params, alpha, N) {
   privacy
 }

-FitDistribution <- function(estimates_stds, map, quiet = FALSE) {
+FitDistribution <- function(estimates, map, quiet = FALSE) {
   # Find a distribution over rows of map that approximates estimates_stds best
   #
   # Input:
-  #   estimates_stds: a list of two m x k matrices, one for estimates, another
-  #                   for standard errors
-  #   map : an (m * k) x S boolean matrix
+  #   estimates: an m x k matrix of estimates
+  #   map : an (m * k) x S boolean matrix
   #
   # Output:
-  #   a float vector of length S, so that a distribution over map's rows sampled
-  #   according to this vector approximates estimates
+  #   an S-long vector of non-negative floats, so that a distribution over
+  #   the map's rows sampled according to this vector approximates estimates

-  S <- ncol(map)  # total number of candidates
-
-  lasso <- FitLasso(map, as.vector(t(estimates_stds$estimates)))
+  lasso <- FitLasso(map, estimates)

   if (!quiet)
     cat("LASSO selected ", sum(lasso > 0), " non-zero coefficients.\n")
@@ -259,7 +256,7 @@ Resample <- function(e) {
   list(estimates = estimates, stds = stds)
 }

-Decode <- function(counts, map, params, alpha = 0.05,
+Decode <- function(counts, map, params, alpha = 0.05, iterations = 20,
                    correction = c("Bonferroni"), quiet = FALSE, ...) {
   k <- params$k
   p <- params$p
@@ -288,23 +285,29 @@

   # Run the fitting procedure several times (5 seems to be the minimum that
   # makes sense) to estimate standard deviation of the output.
-  for(r in 1:10) {
+  groups <- sample(1:iterations, length(filter_bits), replace=T)
+
+  for(r in 1:iterations) {
     if (r > 1)
       e <- Resample(estimates_stds_filtered)
     else
       e <- estimates_stds_filtered

+    e_vec <- as.vector(t(e$estimates))
+    subset <- filter_bits[groups != r]
+
     coefs_all <- rbind(coefs_all,
-                       FitDistribution(e, map[filter_bits, , drop = FALSE],
+                       FitDistribution(e_vec[subset],
+                                       map[subset, , drop = FALSE],
                                        quiet))
   }

   coefs_ssd <- N * apply(coefs_all, 2, sd)  # compute sample standard deviations
   coefs_ave <- N * apply(coefs_all, 2, mean)

-  # Only select coefficients more than two standard deviations from 0. May
+  # Only select coefficients more than one standard deviation from 0. May
   # inflate empirical SD of the estimates.
-  reported <- which(coefs_ave > 1E-6 + 2 * coefs_ssd)
+  reported <- which(coefs_ave > 1E-6 + 1 * coefs_ssd)

   mod <- list(coefs = coefs_ave[reported], stds = coefs_ssd[reported])

From 60c64ea194917df5d5879c19a48e5ae5fd8da26f Mon Sep 17 00:00:00 2001
From: Ilya Mironov
Date: Fri, 11 Sep 2015 21:16:26 +0000
Subject: [PATCH 14/14] Changing the number of iterations from 20 to 10.

---
 analysis/R/decode.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/analysis/R/decode.R b/analysis/R/decode.R
index 1fa40a3b..7d651145 100644
--- a/analysis/R/decode.R
+++ b/analysis/R/decode.R
@@ -256,7 +256,7 @@ Resample <- function(e) {
   list(estimates = estimates, stds = stds)
 }

-Decode <- function(counts, map, params, alpha = 0.05, iterations = 20,
+Decode <- function(counts, map, params, alpha = 0.05, iterations = 10,
                    correction = c("Bonferroni"), quiet = FALSE, ...) {
   k <- params$k
   p <- params$p