diff --git a/analysis/R/assoc.R b/analysis/R/assoc.R new file mode 100755 index 00000000..662ab1f6 --- /dev/null +++ b/analysis/R/assoc.R @@ -0,0 +1,185 @@ +#!/usr/bin/env Rscript +# +# Copyright 2015 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Reads map files, report files, and RAPPOR parameters to run +# an EM algorithm to estimate joint distribution over two or more variables +# +# Usage: +# $ ./assoc.R --inp +# +# Input: JSON file with the following fields +# "maps" for map files of each var +# "reports" for a list of reports +# "counts" for 2 way marginal counts, individual marginal counts +# respectively +# "params" for params file with RAPPOR params +# "csv_out" for a file name into which results will be written +# as comma separated values +# +# Output: A table with joint distribution to stdout and csv file with results + +library("jsonlite") +library("optparse") + +options(stringsAsFactors = FALSE) + +if(!interactive()) { + option_list <- list( + make_option(c("--inp"), default = "inp.json", + help = "JSON file with inputs for assoc.R")) + opts <- parse_args(OptionParser(option_list = option_list)) +} + +source("analysis/R/decode2way.R") +source("analysis/R/encode.R") +source("analysis/R/decode.R") +source("analysis/R/simulation.R") +source("analysis/R/read_input.R") +source("analysis/R/association.R") +source("tests/gen_counts.R") +source("tests/compare_assoc.R") # For CombineMaps; it should be moved elsewhere + +TwoWayAlg <- function(inp) { + ptm <- proc.time() + params <- ReadParameterFile(inp$params) + # Ensure sufficient maps as required by number of vars + # Correct map from ReadMapFile() for assoc analysis + stopifnot(inp$numvars == length(inp$maps)) + map <- lapply(inp$maps, function(o) + CorrectMapForAssoc(ReadMapFile(o, params = params), + params = params)) + + # (2 way counts, marginal 1 counts, marginal 2 counts) + counts <- lapply(1:3, function(i) ReadCountsFile(inp$counts[[i]])) + + # TODO: account for different parameters across different variables + params2 <- params + params2$k <- (params$k ** 2) * 4 + + # Prune candidates + fit <- lapply(1:2, function(i) + Decode(counts[[i + 1]], + map[[i]]$rmap, + params, quick = FALSE)$fit) + + found_strings = list(fit[[1]][,"string"], fit[[2]][,"string"]) + + if (length(found_strings[[1]]) == 0 || length(found_strings[[2]]) == 0) { + stop("No strings found in 1-way marginal.") + } + + # Combine maps to feed into Decode2Way + # Prune first to found_strings from Decode on 1-way counts + pruned <- lapply(1:2, function(i) + lapply(map[[i]]$map, function(z) z[,found_strings[[i]], drop = FALSE])) + crmap <- CombineMaps(pruned[[1]], pruned[[2]])$crmap + marginal <- Decode2Way(counts[[1]], crmap, params2, fit = fit)$fit + + # Reconstruct 2-way table from marginals + ed <- matrix(0, nrow = length(found_strings[[1]]), ncol = length(found_strings[[2]])) + colnames(ed) <- found_strings[[2]] + rownames(ed) <- found_strings[[1]] + for (cols in found_strings[[2]]) { + for (rows in found_strings[[1]]) { + 
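+      # Joint candidates are keyed by the two strings joined with "x"
+      # (see paste(rows, cols, sep = "x") below); e.g. the cell for
+      # "google.com" and "ssl" is the row named "google.comxssl" in the
+      # Decode2Way fit.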
ed[rows, cols] <- marginal[paste(rows, cols, sep = "x"), "Estimate"] + } + } + ed[is.na(ed)] <- 0 + ed[ed<0] <- 0 + + time_taken <- proc.time() - ptm + print("Two Way Algorithm Results") + print(ed[order(-rowSums(ed)), order(-colSums(ed))]) + if(inp$time == TRUE) + print(time_taken) +} + +EMAlg <- function(inp) { + ptm <- proc.time() + params <- ReadParameterFile(inp$params) + # Ensure sufficient maps as required by number of vars + stopifnot(inp$numvars == length(inp$maps)) + # Correct map from ReadMapFile() for assoc analysis + map <- lapply(inp$maps, function(o) + CorrectMapForAssoc(LoadMapFile(o, params = params), + params = params)) + + # For BASIC only + m1 <- lapply(1:params$m, function(z) { + m <- sparseMatrix(c(1), c(2), dims = c(1, 2)) + colnames(m) <- c("FALSE", "TRUE") + m + }) + m2 <- sparseMatrix(1:params$m, rep(2, params$m)) + colnames(m2) <- colnames(m1[[1]]) + map[[2]]$map <- m1 + map[[2]]$rmap <- m2 + + # Reports must be of the format + # client name, cohort no, rappor bitstring 1, rappor bitstring 2, ... + reportsObj <- read.csv(inp$reports, + colClasses = c("character", "integer", + rep("character", inp$numvars)), + header = TRUE) + # Ignore the first column + reportsObj <- reportsObj[,-1] + + params = list(params, params) + params[[2]]$k = 1 + + # Parsing reportsObj + # ComputeDistributionEM allows for different sets of cohorts + # for each variable. Here, both sets of cohorts are identical + co <- as.list(reportsObj[1])[[1]] + co <- co + 1 # 1 indexing + cohorts <- rep(list(co), inp$numvars) + # Parse reports from reportObj cols 2, 3, ... + reports <- lapply(1:inp$numvars, function(x) as.list(reportsObj[x + 1])) + + # Split strings into bit arrays (as required by assoc analysis) + reports <- lapply(1:inp$numvars, function(i) { + # apply the following function to each of reports[[1]] and reports[[2]] + lapply(reports[[i]][[1]], function(x) { + # function splits strings and converts them to numeric values + # rev needed for endianness + rev(as.numeric(strsplit(x, split = "")[[1]])) + }) + }) + + joint_dist <- ComputeDistributionEM(reports, cohorts, map, + ignore_other = FALSE, + quick = TRUE, + params, marginals = NULL, + estimate_var = FALSE, + verbose = inp$time) + em <- joint_dist$fit + time_taken <- proc.time() - ptm + print("EM Algorithm Results") + print(em[order(-rowSums(em)), order(-colSums(em))]) + if(inp$time == TRUE) + print(time_taken) +} + +main <- function(opts) { + inp <- fromJSON(opts$inp) + TwoWayAlg(inp) + if(inp$also_em == TRUE) + EMAlg(inp) +} + +if(!interactive()) { + main(opts) +} \ No newline at end of file diff --git a/analysis/R/association.R b/analysis/R/association.R index 422fc634..328d3292 100644 --- a/analysis/R/association.R +++ b/analysis/R/association.R @@ -44,7 +44,7 @@ GetOtherProbs <- function(counts, map, marginal, params) { # Counts to remove from each cohort. top_counts <- ceiling(marginal$proportion * N / params$m) sum_top <- sum(top_counts) - candidate_map <- lapply(map, function(x) x[, candidate_strings]) + candidate_map <- lapply(map, function(x) x[, candidate_strings, drop = FALSE]) # Counts set by known strings without noise considerations. 
if (length(marginal) > 0) { @@ -63,6 +63,10 @@ GetOtherProbs <- function(counts, map, marginal, params) { pstar <- (1 - f / 2) * p + (f / 2) * q top_counts_cohort <- (sum_top - top_counts_cohort) * pstar + top_counts_cohort * qstar + + # Adjustment for basic rappor + if(nrow(top_counts_cohort) == 1) + top_counts_cohort <- t(top_counts_cohort) top_counts_cohort <- cbind(sum_top, top_counts_cohort) # Counts set by the "other" category. @@ -72,6 +76,9 @@ GetOtherProbs <- function(counts, map, marginal, params) { props_other[props_other > 1] <- 1 props_other[is.nan(props_other)] <- 0 props_other[is.infinite(props_other)] <- 0 + # Adjustmet for basic rappor + if(is.null(nrow(props_other))) + props_other <- t(props_other) as.list(as.data.frame(props_other)) } @@ -213,8 +220,15 @@ EM <- function(cond_prob, starting_pij = NULL, estimate_var = FALSE, if (nrow(pij[[1]]) > 0) { # Run EM for (i in 1:max_iter) { + if (i == 1) { + ptm_iter <- proc.time() + } pij[[i + 1]] <- UpdatePij(pij[[i]], cond_prob) dif <- max(abs(pij[[i + 1]] - pij[[i]])) + if (i == 1) { + PrintIfVerbose("ONE ITERATION", verbose) + PrintIfVerbose(proc.time() - ptm_iter, verbose) + } if (dif < epsilon) { break } @@ -283,9 +297,10 @@ UpdateJointConditional <- function(cond_report_dist, joint_conditional = NULL) { ComputeDistributionEM <- function(reports, report_cohorts, maps, ignore_other = FALSE, - params, + params, quick = FALSE, marginals = NULL, - estimate_var = FALSE) { + estimate_var = FALSE, + verbose = FALSE) { # Computes the distribution of num_variables variables, where # num_variables is chosen by the client, using the EM algorithm. # @@ -312,8 +327,10 @@ ComputeDistributionEM <- function(reports, report_cohorts, # Compute the counts for each variable and then do conditionals. joint_conditional = NULL found_strings <- list() + cd_for_reports <- list() for (j in (1:num_variables)) { + ptm <- proc.time() variable_report <- reports[[j]] variable_cohort <- report_cohorts[[j]] map <- maps[[j]] @@ -321,8 +338,12 @@ ComputeDistributionEM <- function(reports, report_cohorts, # Compute the probability of the "other" category variable_counts <- NULL if (is.null(marginals)) { - variable_counts <- ComputeCounts(variable_report, variable_cohort, params) - marginal <- Decode(variable_counts, map$rmap, params, quiet = TRUE)$fit + ptm2 <- proc.time() + variable_counts <- ComputeCounts(variable_report, variable_cohort, params[[j]]) + marginal <- Decode(variable_counts, map$rmap, params[[j]], quick, + quiet = TRUE)$fit + PrintIfVerbose("TIME IN MARGINALS", verbose) + PrintIfVerbose(proc.time() - ptm2, verbose) if (nrow(marginal) == 0) { return (NULL) } @@ -332,14 +353,14 @@ ComputeDistributionEM <- function(reports, report_cohorts, found_strings[[j]] <- marginal$string if (ignore_other) { - prob_other <- vector(mode = "list", length = params$m) + prob_other <- vector(mode = "list", length = params[[j]]$m) } else { if (is.null(variable_counts)) { variable_counts <- ComputeCounts(variable_report, variable_cohort, - params) + params[[j]]) } prob_other <- GetOtherProbs(variable_counts, map$map, marginal, - params) + params[[j]]) found_strings[[j]] <- c(found_strings[[j]], "Other") } @@ -348,7 +369,7 @@ ComputeDistributionEM <- function(reports, report_cohorts, idx <- variable_cohort[i] rep <- GetCondProb(variable_report[[i]], candidate_strings = rownames(marginal), - params = params, + params = params[[j]], map$map[[idx]], prob_other[[idx]]) rep @@ -356,14 +377,19 @@ ComputeDistributionEM <- function(reports, report_cohorts, # Update the 
joint conditional distribution of all variables joint_conditional <- UpdateJointConditional(cond_report_dist, - joint_conditional) + joint_conditional) + PrintIfVerbose("TIME IN COND_REPORT_DIST", verbose) + PrintIfVerbose(proc.time()-ptm, verbose) } + ptm <- proc.time() # Run expectation maximization to find joint distribution em <- EM(joint_conditional, epsilon = 10 ^ -6, verbose = FALSE, estimate_var = estimate_var) + PrintIfVerbose("TIME IN EM", verbose) + PrintIfVerbose(proc.time() - ptm, verbose) dimnames(em$est) <- found_strings + # Return results in a usable format list(fit = em$est, sd = em$sd, em = em) - } diff --git a/analysis/R/decode.R b/analysis/R/decode.R index fe314cd9..626274e2 100644 --- a/analysis/R/decode.R +++ b/analysis/R/decode.R @@ -74,9 +74,14 @@ EstimateBloomCounts <- function(params, obs_counts) { # Transform counts from absolute values to fractional, removing bias due to # variability of reporting between cohorts. - ests <- apply(ests, 1, function(x) x / obs_counts[,1]) - stds <- apply(variances^.5, 1, function(x) x / obs_counts[,1]) - + if (ncol(obs_counts) == 2) { + ests <- apply(t(ests), 1, function(x) x / obs_counts[,1]) + stds <- apply(t(variances^.5), 1, function(x) x / obs_counts[,1]) + } else { + ests <- apply((ests), 1, function(x) x / obs_counts[,1]) + stds <- apply((variances^.5), 1, function(x) x / obs_counts[,1]) + } + # Some estimates may be set to infinity, e.g. if f=1. We want to # account for this possibility, and set the corresponding counts # to 0. diff --git a/analysis/R/decode2way.R b/analysis/R/decode2way.R new file mode 100644 index 00000000..e8b546fa --- /dev/null +++ b/analysis/R/decode2way.R @@ -0,0 +1,174 @@ +# Copyright 2015 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# +# This library implements RAPPOR decoding algorithms for 2 way association. +# + +library(limSolve) +source("analysis/R/decode.R") + +EstimateBloomCounts2Way <- function(params, obs_counts) { + # Estimates original bloom filter counts of each pair of bits + # in the original bloom filters of each report + # + # Input: + # params: a list of RAPPOR parameters: + # k - size of a Bloom filter + # h - number of hash functions + # m - number of cohorts + # p - P(IRR = 1 | PRR = 0) + # q - P(IRR = 1 | PRR = 1) + # f - Proportion of bits in the Bloom filter that are set randomly + # to 0 or 1 regardless of the underlying true bit value + # obs_counts: a matrix of size m by (4k^2 + 1). Column one contains sample + # sizes for each cohort. 
Other counts indicate how many times each pair
+  #               of bit values {11, 10, 01, 00} was observed across the two
+  #               reports (in a "1st report"-major order)
+  #
+  # Output:
+  #   ests: a matrix of size m by 4k**2 with estimated counts
+  #   stds: currently, just a filler value of 100
+
+  p <- params$p
+  q <- params$q
+  f <- params$f
+  m <- params$m
+  k <- params$k
+
+  stopifnot(m == nrow(obs_counts), params$k + 1 == ncol(obs_counts))
+
+  p11 <- q * (1 - f/2) + p * f / 2  # probability of a true 1 reported as 1
+  p01 <- p * (1 - f/2) + q * f / 2  # probability of a true 0 reported as 1
+  p10 <- 1 - p11  # probability of a true 1 reported as 0
+  p00 <- 1 - p01  # probability of a true 0 reported as 0
+
+  # The NoiseMatrix describes the probability that input pairs of bits (rows,
+  # in the order {11, 10, 01, 00}) are mapped to reported pairs (columns, same
+  # order) due to noise added by RAPPOR
+  NoiseMatrix <- matrix(rep(0, 16), 4)
+  NoiseMatrix[1,] <- c(p11**2, p11*p10, p10*p11, p10**2)
+  NoiseMatrix[2,] <- c(p11*p01, p11*p00, p10*p01, p10*p00)
+  NoiseMatrix[3,] <- c(p01*p11, p01*p10, p00*p11, p00*p10)
+  NoiseMatrix[4,] <- c(p01**2, p00*p01, p01*p00, p00**2)
+  # Invert NoiseMatrix for estimator
+  InvNoiseMatrix <- t(solve(NoiseMatrix))
+
+  # Apply the inverse of NoiseMatrix to get an unbiased estimator for
+  # the number of times input pairs of bits were seen.
+  # Apply the matrix to 4 values at a time from obs_counts
+  ests <- apply(obs_counts, 1, function(x) {
+    N <- x[1]
+    inds <- seq(0, (k/4)-1)
+    v <- x[-1]
+    sapply(inds, function(i){
+      as.vector(InvNoiseMatrix %*% v[(i*4 + 1):((i+1)*4)])
+    })
+  })
+
+  # Transform counts from absolute values to fractional, removing bias due to
+  # variability of reporting between cohorts.
+  ests <- apply(ests, 1, function(x) x / obs_counts[,1])
+  # TODO: compute stddev in distribution induced by estimation
+  # stds <- apply(variances^.5, 1, function(x) x / obs_counts[,1])
+
+  # Some estimates may be set to infinity, e.g. if f=1. We want to
+  # account for this possibility, and set the corresponding counts
+  # to 0.
+ ests[abs(ests) == Inf] <- 0 + + list(estimates = ests, + stds = matrix(rep(100, length(ests[,1]) * length(ests[1,])), + length(ests[,1]))) +} + +# Implements lsei +FitDistribution2Way <- function(estimates_stds, map, + fit = NULL, + quiet = FALSE, + add_constraints = FALSE) { + X <- map + Y <- as.vector(t(estimates_stds$estimates)) + m <- dim(X)[1] + n <- dim(X)[2] + + G <- rbind2(Diagonal(n), rep(-1, n)) + H <- c(rep(0, n), -1) + if (add_constraints == TRUE) { + res <- AddConstraints(fit, X, Y, m, n, G, H) + lsei(A = res$X, B = res$Y, G = res$G, H = res$H, type = 2)$X + } else { + lsei(A = X, B = Y, G = G, H = H, type = 2)$X + } +} + +AddConstraints <- function(fit, X, Y, m, n, G, H) { + # Experimental code + # Computes the same output as FitDistribution by + # additionally throwing in constraints corresponding to + # 1-way marginals + # Requires non-NULL fit as input (with "proportion" containing marginal info) + + # Adding marginals constraints to X and Y + fstrs <- lapply(fit, function(x) x[,"string"]) # found strings + + Y <- c(Y, wt * t(fit[[1]]["proportion"]), wt * t(fit[[2]]["proportion"])) + + for (strs in fstrs[[1]]) { + indices <- which(colnames(map) %in% outer(strs, + fstrs[[2]], + function(x, y) paste(x, y, sep = "x"))) + vec <- rep(0, n) + vec[indices] <- wt + X <- rbind2(X, vec) + } + for (strs in fstrs[[2]]) { + indices <- which(colnames(map) %in% outer(fstrs[[1]], + strs, + function(x, y) paste(x, y, sep = "x"))) + vec <- rep(0, n) + vec[indices] <- wt + X <- rbind2(X, vec) + } + list(X = X, Y = Y, G = G, H = H) +} + +Decode2Way <- function(counts, map, params, fit = NULL) { + k <- params$k + p <- params$p + q <- params$q + f <- params$f + h <- params$h + m <- params$m + + S <- ncol(map) # total number of candidates + + N <- sum(counts[, 1]) + + filter_cohorts <- which(counts[, 1] != 0) # exclude cohorts with zero reports + + # stretch cohorts to bits + filter_bits <- as.vector( + t(matrix(1:nrow(map), nrow = m, byrow = TRUE)[filter_cohorts,])) + + es <- EstimateBloomCounts2Way(params, counts) + e <- list(estimates = es$estimates[filter_cohorts, , drop = FALSE], + stds = es$stds[filter_cohorts, , drop = FALSE]) + coefs <- FitDistribution2Way(e, map[filter_bits, , drop = FALSE], fit) + fit <- data.frame(String = colnames(map[filter_bits, , drop = FALSE]), + Estimate = matrix(coefs, ncol = 1), + SD = matrix(coefs, ncol = 1), + stringsAsFactors = FALSE) + rownames(fit) <- fit[,"String"] + list(fit = fit) +} diff --git a/analysis/R/read_input.R b/analysis/R/read_input.R index 95ea1b0d..051b35c4 100644 --- a/analysis/R/read_input.R +++ b/analysis/R/read_input.R @@ -101,6 +101,23 @@ ReadMapFile <- function(map_file, params = NULL, quote = "") { list(map = map, strs = strs, map_pos = map_pos) } +# This function processes the maps loaded using ReadMapFile +# Association analysis requires a map object with a map +# field that has the map split into cohorts and an rmap field +# that has all the cohorts combined +# Arguments: +# map = map object with cohorts as sparse matrix in +# object map$map +# This is the expected object from ReadMapFile +# params = data field with parameters +CorrectMapForAssoc <- function(map, params) { + map$rmap <- map$map + map$map <- lapply(1:params$m, function(i) + map$rmap[seq(from = ((i - 1) * params$k + 1), + length.out = params$k),]) + map +} + LoadMapFile <- function(map_file, params = NULL, quote = "") { # Reads the map file and creates an R binary .rda. If the .rda file already # exists, just loads that file. 
NOTE: It assumes the map file is diff --git a/analysis/tools/sum_bits_assoc.py b/analysis/tools/sum_bits_assoc.py new file mode 100755 index 00000000..8e01d669 --- /dev/null +++ b/analysis/tools/sum_bits_assoc.py @@ -0,0 +1,158 @@ +#!/usr/bin/python +# +# Copyright 2015 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Read RAPPOR values of 2 variables from stdin. +Read parameters from parameter file and a prefix. + +Output counts of bloom filter bits set for each variable (1-way totals) +and counts of pairwise bits set (2-way totals) into files with suffixes +_marg1.csv, _marg2.csv, _2way.csv respectively. + +The file formats for each of the files are as follows: +_marg1.csv, _marg2.csv +Each row corresponds to a cohort with: +num reports, total count for bit 1, total count for bit 2, ... + +_2way.csv +Each row corresponds to a cohort +The first entry corresponds to total number of reports in that cohort +The next set of values indicate 2 way counts grouped 4 elements at a time: + the first 4 refer to information about bit 1 of irr1 and bit 1 of irr2 + the next 4 refer to information about bit 1 of irr1 and bit 2 of irr2 + ... + the next 4 refer to information about bit 1 of irr1 and bit k of irr2 + the next 4 refer to information about bit 2 of irr1 and bit 1 of irr2 + (pairwise information about tuples is stored in a "1st report"-major order) + ... + the last 4 refer to information about bit k of irr1 and bit k of irr2 + + for each 4-tuple, the values represents the counts for the pair of bits from + irr1 and irr2 having the value: + 11, 01, 10, and 00, respectively. 
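+
+For example (illustration only), with 2-bit reports irr1 = '10' and
+irr2 = '01' ("bit 1" below is the least significant bit, i.e. the last
+character of the string), a single report adds one count to each of the
+four groups:
+  (bit 1 of irr1, bit 1 of irr2) = (0, 1) -> the "01" slot of group 1
+  (bit 1 of irr1, bit 2 of irr2) = (0, 0) -> the "00" slot of group 2
+  (bit 2 of irr1, bit 1 of irr2) = (1, 1) -> the "11" slot of group 3
+  (bit 2 of irr1, bit 2 of irr2) = (1, 0) -> the "10" slot of group 4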
+ + See sum_bits_assoc_test.py for an example +""" + +import csv +import sys + +import rappor + + +def SumBits(params, stdin, f_2way, f_1, f_2): + csv_in = csv.reader(stdin) + csv_out_two_way = csv.writer(f_2way) + csv_out_1 = csv.writer(f_1) + csv_out_2 = csv.writer(f_2) +# csv_out_two_way = csv.writer(open(f_2way, "w")) +# csv_out_1 = csv.writer(open(f_1, "w")) +# csv_out_2 = csv.writer(open(f_2, "w")) + + num_cohorts = params.num_cohorts + num_bloombits = params.num_bloombits + + sums = [[0] * (4 * (num_bloombits ** 2)) for _ in xrange(num_cohorts)] + sums_1 = [[0] * num_bloombits for _ in xrange(num_cohorts)] + sums_2 = [[0] * num_bloombits for _ in xrange(num_cohorts)] + num_reports = [0] * num_cohorts + + for i, row in enumerate(csv_in): + try: + (_, cohort, irr_1, irr_2) = row + except ValueError: + raise RuntimeError('Error parsing row %r' % row) + + if i == 0: + continue # skip header + + cohort = int(cohort) + try: + num_reports[cohort] += 1 + except IndexError: + raise RuntimeError('Error indexing cohort number %d (num_cohorts is %d) \ + ' % (cohort, num_cohorts)) + + if not len(irr_1) == params.num_bloombits: + raise RuntimeError( + "Expected %d bits in report 1, got %r" % + (params.num_bloombits, len(irr_1))) + if not len(irr_2) == params.num_bloombits: + raise RuntimeError( + "Expected %d bits in report 2, got %r" % + (params.num_bloombits, len(irr_2))) + # "Unrolled" joint encoding of both reports + index_array = [[3, 1], [2, 0]] + for i, c in enumerate(irr_1): + for j, d in enumerate(irr_2): + index = 4 * ((num_bloombits - i - 1) * params.num_bloombits + + num_bloombits - j - 1) + try: + diff = index_array[int(c)][int(d)] + except IndexError: + raise RuntimeError('Invalid IRRs; digits should be 0/1') + sums[cohort][index + diff] += 1 + + for i, c in enumerate(irr_1): + bit_num = num_bloombits - i - 1 # e.g. char 0 = bit 15, char 15 = bit 0 + if c == '1': + sums_1[cohort][bit_num] += 1 + else: + if c != '0': + raise RuntimeError('Invalid IRRs; digits should be 0/1') + + for i, c in enumerate(irr_2): + bit_num = num_bloombits - i - 1 # e.g. char 0 = bit 15, char 15 = bit 0 + if c == '1': + sums_2[cohort][bit_num] += 1 + else: + if c != '0': + raise RuntimeError('Invalid IRRs; digits should be 0/1') + + for cohort in xrange(num_cohorts): + # First column is the total number of reports in the cohort. + row = [num_reports[cohort]] + sums[cohort] + csv_out_two_way.writerow(row) + row = [num_reports[cohort]] + sums_1[cohort] + csv_out_1.writerow(row) + row = [num_reports[cohort]] + sums_2[cohort] + csv_out_2.writerow(row) + + +def main(argv): + try: + filename = argv[1] + prefix = argv[2] + except IndexError: + raise RuntimeError('Usage: sum_bits.py ') + with open(filename) as f: + try: + params = rappor.Params.from_csv(f) + except rappor.Error as e: + raise RuntimeError(e) + + with open(prefix + "_2way.csv", "w") as f_2way: + with open(prefix + "_marg1.csv", "w") as f_1: + with open(prefix + "_marg2.csv", "w") as f_2: + SumBits(params, sys.stdin, f_2way, f_1, f_2) + + +if __name__ == '__main__': + try: + main(sys.argv) + except RuntimeError, e: + print >>sys.stderr, e.args[0] + sys.exit(1) diff --git a/analysis/tools/sum_bits_assoc_test.py b/analysis/tools/sum_bits_assoc_test.py new file mode 100755 index 00000000..e19b9fed --- /dev/null +++ b/analysis/tools/sum_bits_assoc_test.py @@ -0,0 +1,134 @@ +#!/usr/bin/python -S +# +# Copyright 2015 Google Inc. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +sum_bits_assoc_test.py: Tests for sum_bits_assoc.py +""" + +import cStringIO +import unittest + +import rappor +import sum_bits_assoc # module under test + + +# The header doesn't matter +CSV_IN = """\ +user_id,cohort,irr1,irr2 +5,1,0011,1010 +5,1,0011,1010 +5,1,0000,0000 +""" + +# ############################### +# EXPECTED_F_2WAY +# +# NOTE: bit order is reversed. +# First row is 65 zeroes because there are no reports with cohort 0 +expected_f_2way = """\ +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,\ +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0\r +""" + +# Cohort 1 +# Total # of reports +expected_f_2way = expected_f_2way + "3," + +# Looking at LSBs of both irrs +# Total # of (11, 01, 10, 00) that appear +expected_f_2way = expected_f_2way + "0,0,2,1," + +# Report 1-major order. So looking at LSB of irr1 and 2nd LSB of irr2 +expected_f_2way = expected_f_2way + "2,0,0,1," + +# And so on ... +expected_f_2way = expected_f_2way + "0,0,2,1," +expected_f_2way = expected_f_2way + "2,0,0,1," + +# Now moving on to 2nd LSB of irr1 +expected_f_2way = expected_f_2way + ("0,0,2,1,2,0,0,1," * 2) + +# Now moving on to 3rd LSB of irr1 +# Note that for 3rd LSB of irr1 and LSB of irr2, there are three 00s +expected_f_2way = expected_f_2way + ("0,0,0,3,0,2,0,1," * 2) +# MSB of irr1 +expected_f_2way = expected_f_2way + "0,0,0,3,0,2,0,1," + "0,0,0,3,0,2,0,1\r\n" + +# EXPECTED_F_2WAY is a constant +EXPECTED_F_2WAY = expected_f_2way + +# end of EXPECTED_F_2WAY +# ############################### + +# NOTE: bit order is reversed. +EXPECTED_F_1 = """\ +0,0,0,0,0\r +3,2,2,0,0\r +""" + +# NOTE: bit order is reversed. 
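+# (Each row is: number of reports in the cohort, then one count per bloom
+# filter bit, least significant bit first.)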
+EXPECTED_F_2 = """\ +0,0,0,0,0\r +3,0,2,0,2\r +""" + +WRONG_IRR_BITS = """\ +user_id,cohort,irr1,irr2 +cli1,1,00123,11223 +""" + +WRONG_COHORT = """\ +user_id,cohort,irr1,irr2 +cli1,3,0011,0001 +""" + +class SumBitsAssocTest(unittest.TestCase): + + def setUp(self): + self.params = rappor.Params() + self.params.num_bloombits = 4 + self.params.num_cohorts = 2 + self.maxDiff = None + + def testSum(self): + stdin = cStringIO.StringIO(CSV_IN) + f_2way = cStringIO.StringIO() + f_1 = cStringIO.StringIO() + f_2 = cStringIO.StringIO() + + sum_bits_assoc.SumBits(self.params, stdin, f_2way, f_1, f_2) + self.assertMultiLineEqual(EXPECTED_F_1, f_1.getvalue()) + self.assertMultiLineEqual(EXPECTED_F_2, f_2.getvalue()) + self.assertMultiLineEqual(EXPECTED_F_2WAY, f_2way.getvalue()) + + def testErrors(self): + f_2way = cStringIO.StringIO() + f_1 = cStringIO.StringIO() + f_2 = cStringIO.StringIO() + + stdin = cStringIO.StringIO(WRONG_IRR_BITS) + self.assertRaises( + RuntimeError, sum_bits_assoc.SumBits, self.params, stdin, + f_2way, f_1, f_2) + + stdin = cStringIO.StringIO(WRONG_COHORT) + self.assertRaises( + RuntimeError, sum_bits_assoc.SumBits, self.params, stdin, + f_2way, f_1, f_2) + + +if __name__ == '__main__': + unittest.main() diff --git a/assoctest.sh b/assoctest.sh new file mode 100755 index 00000000..42e1799f --- /dev/null +++ b/assoctest.sh @@ -0,0 +1,325 @@ +#!/bin/bash +# +# Run and end-to-end association test in parallel. +# +# Usage: +# ./assoctest.sh + +# At the end, it will print an HTML summary. +# +# Three main functions are +# run [[ [ []]] - run tests matching in +# parallel, each times, additionally +# running the EM algorithm if = T +# +# run-seq [ [ []]] - ditto, except that tests are run +# sequentially +# +# run-all [ []] - run all tests, in parallel, +# each times +# +# Note: Patterns always start with a-. +# +# Examples: +# $ ./assoctest.sh run-seq a-toy # Sequential run, matches 2 cases +# $ ./assoctest.sh run-seq a-fizz 3 # Sequential, each test is run three +# times +# $ ./assoctest.sh run-all # Run all tests once +# $ ./assoctest.sh run-all 5 T # Run all tests five times with EM +# comparisons +# +# The argument is a regex in 'grep -E' format. (Detail: Don't +# use $ in the pattern, since it matches the whole spec line and not just the +# test case name.) The number of processors used in a parallel run is 5. +# +# fast_counts param inherited from regtest.sh, but currently not used + +set -o nounset +set -o pipefail +set -o errexit + +. util.sh + +readonly THIS_DIR=$(dirname $0) +readonly REPO_ROOT=$THIS_DIR +readonly CLIENT_DIR=$REPO_ROOT/client/python +readonly ASSOCTEST_DIR=_tmp/assoctest + +# All the Python tools need this +export PYTHONPATH=$CLIENT_DIR + +# Print true inputs into a file with selected prefix +print-true-inputs() { + local num_unique_values=$1 + local prefix=$2 + seq 1 $num_unique_values | awk '{print "'$prefix'" $1}' +} + +# Generate a single test case, specified by a line of the test spec. +# This is a helper function for _run_tests(). 
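+# Arguments are the fields of one spec line emitted by tests/assoctest_spec.py
+# (with the compare flag appended by _run-tests).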
+_setup-one-case() { + local test_case=$1 + + # Input parameters + local num_unique_values=$2 + local num_unique_values2=$3 + local num_clients=$4 + local num_extras=$5 + + # RAPPOR params + local num_bits=$6 + local num_hashes=$7 + local num_cohorts=$8 + local p=$9 + local q=${10} # need curly braces to get the 10th arg + local f=${11} + + banner 'Setting up parameters and candidate files for '$test_case + + local case_dir=$ASSOCTEST_DIR/$test_case + mkdir --verbose -p $case_dir + + # Save the "spec" + echo "$@" > $case_dir/spec.txt + + local params_path=$case_dir/case_params.csv + + echo 'k,h,m,p,q,f' > $params_path + echo "$num_bits,$num_hashes,$num_cohorts,$p,$q,$f" >> $params_path + + print-true-inputs $[num_unique_values+num_extras] \ + "str" > $case_dir/case_true_inputs1.txt + print-true-inputs $num_unique_values2 "opt" > $case_dir/case_true_inputs2.txt + + # Hash candidates + analysis/tools/hash_candidates.py \ + $params_path \ + < $case_dir/case_true_inputs1.txt \ + > $case_dir/case_map1.csv + + analysis/tools/hash_candidates.py \ + $params_path \ + < $case_dir/case_true_inputs2.txt \ + > $case_dir/case_map2.csv +} + +# Run a single test instance, specified by . +# This is a helper function for _run_tests(). +_run-one-instance() { + local test_case=$1 + local test_instance=$2 + + local case_dir=$ASSOCTEST_DIR/$test_case + + read -r case_name num_unique_values num_unique_values2 \ + num_clients num_extras \ + num_bits num_hashes num_cohorts p q f compare < $case_dir/spec.txt + + local instance_dir=$ASSOCTEST_DIR/$test_case/$test_instance + mkdir --verbose -p $instance_dir + + banner "Generating input" + + tests/gen_true_values_assoc.R $num_unique_values $num_unique_values2 \ + $num_clients $num_cohorts $instance_dir/case.csv + + banner "Running RAPPOR client" + tests/rappor_assoc_sim.py \ + --num-bits $num_bits \ + --num-hashes $num_hashes \ + --num-cohorts $num_cohorts \ + -p $p \ + -q $q \ + -f $f \ + < $instance_dir/case.csv \ + > "$instance_dir/case_reports.csv" + + analysis/tools/sum_bits_assoc.py \ + $case_dir/case_params.csv \ + "$instance_dir/case" \ + < $instance_dir/case_reports.csv + + + local out_dir=${instance_dir}_report + mkdir --verbose -p $out_dir + + # Currently, the summary file shows and aggregates timing of the inference + # engine, which excludes R's loading time and reading of the (possibly + # substantial) map file. Timing below is more inclusive. + TIMEFORMAT='Running analyze.R took %R seconds' + + # Setting up JSON file + json_file="{\ + \"maps\": [\"$case_dir/case_map1.csv\",\ + \"$case_dir/case_map2.csv\"],\ + \"reports\": \"$instance_dir/case_reports.csv\",\ + \"truefile\": \"$instance_dir/case.csv\",\ + \"outdir\": \"$out_dir\",\ + \"params\": \"$case_dir/case_params.csv\",\ + \"newalg\": \"false\",\ + \"numvars\": 2,\ + \"num\": $num_clients,\ + \"extras\": $num_extras,\ + \"varcandidates\": [$num_unique_values, $num_unique_values2],\ + \"verbose\": \"true\",\ + \"counts\": [\"$instance_dir/case_2way.csv\",\ + \"$instance_dir/case_marg1.csv\",\ + \"$instance_dir/case_marg2.csv\"]," + + # Adding EM comparison depending on $compare flag + if test $compare = F; then + json_file=$json_file"\"expt\": [\"external-counts\"]" + else + json_file=$json_file"\"expt\": [\"external-counts\", \ + \"external-reports-em\"]" + fi + json_file=$json_file"}" + echo $json_file > $instance_dir/analyze_inp.json + + time { + tests/compare_assoc.R --inp $instance_dir/analyze_inp.json + } +} + +# Like _run-once-case, but log to a file. 
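+# Wraps _run-one-instance; per-instance output is captured in
+# $ASSOCTEST_DIR/<test case>/<instance>_report/log.txt.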
+_run-one-instance-logged() { + local test_case_id=$1 + local test_case_run=$2 + + local log_dir=$ASSOCTEST_DIR/$test_case_id/${test_case_run}_report + mkdir --verbose -p $log_dir + + log "Started '$test_case_id' (instance $test_case_run) -- logging to $log_dir/log.txt" + _run-one-instance "$@" >$log_dir/log.txt 2>&1 \ + && log "Test case $test_case_id (instance $test_case_run) done" \ + || log "Test case $test_case_id (instance $test_case_run) failed" +} + +make-summary() { + local dir=$1 + local filename=${2:-results.html} + local instances=${3:-1} + + tests/make_summary_assoc.py $dir $instances > $dir/rows.html + + pushd $dir >/dev/null + + cat ../../tests/assoctest.html \ + | sed -e '/TABLE_ROWS/ r rows.html' \ + > $filename + + popd >/dev/null + + log "Wrote $dir/$filename" + log "URL: file://$PWD/$dir/$filename" +} + +test-error() { + local spec_regex=${1:-} + log "Some test cases failed" + if test -n "$spec_regex"; then + log "(Perhaps none matched pattern '$spec_regex')" + fi + # don't quit just yet + # exit 1 +} + +# Assuming the spec file, write a list of test case names (first column) with +# the instance ids (second column), where instance ids run from 1 to $1. +_setup-test-instances() { + local instances=$1 + + while read line; do + for i in $(seq 1 $instances); do + read case_name _ <<< $line # extract the first token + echo $case_name $i + done + done +} + +# Args: +# regexp: A pattern selecting the subset of tests to run +# instances: A number of times each test case is run +# parallel: Whether the tests are run in parallel (T/F) +# fast_counts: Whether counts are sampled directly (T/F) +# compare: Whether the tests run comparisons between EM and Marginal +# algorithms or not +# +_run-tests() { + local spec_regex=$1 # grep -E format on the spec + local instances=$2 + local parallel=$3 + local fast_counts=$4 + local $compare=$5 + + rm -r -f --verbose $ASSOCTEST_DIR + + mkdir --verbose -p $ASSOCTEST_DIR + + echo "PARAMS" + echo $spec_regex + echo $instances + echo $parallel + echo $fast_counts + echo $compare + + local func + local processors=1 + + if test $parallel = F; then + func=_run-one-instance-logged # output to the console + else + func=_run-one-instance-logged + processors=$(grep -c ^processor /proc/cpuinfo || echo 4) # POSIX-specific + if test $processors -gt 6; then # leave few CPUs for the OS + # Association tests take up a lot of memory; so restricted to a few + # processes at a time + processors=5 + else + processors=1 + fi + log "Running $processors parallel processes" + fi + + local cases_list=$ASSOCTEST_DIR/test-cases.txt + tests/assoctest_spec.py | grep -E $spec_regex | sed "s/$/ $compare/" > $cases_list + + # Generate parameters for all test cases. + cat $cases_list \ + | xargs -l -P $processors -- $0 _setup-one-case \ + || test-error + + log "Done generating parameters for all test cases" + + local instances_list=$ASSOCTEST_DIR/test-instances.txt + _setup-test-instances $instances $fast_counts < $cases_list > $instances_list + + cat $instances_list \ + | xargs -l -P $processors -- $0 $func || test-error + + log "Done running all test instances" + + make-summary $ASSOCTEST_DIR "results.html" $instances +} + +# Run tests sequentially +run-seq() { + local spec_regex=${1:-'^a-'} # grep -E format on the spec + local instances=${2:-1} + local compare=${3:-F} + + _run-tests $spec_regex $instances F T $compare +} + +# Run tests in parallel +run-all() { + local instances=${1:-1} + local compare=${2:-F} + + log "Running all tests. Can take a while." 
+ # a- for assoc tests + # F for sequential + _run-tests '^a-' $instances T T $compare +} + +"$@" diff --git a/experimental/assoc/analyze_assoc.R b/experimental/assoc/analyze_assoc.R new file mode 100755 index 00000000..100d204f --- /dev/null +++ b/experimental/assoc/analyze_assoc.R @@ -0,0 +1,184 @@ +#!/usr/bin/env Rscript +# +# Copyright 2015 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Reads map files, report files, and RAPPOR parameters to run +# an EM algorithm to estimate joint distribution over two or more variables +# +# Usage: +# $ ./analyze_assoc.R -map1 map_1.csv -map2 map_2.csv \ +# -reports reports.csv \ +# Inputs: map1, map2, reports, params +# see how options are parsed below for more information +# Outputs: +# prints a table with estimated joint probability masses +# over candidate strings +# Ex. +# ssl nossl +# intel 0.1 0.3 +# google 0.5 0.1 + +############################################################################## +############################################################################## +############################################################################## +# D E P R E C A T E D +# Please use analyze_assoc_expt.R to run assoc analysis experiments +############################################################################## +############################################################################## +############################################################################## + +library("optparse") + +options(stringsAsFactors = FALSE) + +if(!interactive()) { + option_list <- list( + # Flags + make_option(c("--map1"), default = "map_1.csv", + help = "Hashed candidates for 1st variable"), + make_option(c("--map2"), default = "map_2.csv", + help = "Hashed candidates for 2nd variable"), + make_option(c("--map3"), default = "map_3.csv", + help = "Hashed candidates for 3rd variable"), + make_option(c("--reports", "-r"), default = "reports.csv", + help = "File with raw reports as "), + make_option(c("--truefile", "-t"), default = "truedist.csv", + help = "File with true distribution generated by assoc_sim.R"), + make_option(c("--outdir", "-o"), default = ".", + help = "File where the metrics go"), + make_option(c("--params", "-p"), default = "params.csv", + help = "Filename for RAPPOR parameters"), + make_option(c("--newalg", "-a"), default = FALSE, + help = "Flag to run new EM3 algorithm or not") + ) + opts <- parse_args(OptionParser(option_list = option_list)) +} + +source("analysis/R/encode.R") +source("analysis/R/decode.R") +source("analysis/R/simulation.R") +source("analysis/R/read_input.R") +source("analysis/R/association.R") + +# This function processes the maps loaded using ReadMapFile +# Association analysis requires a map object with a map +# field that has the map split into cohorts and an rmap field +# that has all the cohorts combined +# Arguments: +# map = map object with cohorts as sparse matrix in +# object map$map +# This is the expected object from ReadMapFile +# params = data field with 
parameters +# TODO(pseudorandom): move this functionality to ReadMapFile +ProcessMap <- function(map, params) { + map$rmap <- map$map + map$map <- lapply(1:params$m, function(i) + map$rmap[seq(from = ((i - 1) * params$k + 1), + length.out = params$k),]) + map +} + +main <- function(opts) { + ptm <- proc.time() + + params <- ReadParameterFile(opts$params) + opts_map <- list(opts$map1, opts$map2, opts$map3) + map <- lapply(opts_map, function(o) + ProcessMap(ReadMapFile(o, params = params), + params = params)) + # Reports must be of the format + # cohort no, rappor bitstring 1, rappor bitstring 2 + reportsObj <- read.csv(opts$reports, + colClasses = c("integer", "character", + "character", "character"), + header = FALSE) + + # Parsing reportsObj + # ComputeDistributionEM allows for different sets of cohorts + # for each variable. Here, both sets of cohorts are identical + co <- as.list(reportsObj[1])[[1]] + cohorts <- list(co, co, co) + # Parse reports from reportObj cols 2, 3, and 4 + reports <- lapply(1:3, function(x) as.list(reportsObj[x + 1])) + + # Split strings into bit arrays (as required by assoc analysis) + reports <- lapply(1:3, function(i) { + # apply the following function to each of reports[[1]] and reports[[2]] + lapply(reports[[i]][[1]], function(x) { + # function splits strings and converts them to numeric values + as.numeric(strsplit(x, split = "")[[1]]) + }) + }) + + joint_dist <- ComputeDistributionEM(reports, cohorts, map, + ignore_other = TRUE, + quick = TRUE, + params, marginals = NULL, + estimate_var = FALSE, + new_alg = opts$newalg) + + + td <- read.csv(file = opts$truefile) + ed <- joint_dist$orig$fit + if(length(reports) == 3) { + ed <- as.data.frame(ed) + } + + # We can see if chi-squared tests show different results + # for estimated vs real distribution + print("CHI-SQUARED") + td_chisq <- chisq.test(td) + ed_chisq <- chisq.test(ed) + print(td_chisq) + print(ed_chisq) + print(l1d(td, ed, "L1 DISTANCE")) + l1d_metric <- l1d(td, ed, "") + print("JOINT_DIST$FIT") + print(signif(ed[order(rowSums(ed)),], 4)) + td_metric <- td_chisq[1][[1]][[1]] + ed_metric <- ed_chisq[1][[1]][[1]] + + print("PROC.TIME") + time_taken <- proc.time() - ptm + print(time_taken) + + metrics <- list(td_chisq = td_metric, + ed_chisq = ed_metric, + tv = l1d_metric/2, + time = time_taken[1], + dim1 = dim(ed)[[2]], + dim2 = dim(ed)[[1]]) + + # Write metrics to metrics.csv + # Report l1 distance / 2 to be consistent with histogram analysis + filename <- file.path(opts$outdir, 'metrics.csv') + write.csv(metrics, file = filename, row.names = FALSE) +} + +# L1 distance = 1 - sum(min(df1|x, df2|x)) where +# df1|x / df2|x projects the distribution to the intersection x of the +# supports of df1 and df2 +l1d <- function(df1, df2, statement = "L1 DISTANCE") { + rowsi <- intersect(rownames(df1), rownames(df2)) + colsi <- intersect(colnames(df1), colnames(df2)) + print(statement) + 1 - sum(mapply(min, + unlist(as.data.frame(df1)[rowsi, colsi], use.names = FALSE), + unlist(as.data.frame(df2)[rowsi, colsi], use.names = FALSE))) +} + +if(!interactive()) { + main(opts) +} diff --git a/experimental/assoc/assoc_sim.R b/experimental/assoc/assoc_sim.R new file mode 100755 index 00000000..1b1726de --- /dev/null +++ b/experimental/assoc/assoc_sim.R @@ -0,0 +1,279 @@ +#!/usr/bin/env Rscript +# +# Copyright 2015 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Simulates inputs on which association analysis can be run. +# Currently assoc_sim.R only supports 2 variables but can +# be easily extended to support more. +# +# Usage: +# $ ./assoc_sim.R -n 1000 +# Inputs: uvals, params, reports, map, num, unif +# see how options are parsed below for more information +# Outputs: +# reports.csv file containing reports +# map_{1, 2, ...}.csv file(s) containing maps of variables + +############################################################################## +############################################################################## +############################################################################## +# D E P R E C A T E D +# Please look at workflow to use analyze_assoc_expt.R and +# run gen_assoc_reports.R, rappor_assoc_sim.py, and sum_bits_assoc.py +# to generate inputs to association analysis +# (For more details, see _run-one-instance() in assoctest.sh) +############################################################################## +############################################################################## +############################################################################## + +library("optparse") + +options(stringsAsFactors = FALSE) + +if(!interactive()) { + option_list <- list( + make_option(c("--uvals", "-v"), + help = "Filename for list of values over which + distributions are simulated. The file is a list of + comma-separated strings each line of which refers + to a new variable."), + make_option(c("--params", "-p"), default = "params.csv", + help = "Filename for RAPPOR parameters"), + make_option(c("--reports", "-r"), default = "reports.csv", + help = "Filename for reports"), + make_option(c("--true", "-t"), default = "truedist.csv", + help = "Filename for the true distribution"), + make_option(c("--map", "-m"), default = "map", + help = "Filename *prefix* for map(s)"), + make_option(c("--num", "-n"), default = 1e05, + help = "Number of reports"), + make_option(c("--var1_num", "-z"), default = 100, + help = "Number of values for var1"), + make_option(c("--var2_num", "-y"), default = 20, + help = "Number of values for var2"), + make_option(c("--extras", "-e"), default = 1e05, + help = "How many spurious candidates does the 1st map have?"), + make_option(c("--distr", "-d"), default = "zipf3", + help = "Type of distribution. Choose between + {unif, poisson, poisson2, zipf2}"), + make_option(c("--prefix", "-x"), default = "./", + help = "Path to prefix all default files") + ) + opts <- parse_args(OptionParser(option_list = option_list)) +} + +apply_prefix <- function(path) { + paste(opts$prefix, path, sep = "") +} + +source("analysis/R/encode.R") +source("analysis/R/decode.R") +source("analysis/R/simulation.R") +source("analysis/R/read_input.R") +source("analysis/R/association.R") +source("tests/gen_counts.R") + +# Read unique values of reports from a csv file +# Inputs: filename. The file is expected to contain two rows of strings +# (one for each variable): +# "google.com", "apple.com", ... +# "ssl", "nossl", ... 
+# Returns: a list containing strings +GetUniqueValsFromFile <- function(filename) { + contents <- read.csv(filename, header = FALSE) + # Expect 2 rows of unique vals + if(nrow(contents) != 2) { + stop(paste("Unique vals file", filename, "expected to have + two rows of strings.")) + } + # Removes superfluous "" entries if the lists of unique values + # differ in length + strip_empty <- function(vec) { + vec[!vec %in% c("")] + } + list(var1 = strip_empty(as.vector(t(contents[1,]))), + var2 = strip_empty(as.vector(t(contents[2,])))) +} + +# Simulate correlated reports and write into reportsfile +# Inputs: N = number of reports +# uvals = list containing a list of unique values +# params = list with RAPPOR parameters +# distr = the type of distribution to use +# {unif, poisson, poisson2, zipfg} +# extras = whether map_1.csv has spurious candidates or not +# truefile = name of the file with true distribution +# var1_num = number of var1 candidates +# var2_num = number of var2 candidates +# *** FOR ASSOCTEST TEST SUITE, USE ONLY ZIPF2 / ZIPF3 *** +# mapfile = file to write maps into (with .csv suffixes) +# reportsfile = file to write reports into (with .csv suffix) +SimulateReports <- function(N, uvals, params, distr, extras, truefile, + var1_num, var2_num, + mapfile, reportsfile) { + # Compute true distribution + m <- params$m + + if (distr == "unif") { + # Draw uniformly from 1 to 10 + v1_samples <- as.integer(runif(N, 1, 10)) + + # Pr[var2 = N + 1 | var1 = N] = 0.5 + # Pr[var2 = N | var1 = N] = 0.5 + v2_samples <- v1_samples + sample(c(0, 1), N, replace = TRUE) + + } else if(distr == "poisson") { + # Draw from a Poisson random variable + v1_samples <- rpois(N, 1) + rep(1, N) + + # Pr[var2 = N + 1 | var1 = N] = 0.5 + # Pr[var2 = N | var1 = N] = 0.5 + v2_samples <- v1_samples + sample(c(0, 1), N, replace = TRUE) + } else if (distr == "poisson2") { + + v1_samples <- rpois(N, 1) + rep(1, N) + # supp(var2) = {1, 2} + # Pr[var2 = 1 | var1 = even] = 0.75 + # Pr[var2 = 1 | var1 = odd] = 0.25 + pr25 <- rbinom(N, 1, 0.25) + 1 + pr75 <- rbinom(N, 1, 0.75) + 1 + v2_samples <- rep(1, N) + v2_samples[v1_samples %% 2 == 0] <- pr25[v1_samples %% 2 == 0] + v2_samples[v1_samples %% 2 == 1] <- pr75[v1_samples %% 2 == 1] + } else if (distr == "zipf2" || distr == "zipf3") { + + # Zipfian over var1_num strings + partition <- RandomPartition(N, ComputePdf("zipf1.5", var1_num)) + v1_samples <- rep(1:var1_num, partition) # expand partition + # Shuffle values randomly (may take a few sec for > 10^8 inputs) + v1_samples <- sample(v1_samples) + + # supp(var2) = {1, 2, 3, ..., var2_num} + # We look at two zipfian distributions over supp(var2) + # D1 = zipfian distribution + # D2 = zipfian distr over {var2_num, ..., 4, 3, 2, 1} + # (i.e., D1 in reverse) + # var2 ~ D1 if var1 = even + # var2 ~ D2 if var1 = odd + d1 <- sample(rep(1:var2_num, + RandomPartition(N, ComputePdf("zipf1.5", var2_num)))) + d2 <- (var2_num:1)[d1] + v2_samples <- rep(1, N) + v3_samples <- rep(1, N) + v2_samples[v1_samples %% 2 == 0] <- d1[v1_samples %% 2 == 0] + v2_samples[v1_samples %% 2 == 1] <- d2[v1_samples %% 2 == 1] + if(distr == "zipf3") { + bool1 <- rbinom(N, 1, 0.25) + rep(1, N) + bool2 <- rbinom(N, 1, 0.75) + rep(1, N) + v3_samples[v1_samples %% 2 == 0] <- bool1[v1_samples %% 2 == 0] + v3_samples[v1_samples %% 2 == 1] <- bool2[v1_samples %% 2 == 1] + } + } + + if(distr == "zipf2") { + tmp_samples <- list(v1_samples, v2_samples) + } else if(distr == "zipf3") { + tmp_samples <- list(v1_samples, v2_samples, v3_samples) + } + + + # Function to 
pad strings to uval_vec if sample_vec has + # larger support than the number of strings in uval_vec + # For e.g., if samples have support {1, 2, 3, 4, ...} and uvals + # only have "value1", "value2", and "value3", samples now + # over support {"value1", "value2", "value3", "str4", ...} + PadStrings <- function(sample_vec, uval_vec) { + if (max(sample_vec) > length(uval_vec)) { + # Padding uvals to required length + len <- length(uval_vec) + max_of_samples <- max(sample_vec) + uval_vec[(len + 1):max_of_samples] <- apply( + as.matrix((len + 1):max_of_samples), + 1, + function(i) sprintf("str%d", i)) + } + uval_vec + } + + # Pad and update uvals + uvals <- lapply(1:2, function(i) PadStrings(tmp_samples[[i]], + uvals[[i]])) + + uvals[[3]] <- c("true", "false") + # Replace integers in tmp_samples with actual sample strings + samples <- lapply(1:3, function(i) uvals[[i]][tmp_samples[[i]]]) + + print("TRUE DISTR") + td <- table(samples)/sum(table(samples)) + td <- td[order(rowSums(td), decreasing = TRUE),,] + print(td) + write.table(td, file = truefile, sep = ",", col.names = TRUE, + row.names = TRUE, quote = FALSE) + # Randomly assign cohorts in each dimension + cohorts <- sample(1:m, N, replace = TRUE) + + # Create and write map into mapfile_1.csv and mapfile_2.csv + if (extras > 0) { + # spurious candidates for mapfile_1.csv + len <- length(uvals[[1]]) + as.numeric(extras) + uvals[[1]] <- PadStrings(len, uvals[[1]]) + } + map <- lapply(uvals, function(u) CreateMap(u, params)) + write.table(map[[1]]$map_pos, file = paste(mapfile, "_1.csv", sep = ""), + sep = ",", col.names = FALSE, na = "", quote = FALSE) + write.table(map[[2]]$map_pos, file = paste(mapfile, "_2.csv", sep = ""), + sep = ",", col.names = FALSE, na = "", quote = FALSE) + write.table(map[[3]]$map_pos, file = paste(mapfile, "_3.csv", sep = ""), + sep = ",", col.names = FALSE, na = "", quote = FALSE) + + # Write reports into a csv file + # Format: + # cohort, bloom filter var1, bloom filter var2 + reports <- lapply(1:3, function(i) + EncodeAll(samples[[i]], cohorts, map[[i]]$map, params)) + # Organize cohorts and reports into format + write_matrix <- cbind(as.matrix(cohorts), + as.matrix(lapply(reports[[1]], + function(x) paste(x, collapse = ""))), + as.matrix(lapply(reports[[2]], + function(x) paste(x, collapse = ""))), + as.matrix(lapply(reports[[3]], + function(x) paste(x, collapse = "")))) + write.table(write_matrix, file = reportsfile, quote = FALSE, + row.names = FALSE, col.names = FALSE, sep = ",") +} + +main <- function(opts) { + ptm <- proc.time() + + if(is.null(opts$uvals)) { + uvals = list(var1 = c("str1"), var2 = c("option1")) + } else { + uvals <- GetUniqueValsFromFile(apply_prefix(opts$uvals)) + } + params <- ReadParameterFile(apply_prefix(opts$params)) + SimulateReports(opts$num, uvals, params, opts$distr, # inuts + opts$extras, apply_prefix(opts$true), # inputs + opts$var1_num, opts$var2_num, # inputs + apply_prefix(opts$map), + apply_prefix(opts$reports)) # outputs + + print("PROC.TIME") + print(proc.time() - ptm) +} + +if(!interactive()) { + main(opts) +} diff --git a/experimental/assoc/assoc_sim_expt.R b/experimental/assoc/assoc_sim_expt.R new file mode 100755 index 00000000..5d3438ef --- /dev/null +++ b/experimental/assoc/assoc_sim_expt.R @@ -0,0 +1,250 @@ +#!/usr/bin/env Rscript +# +# Copyright 2015 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Simulates inputs on which association analysis can be run. +# Currently assoc_sim.R only supports 2 variables but can +# be easily extended to support more. +# +# Usage: +# $ ./assoc_sim_expt.R --inp sim_inp.json +# Inputs: uvals, params, reports, map, num, unif +# see how options are parsed below for more information +# Outputs: +# reports.csv file containing reports +# map_{1, 2, ...}.csv file(s) containing maps of variables + +library("jsonlite") +library("optparse") + +options(stringsAsFactors = FALSE) + +if(!interactive()) { + option_list <- list( + make_option(c("--inp"), default = "assoc_inp.json", + help = "JSON file with inputs for assoc_sim_expt")) + opts <- parse_args(OptionParser(option_list = option_list)) + inp <- fromJSON(opts$inp) +} + +apply_prefix <- function(path) { + paste(inp$prefix, path, sep = "") +} + +source("analysis/R/encode.R") +source("analysis/R/decode.R") +source("analysis/R/simulation.R") +source("analysis/R/read_input.R") +source("analysis/R/association.R") +source("tests/gen_counts.R") + +# Read unique values of reports from a csv file +# Inputs: filename. The file is expected to contain two rows of strings +# (one for each variable): +# "google.com", "apple.com", ... +# "ssl", "nossl", ... +# Returns: a list containing strings +GetUniqueValsFromFile <- function(filename) { + contents <- read.csv(filename, header = FALSE) + # Expect 2 rows of unique vals + if(nrow(contents) != 2) { + stop(paste("Unique vals file", filename, "expected to have + two rows of strings.")) + } + # Removes superfluous "" entries if the lists of unique values + # differ in length + strip_empty <- function(vec) { + vec[!vec %in% c("")] + } + list(var1 = strip_empty(as.vector(t(contents[1,]))), + var2 = strip_empty(as.vector(t(contents[2,])))) +} + +# Simulate correlated reports and write into reportsfile +# Inputs: N = number of reports +# uvals = list containing a list of unique values +# params = list with RAPPOR parameters +# distr = the type of distribution to use +# {unif, poisson, poisson2, zipfg} +# extras = whether map_1.csv has spurious candidates or not +# truefile = name of the file with true distribution +# varcandidates = list of number of candidates for each var +# *** FOR ASSOCTEST TEST SUITE, USE ONLY ZIPF2 / ZIPF3 *** +# mapfile = file to write maps into (with .csv suffixes) +# reportsfile = file to write reports into (with .csv suffix) +SimulateReports <- function(N, uvals, params, distr, extras, truefile, + varcandidates, + mapfile, reportsfile) { + # Compute true distribution + m <- params$m + + if (distr == "unif") { + # Draw uniformly from 1 to 10 + v1_samples <- as.integer(runif(N, 1, 10)) + + # Pr[var2 = N + 1 | var1 = N] = 0.5 + # Pr[var2 = N | var1 = N] = 0.5 + v2_samples <- v1_samples + sample(c(0, 1), N, replace = TRUE) + + } else if(distr == "poisson") { + # Draw from a Poisson random variable + v1_samples <- rpois(N, 1) + rep(1, N) + + # Pr[var2 = N + 1 | var1 = N] = 0.5 + # Pr[var2 = N | var1 = N] = 0.5 + v2_samples <- v1_samples + sample(c(0, 1), N, replace = TRUE) + } else if (distr == "poisson2") { + + v1_samples <- rpois(N, 1) + 
rep(1, N) + # supp(var2) = {1, 2} + # Pr[var2 = 1 | var1 = even] = 0.75 + # Pr[var2 = 1 | var1 = odd] = 0.25 + pr25 <- rbinom(N, 1, 0.25) + 1 + pr75 <- rbinom(N, 1, 0.75) + 1 + v2_samples <- rep(1, N) + v2_samples[v1_samples %% 2 == 0] <- pr25[v1_samples %% 2 == 0] + v2_samples[v1_samples %% 2 == 1] <- pr75[v1_samples %% 2 == 1] + } else if (distr == "zipf2" || distr == "zipf3") { + + var1_num <- varcandidates[[1]] + var2_num <- varcandidates[[2]] + + # Zipfian over var1_num strings + partition <- RandomPartition(N, ComputePdf("zipf1.5", var1_num)) + v1_samples <- rep(1:var1_num, partition) # expand partition + # Shuffle values randomly (may take a few sec for > 10^8 inputs) + v1_samples <- sample(v1_samples) + + # supp(var2) = {1, 2, 3, ..., var2_num} + # We look at two zipfian distributions over supp(var2) + # D1 = zipfian distribution + # D2 = zipfian distr over {var2_num, ..., 4, 3, 2, 1} + # (i.e., D1 in reverse) + # var2 ~ D1 if var1 = even + # var2 ~ D2 if var1 = odd + d1 <- sample(rep(1:var2_num, + RandomPartition(N, ComputePdf("zipf1.5", var2_num)))) + d2 <- (var2_num:1)[d1] + v2_samples <- rep(1, N) + v3_samples <- rep(1, N) + v2_samples[v1_samples %% 2 == 0] <- d1[v1_samples %% 2 == 0] + v2_samples[v1_samples %% 2 == 1] <- d2[v1_samples %% 2 == 1] + if(distr == "zipf3") { + bool1 <- rbinom(N, 1, 0.25) + rep(1, N) + bool2 <- rbinom(N, 1, 0.75) + rep(1, N) + v3_samples[v1_samples %% 2 == 0] <- bool1[v1_samples %% 2 == 0] + v3_samples[v1_samples %% 2 == 1] <- bool2[v1_samples %% 2 == 1] + } + } + + if(length(varcandidates) == 3) { + tmp_samples <- list(v1_samples, v2_samples, v3_samples) + } else if (length(varcandidates) == 2) { + tmp_samples <- list(v1_samples, v2_samples) + } + + # Function to pad strings to uval_vec if sample_vec has + # larger support than the number of strings in uval_vec + # For e.g., if samples have support {1, 2, 3, 4, ...} and uvals + # only have "value1", "value2", and "value3", samples now + # over support {"value1", "value2", "value3", "str4", ...} + PadStrings <- function(sample_vec, uval_vec) { + if (max(sample_vec) > length(uval_vec)) { + # Padding uvals to required length + len <- length(uval_vec) + max_of_samples <- max(sample_vec) + uval_vec[(len + 1):max_of_samples] <- apply( + as.matrix((len + 1):max_of_samples), + 1, + function(i) sprintf("str%d", i)) + } + uval_vec + } + + # Pad and update uvals + uvals <- lapply(1:length(varcandidates), + function(i) PadStrings(tmp_samples[[i]], + uvals[[i]])) + # Replace integers in tmp_samples with actual sample strings + samples <- lapply(1:length(varcandidates), + function(i) uvals[[i]][tmp_samples[[i]]]) + + print("TRUE DISTR") + td <- table(samples)/sum(table(samples)) + if (length(varcandidates) == 2) { + td <- td[order(rowSums(td), decreasing = TRUE),] + } else { + td <- td[order(rowSums(td), decreasing = TRUE),,] + } + print(td) + write.table(td, file = truefile, sep = ",", col.names = TRUE, + row.names = TRUE, quote = FALSE) + # Randomly assign cohorts in each dimension + cohorts <- sample(1:m, N, replace = TRUE) + + # Create and write map into mapfile_1.csv and mapfile_2.csv + if (extras > 0) { + # spurious candidates for mapfile_1.csv + len <- length(uvals[[1]]) + as.numeric(extras) + uvals[[1]] <- PadStrings(len, uvals[[1]]) + } + map <- lapply(uvals, function(u) CreateMap(u, params)) + write.table(map[[1]]$map_pos, file = paste(mapfile, "_1.csv", sep = ""), + sep = ",", col.names = FALSE, na = "", quote = FALSE) + write.table(map[[2]]$map_pos, file = paste(mapfile, "_2.csv", sep = ""), + sep = 
",", col.names = FALSE, na = "", quote = FALSE) + if(length(varcandidates) == 3) { + write.table(map[[3]]$map_pos, file = paste(mapfile, "_3.csv", sep = ""), + sep = ",", col.names = FALSE, na = "", quote = FALSE) + } + + # Write reports into a csv file + # Format: + # cohort, bloom filter var1, bloom filter var2 + reports <- lapply(1:length(varcandidates), function(i) + EncodeAll(samples[[i]], cohorts, map[[i]]$map, params)) + # Organize cohorts and reports into format + write_matrix <- cbind(as.matrix(cohorts), + sapply(reports, + function(x) as.matrix(lapply(x, + function(z) paste(z, collapse = ""))))) + write.table(write_matrix, file = reportsfile, quote = FALSE, + row.names = FALSE, col.names = FALSE, sep = ",") +} + +main <- function(inp) { + ptm <- proc.time() + + if(is.null(inp$uvals)) { + # One off case. + # TODO(pseudorandom): More sensible defaults. + uvals = list(var1 = c("str1", "str2"), var2 = c("option1", "option2")) + } else { + uvals <- GetUniqueValsFromFile(apply_prefix(inp$uvals)) + } + params <- ReadParameterFile(apply_prefix(inp$params)) + SimulateReports(inp$num, uvals, params, inp$distr, # inuts + inp$extras, apply_prefix(inp$true), # inputs + inp$varcandidates, # inputs + apply_prefix(inp$map), + apply_prefix(inp$reports)) # outputs + + print("PROC.TIME") + print(proc.time() - ptm) +} + +if(!interactive()) { + main(inp) +} diff --git a/quick_assoc.sh b/quick_assoc.sh new file mode 100755 index 00000000..024e321e --- /dev/null +++ b/quick_assoc.sh @@ -0,0 +1,75 @@ +#!/bin/bash +# +# Quick script to wrap assoc.R +# +# Usage: +# ./quick_assoc.sh [] +# +# For directory name $dir, quick_assoc.sh expects the following files: +# $dir/map1.csv -- map files +# $dir/map2.csv +# $dir/reports.csv -- these are the raw reports +# $dir/params.csv -- parameters file +# +# At the end, it will output results of the Two Way Algorithm and EM algorithm +# (if EM also is set to T) to stdout +# +# Examples: +# $ ./quick_assoc.sh . T + +readonly THIS_DIR=$(dirname $0) +readonly REPO_ROOT=$THIS_DIR +readonly CLIENT_DIR=$REPO_ROOT/client/python +readonly MAP_SUFFIX=map +readonly COUNT_SUFFIX=count + +# All the Python tools need this +export PYTHONPATH=$CLIENT_DIR + +_run-input() { + + # Read reports and compute two way counts + analysis/tools/sum_bits_assoc.py \ + $1/params.csv \ + "$1/$COUNT_SUFFIX" \ + < $1/reports.csv + + # Currently, the summary file shows and aggregates timing of the inference + # engine, which excludes R's loading time and reading of the (possibly + # substantial) map file. Timing below is more inclusive. 
+ TIMEFORMAT='Running analyze.R took %R seconds' + + # Setting up JSON file inp.json in current directory + json_file="{\ + \"time\": false, + \"maps\": [\"$1/${MAP_SUFFIX}1.csv\",\ + \"$1/${MAP_SUFFIX}2.csv\"],\ + \"reports\": \"$1/reports.csv\",\ + \"params\": \"$1/params.csv\",\ + \"numvars\": 2,\ + \"verbose\": \"false\",\ + \"counts\": [\"$1/${COUNT_SUFFIX}_2way.csv\",\ + \"${COUNT_SUFFIX}_marg1.csv\",\ + \"${COUNT_SUFFIX}_marg2.csv\"]," + + # Adding EM comparison depending on flag + if test $2 = T; then + json_file=$json_file"\"also_em\": true" + else + json_file=$json_file"\"also_em\": false" + fi + json_file=$json_file"}" + echo $json_file > inp.json + + time { + analysis/R/assoc.R --inp inp.json + } +} + +main() { + dir=$1 + also_em=${2:-F} + _run-input $dir $also_em +} + +main "$@" diff --git a/setup.sh b/setup.sh index 90b6537f..fbaaff41 100755 --- a/setup.sh +++ b/setup.sh @@ -30,7 +30,7 @@ r-packages() { # glmnet, limSolve: solvers for decode.R # RJSONIO: for analysis_tool.R sudo R -e \ - 'install.packages(c("glmnet", "optparse", "limSolve", "RUnit", "abind", "RJSONIO"), repos="http://cran.rstudio.com/")' + 'install.packages(c("glmnet", "optparse", "limSolve", "RUnit", "abind", "RJSONIO", "jsonlite"), repos="http://cran.rstudio.com/")' } # R 3.0.2 on Trusty is out of date with CRAN, so we need this workaround. diff --git a/test.sh b/test.sh index 37ef0f14..22df91f5 100755 --- a/test.sh +++ b/test.sh @@ -112,6 +112,8 @@ r-unit() { tests/gen_true_values_test.R + tests/gen_true_values_assoc_test.R + analysis/R/decode_test.R analysis/test/run_tests.R diff --git a/tests/analyze_assoc.R b/tests/analyze_assoc.R deleted file mode 100755 index 5d78806f..00000000 --- a/tests/analyze_assoc.R +++ /dev/null @@ -1,126 +0,0 @@ -#!/usr/bin/env Rscript -# -# Copyright 2015 Google Inc. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Reads map files, report files, and RAPPOR parameters to run -# an EM algorithm to estimate joint distribution over two or more variables -# -# Usage: -# $ ./analyze_assoc.R -map1 map_1.csv -map2 map_2.csv \ -# -reports reports.csv \ -# Inputs: map1, map2, reports, params -# see how options are parsed below for more information -# Outputs: -# prints a table with estimated joint probability masses -# over candidate strings -# Ex. 
-# ssl nossl -# intel 0.1 0.3 -# google 0.5 0.1 - -library("optparse") - -options(stringsAsFactors = FALSE) - -if(!interactive()) { - option_list <- list( - # Flags - make_option(c("--map1", "-m1"), default = "map_1.csv", - help = "Hashed candidates for 1st variable"), - make_option(c("--map2", "-m2"), default = "map_2.csv", - help = "Hashed candidates for 2nd variable"), - make_option(c("--reports", "-r"), default = "reports.csv", - help = "File with raw reports as "), - make_option(c("--params", "-p"), default = "params.csv", - help = "Filename for RAPPOR parameters") - ) - opts <- parse_args(OptionParser(option_list = option_list)) -} - -source("../analysis/R/encode.R") -source("../analysis/R/decode.R") -source("../analysis/R/simulation.R") -source("../analysis/R/read_input.R") -source("../analysis/R/association.R") - -# This function processes the maps loaded using ReadMapFile -# Association analysis requires a map object with a map -# field that has the map split into cohorts and an rmap field -# that has all the cohorts combined -# Arguments: -# map = map object with cohorts as sparse matrix in -# object map$map -# This is the expected object from ReadMapFile -# params = data field with parameters -# TODO(pseudorandom): move this functionality to ReadMapFile -ProcessMap <- function(map, params) { - map$rmap <- map$map - split_map <- function(i, map_struct) { - numbits <- params$k - indices <- which(as.matrix( - map_struct[((i - 1) * numbits + 1):(i * numbits),]) == TRUE, - arr.ind = TRUE) - sparseMatrix(indices[, "row"], indices[, "col"], - dims = c(numbits, max(indices[, "col"]))) - } - map$map <- lapply(1:params$m, function(i) split_map(i, map$rmap)) - map -} - -main <- function(opts) { - ptm <- proc.time() - - params <- ReadParameterFile(opts$params) - opts_map <- list(opts$map1, opts$map2) - map <- lapply(opts_map, function(o) - ProcessMap(ReadMapFile(o, params = params), - params = params)) - # Reports must be of the format - # cohort no, rappor bitstring 1, rappor bitstring 2 - reportsObj <- read.csv(opts$reports, - colClasses = c("integer", "character", "character"), - header = FALSE) - - # Parsing reportsObj - # ComputeDistributionEM allows for different sets of cohorts - # for each variable. Here, both sets of cohorts are identical - co <- as.list(reportsObj[1])[[1]] - cohorts <- list(co, co) - # Parse reports from reportObj cols 2 and 3 - reports <- lapply(1:2, function(x) as.list(reportsObj[x + 1])) - - # Split strings into bit arrays (as required by assoc analysis) - reports <- lapply(1:2, function(i) { - # apply the following function to each of reports[[1]] and reports[[2]] - lapply(reports[[i]][[1]], function(x) { - # function splits strings and converts them to numeric values - as.numeric(strsplit(x, split = "")[[1]]) - }) - }) - - joint_dist <- ComputeDistributionEM(reports, cohorts, map, - ignore_other = TRUE, - params, marginals = NULL, - estimate_var = FALSE) - # TODO(pseudorandom): Export the results to a file for further analysis - print("JOINT_DIST$FIT") - print(joint_dist$fit) - print("PROC.TIME") - print(proc.time() - ptm) -} - -if(!interactive()) { - main(opts) -} \ No newline at end of file diff --git a/tests/assoc_sim.R b/tests/assoc_sim.R deleted file mode 100755 index 3ff1e5df..00000000 --- a/tests/assoc_sim.R +++ /dev/null @@ -1,172 +0,0 @@ -#!/usr/bin/env Rscript -# -# Copyright 2015 Google Inc. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Simulates inputs on which association analysis can be run. -# Currently assoc_sim.R only supports 2 variables but can -# be easily extended to support more. -# -# Usage: -# $ ./assoc_sim.R -n 1000 -# Inputs: uvals, params, reports, map, num, unif -# see how options are parsed below for more information -# Outputs: -# reports.csv file containing reports -# map_{1, 2, ...}.csv file(s) containing maps of variables - -library("optparse") - -options(stringsAsFactors = FALSE) - -if(!interactive()) { - option_list <- list( - make_option(c("--uvals", "-v"), default = "uvals.csv", - help = "Filename for list of values over which - distributions are simulated. The file is a list of - comma-separated strings each line of which refers - to a new variable."), - make_option(c("--params", "-p"), default = "params.csv", - help = "Filename for RAPPOR parameters"), - make_option(c("--reports", "-r"), default = "reports.csv", - help = "Filename for reports"), - make_option(c("--map", "-m"), default = "map", - help = "Filename *prefix* for map(s)"), - make_option(c("--num", "-n"), default = 1e05, - help = "Number of reports"), - make_option(c("--unif", "-u"), default = FALSE, - help = "Run simulation with uniform distribution") - ) - opts <- parse_args(OptionParser(option_list = option_list)) -} - -source("../analysis/R/encode.R") -source("../analysis/R/decode.R") -source("../analysis/R/simulation.R") -source("../analysis/R/read_input.R") -source("../analysis/R/association.R") - -# Read unique values of reports from a csv file -# Inputs: filename. The file is expected to contain two rows of strings -# (one for each variable): -# "google.com", "apple.com", ... -# "ssl", "nossl", ... 
-# Returns: a list containing strings -GetUniqueValsFromFile <- function(filename) { - contents <- read.csv(filename, header = FALSE) - # Expect 2 rows of unique vals - if(nrow(contents) != 2) { - stop(paste("Unique vals file", filename, "expected to have - two rows of strings.")) - } - # Removes superfluous "" entries if the lists of unique values - # differ in length - strip_empty <- function(vec) { - vec[!vec %in% c("")] - } - list(var1 = strip_empty(as.vector(t(contents[1,]))), - var2 = strip_empty(as.vector(t(contents[2,])))) -} - -# Simulate correlated reports and write into reportsfile -# Inputs: N = number of reports -# uvals = list containing a list of unique values -# params = list with RAPPOR parameters -# unif = whether to replace poisson with uniform -# mapfile = file to write maps into (with .csv suffixes) -# reportsfile = file to write reports into (with .csv suffix) -SimulateReports <- function(N, uvals, params, unif, - mapfile, reportsfile) { - # Compute true distribution - m <- params$m - - if (unif) { - # Draw uniformly from 1 to 10 - v1_samples <- as.integer(runif(N, 1, 10)) - } else { - # Draw from a Poisson random variable - v1_samples <- rpois(N, 1) + rep(1, N) - } - - # Pr[var2 = N + 1 | var1 = N] = 0.5 - # Pr[var2 = N | var1 = N] = 0.5 - v2_samples <- v1_samples + sample(c(0, 1), N, replace = TRUE) - - tmp_samples <- list(v1_samples, v2_samples) - - # Function to pad strings to uval_vec if sample_vec has - # larger support than the number of strings in uval_vec - # For e.g., if samples have support {1, 2, 3, 4, ...} and uvals - # only have "value1", "value2", and "value3", samples now - # over support {"value1", "value2", "value3", "str4", ...} - PadStrings <- function(sample_vec, uval_vec) { - if (max(sample_vec) > length(uval_vec)) { - # Padding uvals to required length - len <- length(uval_vec) - max_of_samples <- max(sample_vec) - uval_vec[(len + 1):max_of_samples] <- apply( - as.matrix((len + 1):max_of_samples), - 1, - function(i) sprintf("str%d", i)) - } - uval_vec - } - - # Pad and update uvals - uvals <- lapply(1:2, function(i) PadStrings(tmp_samples[[i]], - uvals[[i]])) - - # Replace integers in tmp_samples with actual sample strings - samples <- lapply(1:2, function(i) uvals[[i]][tmp_samples[[i]]]) - - # Randomly assign cohorts in each dimension - cohorts <- sample(1:m, N, replace = TRUE) - - # Create and write map into mapfile_1.csv and mapfile_2.csv - map <- lapply(uvals, function(u) CreateMap(u, params)) - write.table(map[[1]]$map_pos, file = paste(mapfile, "_1.csv", sep = ""), - sep = ",", col.names = FALSE, na = "", quote = FALSE) - write.table(map[[2]]$map_pos, file = paste(mapfile, "_2.csv", sep = ""), - sep = ",", col.names = FALSE, na = "", quote = FALSE) - - # Write reports into a csv file - # Format: - # cohort, bloom filter var1, bloom filter var2 - reports <- lapply(1:2, function(i) - EncodeAll(samples[[i]], cohorts, map[[i]]$map, params)) - # Organize cohorts and reports into format - write_matrix <- cbind(as.matrix(cohorts), - as.matrix(lapply(reports[[1]], - function(x) paste(x, collapse = ""))), - as.matrix(lapply(reports[[2]], - function(x) paste(x, collapse = "")))) - write.table(write_matrix, file = reportsfile, quote = FALSE, - row.names = FALSE, col.names = FALSE, sep = ",") -} - -main <- function(opts) { - ptm <- proc.time() - - uvals <- GetUniqueValsFromFile(opts$uvals) - params <- ReadParameterFile(opts$params) - SimulateReports(opts$num, uvals, params, opts$unif, # inputs - opts$map, opts$reports) # outputs - - print("PROC.TIME") 
- print(proc.time() - ptm) -} - -if(!interactive()) { - main(opts) -} diff --git a/tests/assoctest.html new file mode 100644 index 00000000..e4b23875 --- /dev/null +++ b/tests/assoctest.html @@ -0,0 +1,98 @@ + RAPPOR assoctest.sh
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tests/assoctest_spec.py b/tests/assoctest_spec.py new file mode 100755 index 00000000..b6952730 --- /dev/null +++ b/tests/assoctest_spec.py @@ -0,0 +1,137 @@ +#!/usr/bin/python +"""Print a test spec on stdout. + +Each line has parameters for a test case. The assoctest.sh shell script reads +these lines and runs parallel processes. + +We use Python data structures so the test cases are easier to read and edit. +""" + +import optparse +import sys + +DISTRIBUTION_PARAMS_ASSOC = { + # name, num unique values 1, + # num unique values 2, num clients + 'tiny': (100, 2, int(1e03)), # test for insufficient data + 'small': (100, 10, int(1e04)), +# 'fizz-tiny': (100, 20, int(1e03)), +# 'fizz-tiny-bool': (100, 2, int(1e03)), +# 'fizz-small': (100, 20, int(1e04)), +# 'fizz-small-bool': (100, 2, int(1e04)), +# 'fizz': (100, 20, int(1e05)), +# 'fizz-large': (100, 50, int(1e05)), +# 'fizz-2large': (100, 50, int(5e05)), +# 'fizz-bool': (100, 2, int(1e05)), + 'medium': (1000, 10, int(1e05)), + 'medium2': (1000, 2, int(1e05)), + 'large': (10000, 10, int(1e06)), + 'large2': (10000, 2, int(1e06)), + 'largesquared': (int(1e04), 100, int(1e06)), + + # new test names for 2-way marginals + # includes testing for extras + 'fizz-tiny': (100, 20, int(1e03), int(1e04)), + 'fizz-tiny-bool': (100, 2, int(1e03), int(1e04)), + 'fizz-small': (100, 20, int(1e04), int(1e04)), + 'fizz-small-bool': (100, 2, int(1e04), int(1e04)), + 'fizz': (100, 20, int(1e05), int(1e04)), + 'fizz-bool': (100, 2, int(1e05), int(1e04)), + + 'toy': (5, 2, 1e04, 20), # for testing purposes only + 'compact-noextra-small': (40, 5, 1e04, 0), + 'loose-noextra-small': (100, 20, 1e04, 0), + 'compact-noextra-large': (40, 5, 1e06, 0), + 'loose-noextra-large': (100, 20, 1e06, 0), + 'compact-extra-small': (40, 5, int(1e04), int(1e04)), + 'loose-extra-small': (100, 20, int(1e04), int(1e04)), + 'compact-extra-large': (40, 5, int(1e06), int(1e04)), + 'loose-extra-large': (100, 20, int(1e06), int(1e04)), + 'compact-excess-small': (40, 5, int(1e04), int(1e05)), + 'loose-excess-small': (100, 20, int(1e04), int(1e05)), + 'compact-excess-large': (40, 5, int(1e06), int(1e05)), + 'loose-excess-large': (100, 20, int(1e06), int(1e05)), +} + +# 'k, h, m' as in params file. +BLOOMFILTER_PARAMS = { + '8x16': (8, 2, 16), # 16 cohorts, 8 bits each, 2 bits set in each + '8x32': (8, 2, 32), # 32 cohorts, 8 bits each, 2 bits set in each + '16x32': (16, 2, 32), # 32 cohorts, 16 bits each, 2 bits set in each + '8x128': (8, 2, 128), # 128 cohorts, 8 bits each, 2 bits set in each + '128x128': (128, 2, 128), # 8 cohorts, 128 bits each, 2 bits set in each +} + +# 'p, q, f' as in params file. +PRIVACY_PARAMS = { + 'eps_zero': (0, 0.99, 0), # testing purposes only! 
+ 'eps_1_1': (0.39, 0.61, 0.45), # eps_1 = 1, eps_inf = 5: + 'eps_1_5': (0.225, 0.775, 0.0), # eps_1 = 5, no eps_inf + 'eps_verysmall': (0.125, 0.875, 0.125), + 'eps_small': (0.125, 0.875, 0.5), + 'eps_chrome': (0.25, 0.75, 0.5), + 'uma_rappor_type': (0.50, 0.75, 0.5), +} + +# assoc test configuration -> +# (distribution params set, bloomfilter params set, +# privacy params set) +# The test config runs a test suite that is the cross product of all the above +# sets +ASSOC_TEST_CONFIG = { + 'distr': ( +# 'fizz-tiny', +# 'fizz-tiny-bool', +# 'fizz-small', +# 'fizz-small-bool', +# 'fizz', +# 'fizz-bool',), +# 'toy',), + 'compact-noextra-small',), +# 'loose-noextra-small',), +# 'compact-extra-small', +# 'loose-extra-small', +# 'compact-excess-small', +# 'loose-excess-small',), +# 'compact-noextra-large', +# 'loose-noextra-large', +# 'compact-extra-large', +# 'loose-extra-large', +# 'compact-excess-large', +# 'loose-excess-large'), + 'blooms': ( + '8x32',), +# '16x32',), + 'privacy': ( + 'eps_small', + 'eps_chrome',) +} + +# +# END TEST CONFIGURATION +# + +def main(argv): + rows = [] + test_case = [] + # Association tests + for distr in ASSOC_TEST_CONFIG['distr']: + for blooms in ASSOC_TEST_CONFIG['blooms']: + for privacy in ASSOC_TEST_CONFIG['privacy']: + print distr, blooms, privacy + test_name = 'a-{}-{}-{}'.format(distr, blooms, privacy) + params = (BLOOMFILTER_PARAMS[blooms] + + PRIVACY_PARAMS[privacy]) + test_case = (test_name,) + DISTRIBUTION_PARAMS_ASSOC[distr] + params + row_str = [str(element) for element in test_case] + rows.append(row_str) + + for row in rows: + print ' '.join(row) + +if __name__ == '__main__': + try: + main(sys.argv) + except RuntimeError, e: + print >>sys.stderr, 'FATAL: %s' % e + sys.exit(1) diff --git a/tests/compare_assoc.R b/tests/compare_assoc.R new file mode 100755 index 00000000..a8105662 --- /dev/null +++ b/tests/compare_assoc.R @@ -0,0 +1,549 @@ +#!/usr/bin/env Rscript +# +# Copyright 2015 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Reads map files, report files, and RAPPOR parameters to run +# an EM algorithm to estimate joint distribution over two or more variables +# +# Usage: +# $ ./compare_assoc.R --inp +# +# Input file: +# Outputs: + +library("jsonlite") +library("optparse") + +options(stringsAsFactors = FALSE) + +if(!interactive()) { + option_list <- list( + make_option(c("--inp"), default = "analyze_inp.json", + help = "JSON file with inputs for analyze_assoc_expt")) + opts <- parse_args(OptionParser(option_list = option_list)) +} + +source("analysis/R/decode2way.R") +source("analysis/R/encode.R") +source("analysis/R/decode.R") +source("analysis/R/simulation.R") +source("analysis/R/read_input.R") +source("analysis/R/association.R") +source("tests/gen_counts.R") + +# Wrapper function to print strings only if verbose flag is passed in +PrintIfVerbose <- function(string, flag = FALSE) { + if(flag == TRUE) { + print(string) + } +} + +# TV distance = L1 distance / 2 = 1 - sum(min(df1|x, df2|x)) where +# df1|x / df2|x projects the distribution to the intersection x of the +# supports of df1 and df2 +TVDistance <- function(df1, df2, statement = "TV DISTANCE") { + rowsi <- intersect(rownames(df1), rownames(df2)) + colsi <- intersect(colnames(df1), colnames(df2)) + print(statement) + 1 - sum(mapply(min, + unlist(as.data.frame(df1[rowsi, colsi]), use.names = FALSE), + unlist(as.data.frame(df2[rowsi, colsi]), use.names = FALSE))) +} + +# Function to combine reports +# Currently assume 2-way marginals +CombineReports <- function(reports1, reports2) { + # Encoding (var1, var2) \in {(0, 0), (0, 1), (1, 0), (1, 1)} + two_bits <- list(c(0, 0, 0, 1), c(0, 1, 0, 0), c(0, 0, 1, 0), c(1, 0, 0, 0)) + OuterProd <- function(x, y) { + as.vector(outer(x, y, + function(z, t) z + 2 * t)) + } + # "report1-major" order + creports <- mapply(OuterProd, reports2, reports1, + SIMPLIFY = FALSE) + # Collapse counts to bit vector according to two_bits + lapply(creports, + function(x) as.vector(sapply(x, function(z) two_bits[[z+1]]))) +} + +# Given 2 lists of maps, maps1 and maps2, the function +# combines the maps by cohort and outputs both +# cohort-organized maps and flattened versions +CombineMaps <- function(maps1, maps2) { + # Combine maps + cmap <- mapply(CombineMapsInternal, maps1, maps2) + + # Flatten map + inds <- lapply(cmap, function(x) which(x, arr.ind = TRUE)) + for (i in seq(1, length(inds))) { + inds[[i]][, 1] <- inds[[i]][, 1] + (i-1) * dim(cmap[[1]])[1] + } + inds <- do.call("rbind", inds) + crmap <- sparseMatrix(inds[, 1], inds[, 2], dims = c( + nrow(cmap[[1]]) * length(cmap), + ncol(cmap[[1]]))) + colnames(crmap) <- colnames(cmap[[1]]) + list(cmap = cmap, crmap = crmap) +} + +# Function to combine maps +# Using map1-major order for both candidates and bits of the report +# to be consistent with how CombineReports works +# Currently assume 2-way marginals +CombineMapsInternal <- function(map1, map2) { + # Retrieve set indices and dimensions + rows1 <- which(map1, arr.ind = TRUE)[,1] + cols1 <- which(map1, arr.ind = TRUE)[,2] + length1 <- dim(map1)[[1]] + width1 <- dim(map1)[[2]] + rows2 <- which(map2, arr.ind = TRUE)[,1] + cols2 <- which(map2, arr.ind = TRUE)[,2] + length2 <- dim(map2)[[1]] + width2 <- dim(map2)[[2]] + + # Now process map1 + map1fn <- function(i, j) { + i1 <- seq(1, length2) + ((i-1) * length2) + j1 <- seq(1, width2) + ((j-1) * width2) + expand.grid(i1, j1) + } + map1indices <- do.call(rbind, + mapply(map1fn, rows1, cols1, SIMPLIFY = FALSE)) + map1_big <- sparseMatrix(map1indices[,"Var1"], + 
map1indices[,"Var2"], + dims = c(length1 * length2, + width1 * width2)) + colnames(map1_big) <- t(outer(colnames(map1), + colnames(map2), + function(x, y) paste(x, y, sep = "x"))) + + # Now process map2 + map2fn <- function(i, j) { + i2 <- i + (seq(0, length1 - 1) * length2) + j2 <- j + (seq(0, width1 - 1) * width2) + expand.grid(i2, j2) + } + map2indices <- do.call(rbind, + mapply(map2fn, rows2, cols2, SIMPLIFY = FALSE)) + map2_big <- sparseMatrix(map2indices[,"Var1"], + map2indices[,"Var2"], + dims = c(length1 * length2, + width1 * width2)) + colnames(map2_big) <- t(outer(colnames(map1), + colnames(map2), + function(x, y) paste(x, y, sep = "x"))) + + # Now collate two maps with entries in (1000, 0100, 0010, 0001) + # (m1&m2, !m1 & m2, m1 & !m2, !(m1 & m2)) respectively + findices <- which(map1_big & map2_big, arr.ind = TRUE) + # 1000 + findices[, 1] <- findices[, 1] * 4 - 3 + # 0100 + indices_0100 <- which((!map1_big) & map2_big, arr.ind = TRUE) + indices_0100[, 1] <- indices_0100[, 1] * 4 - 2 + findices <- rbind(findices, indices_0100) + # 0010 + indices_0010 <- which(map1_big & (!map2_big), arr.ind = TRUE) + indices_0010[, 1] <- indices_0010[, 1] * 4 - 1 + findices <- rbind(findices, indices_0010) + # 0001 + indices_0001 <- which((!map1_big) & (!map2_big), arr.ind = TRUE) + indices_0001[, 1] <- indices_0001[, 1] * 4 + findices <- rbind(findices, indices_0001) + sm <- sparseMatrix(findices[, 1], findices[, 2], + dims = c(4 * length1 * length2, + width1 * width2)) + colnames(sm) <- colnames(map1_big) + sm +} + +GenerateNoiseMatrix <- function(params) { + p <- params$p + q <- params$q + f <- params$f + m <- params$m + k <- params$k + + p11 <- q * (1 - f/2) + p * f / 2 # probability of a true 1 reported as 1 + p01 <- p * (1 - f/2) + q * f / 2 # probability of a true 0 reported as 1 + p10 <- 1 - p11 # probability of a true 1 reported as 0 + p00 <- 1 - p01 # probability of a true 0 reported as 0 + + NoiseMatrix <- matrix(rep(0, 16), 4) + NoiseMatrix[1,] <- c(p11**2, p11*p10, p10*p11, p10**2) + NoiseMatrix[2,] <- c(p11*p01, p11*p00, p10*p01, p10*p00) + NoiseMatrix[3,] <- c(p01*p11, p01*p10, p00*p11, p00*p01) + NoiseMatrix[4,] <- c(p01**2, p00*p01, p01*p00, p00**2) + + NoiseMatrix +} + +##################################################################### +## +## Direct simulation of reports WITHOUT simulated variance +## +## Inputs: inp object (from parsing JSON) with +## num - # of reports +## params - file containing RAPPOR params +## varcandidates - list containing # of candidates for each var +## numvars - # of vars (>=2 for association) +## extra - # of extra candidates for var 1 +## +## +## Outputs: Runs simulation of two-way association analysis by directly +## simulating the counts of one way and two way marginals +## +##################################################################### +DirectSimulationOfReports <- function(inp, verbose = FALSE) { + ptm <- proc.time() + params <- ReadParameterFile(inp$params) + strconstant <- c("string", "option") + N <- inp$num + n1 <- inp$varcandidates[[1]] + n2 <- inp$varcandidates[[2]] + + # Construct unique vals for each variable using strconstant + stopifnot(length(strconstant) == inp$numvars) + uvals <- lapply(1:inp$numvars, + function(i) { + apply(as.matrix(1:inp$varcandidates[[i]]), + 1, + function(z) sprintf("%s%d", strconstant[[i]], z)) + }) + + # Add extras if any + if(inp$extras > 0) { + uvals[[1]] <- c(uvals[[1]], apply(as.matrix(1:inp$extras), 1, + function(z) sprintf("%s%d", strconstant[[1]], z + n1))) + } + + # Compute map + map <- 
lapply(uvals, function(u) CreateMap(u, params)) + + # Trim maps to real # of candidates + # Use extras only for decoding + tmap <- lapply(map[[1]]$map, function(i) i[, 1:n1]) + crmap_trimmed <- CombineMaps(tmap, map[[2]]$map)$crmap + + # Sample values to compute partition + # Zipfian over n1 strings + v1_part <- RandomPartition(N, ComputePdf("zipf1.5", n1)) + # Zipfian over n2 strings for each of variable 1 + # Distr. are correlated as in assoc_sim.R + final_part <- as.vector(sapply(1:n1, + function(i) { + v2_part <- RandomPartition(v1_part[[i]], + ComputePdf("zipf1.5", n2)) + if (i %% 2 == 0) {v2_part} else {rev(v2_part)} + })) + + td <- matrix(final_part/sum(final_part), nrow = n1, ncol = n2, byrow = TRUE) + v2_part <- RandomPartition(N, apply(td, 2, sum)) + ow_parts <- list(v1_part, v2_part) + ow_parts[[1]] <- c(ow_parts[[1]], rep(0, inp$extra)) + + # -------------- + # Generate 1-way counts + ow_counts <- lapply(1:2, function(i) + GenerateCounts(params, map[[i]]$rmap, ow_parts[[i]], 1)) + found_strings <- lapply(1:2, function(i) + Decode(ow_counts[[i]], + map[[i]]$rmap, + params, quick = TRUE)$fit[,"string"]) + # -------------- + + rownames(td) <- uvals[[1]][1:n1] # Don't take into account extras + colnames(td) <- uvals[[2]] + PrintIfVerbose("TRUE DISTRIBUTION", verbose) + PrintIfVerbose(signif(td, 4), verbose) + cohorts <- as.matrix( + apply(as.data.frame(final_part), 1, + function(count) RandomPartition(count, rep(1, params$m)))) + expanded <- apply(cohorts, 2, function(vec) rep(vec, each = ((params$k)**2)*4)) + true_ones <- apply(expanded * crmap_trimmed, 1, sum) + + NoiseMatrix <- GenerateNoiseMatrix(params) + after_noise <- as.vector(sapply(1:(length(true_ones)/4), + function(x) + t(NoiseMatrix) %*% true_ones[((x-1)*4+1):(x*4)])) + counts <- cbind(apply(cohorts, 1, sum), + matrix(after_noise, + nrow = params$m, + ncol = 4 * (params$k**2), + byrow = TRUE)) + + params2 <- params + params2$k <- (params$k ** 2) * 4 + + # Combine maps to feed into Decode2Way + # Prune first to found_strings from Decode on 1-way counts + pruned <- lapply(1:2, function(i) + lapply(map[[i]]$map, function(z) z[,found_strings[[i]]])) + crmap <- CombineMaps(pruned[[1]], pruned[[2]])$crmap + marginal <- Decode2Way(counts, crmap, params2)$fit + + # Fill in estimated results with rows and cols from td + ed <- matrix(0, nrow = (n1+inp$extra), ncol = n2) + rownames(ed) <- uvals[[1]] + colnames(ed) <- uvals[[2]] + for (cols in colnames(td)) { + for (rows in rownames(td)) { + ed[rows, cols] <- marginal[paste(rows, cols, sep = "x"), "Estimate"] + } + } + ed[is.na(ed)] <- 0 + time_taken <- proc.time() - ptm + + PrintIfVerbose("2 WAY RESULTS", verbose) + PrintIfVerbose(signif(ed, 4), verbose) + PrintIfVerbose(TVDistance(td, ed, "TV DISTANCE 2 WAY ALGORITHM"), verbose) + PrintIfVerbose("PROC.TIME", verbose) + PrintIfVerbose(time_taken, verbose) + chisq_td <- chisq.test(td)[1][[1]][[1]] + chisq_ed <- chisq.test(ed)[1][[1]][[1]] + if(is.nan(chisq_ed)) { + chisq_ed <- 0 + } + if(is.nan(chisq_td)) { + chisq_td <- 0 + } + + metrics <- list( + td_chisq = chisq_td, + ed_chisq = chisq_ed, + tv = TVDistance(td, ed, ""), + time = time_taken[1], + dim1 = length(found_strings[[1]]), + dim2 = length(found_strings[[2]]) + ) + filename <- file.path(inp$outdir, 'metrics.csv') + write.csv(metrics, file = filename, row.names = FALSE) +} + +##################################################################### +## +## Externally provided counts (gen_assoc_counts.R and sum_assoc_reports.py) +## new_decode flag allows you to switch between 
two decode algorithm choices +## Note: Only for two way associations +## +## Inputs: inp object (from parsing JSON) with +## count files (2 way counts, individual marginal counts) +## map files (2 variables) +## params file with RAPPOR params +## +## Outputs: Runs simulation of two-way association analysis reading inputs +## from counts, maps, and params file. +##################################################################### +ExternalCounts <- function(inp, verbose = FALSE, metrics_filename = "metrics.csv") { + ptm <- proc.time() + params <- ReadParameterFile(inp$params) + # Ensure sufficient maps as required by number of vars + # Correct map from ReadMapFile() for assoc analysis + stopifnot(inp$numvars == length(inp$maps)) + map <- lapply(inp$maps, function(o) + CorrectMapForAssoc(ReadMapFile(o, params = params), + params = params)) + + # (2 way counts, marginal 1 counts, marginal 2 counts) + counts <- lapply(1:3, function(i) ReadCountsFile(inp$counts[[i]])) + + params2 <- params + params2$k <- (params$k ** 2) * 4 + + # Prune candidates + fit <- lapply(1:2, function(i) + Decode(counts[[i + 1]], + map[[i]]$rmap, + params, quick = FALSE)$fit) + + found_strings = list(fit[[1]][,"string"], fit[[2]][,"string"]) + + if (length(found_strings[[1]]) == 0 || length(found_strings[[2]]) == 0) { + PrintIfVerbose("FOUND_STRINGS", verbose) + PrintIfVerbose(found_strings, verbose) + stop("No strings found in 1-way marginal.") + } + + # Combine maps to feed into Decode2Way + # Prune first to found_strings from Decode on 1-way counts + pruned <- lapply(1:2, function(i) + lapply(map[[i]]$map, function(z) z[,found_strings[[i]], drop = FALSE])) + crmap <- CombineMaps(pruned[[1]], pruned[[2]])$crmap + marginal <- Decode2Way(counts[[1]], crmap, params2, fit = fit)$fit + td <- read.csv(file = inp$truefile, header = TRUE) + td <- table(td[,3:4]) + td <- td / sum(td) + ed <- td + for (cols in colnames(td)) { + for (rows in rownames(td)) { + ed[rows, cols] <- marginal[paste(rows, cols, sep = "x"), "Estimate"] + } + } + ed[is.na(ed)] <- 0 + ed[ed<0] <- 0 + + time_taken <- proc.time() - ptm + + PrintIfVerbose(TVDistance(td, ed, "TV DISTANCE 2 WAY"), verbose) + PrintIfVerbose("PROC.TIME", verbose) + PrintIfVerbose(time_taken, verbose) + chisq_td <- chisq.test(td)[1][[1]][[1]] + chisq_ed <- chisq.test(ed)[1][[1]][[1]] + if(is.nan(chisq_td)) { + chisq_td <- 0 + } + if(is.nan(chisq_ed)) { + chisq_ed <- 0 + } + + metrics <- list( + td_chisq = chisq_td, + ed_chisq = chisq_ed, + tv = TVDistance(td, ed, ""), + time = time_taken[1], + dim1 = length(found_strings[[1]]), + dim2 = length(found_strings[[2]]) + ) + + # Write metrics to metrics_filename (default: metrics.csv) + filename <- file.path(inp$outdir, metrics_filename) + write.csv(metrics, file = filename, row.names = FALSE) +} + +##################################################################### +## +## Externally provided reports +## EM ALGORITHM +## TODO: Also support 3 way association +## +## Inputs: +## +## Outputs: +## +##################################################################### +ExternalReportsEM <- function(inp, + verbose = FALSE, + metrics_filename = "metrics.csv") { + ptm <- proc.time() + params <- ReadParameterFile(inp$params) + # Ensure sufficient maps as required by number of vars + stopifnot(inp$numvars == length(inp$maps)) + # Correct map from ReadMapFile() for assoc analysis + map <- lapply(inp$maps, function(o) + CorrectMapForAssoc(ReadMapFile(o, params = params), + params = params)) + + # Reports must be of the format + # client name, 
cohort no, rappor bitstring 1, rappor bitstring 2, ... + reportsObj <- read.csv(inp$reports, + colClasses = c("character", "integer", + rep("character", inp$numvars)), + header = TRUE) + # Ignore the first column + reportsObj <- reportsObj[,-1] + + # Parsing reportsObj + # ComputeDistributionEM allows for different sets of cohorts + # for each variable. Here, both sets of cohorts are identical + co <- as.list(reportsObj[1])[[1]] + co <- co + 1 # 1 indexing + cohorts <- rep(list(co), inp$numvars) + # Parse reports from reportObj cols 2, 3, ... + reports <- lapply(1:inp$numvars, function(x) as.list(reportsObj[x + 1])) + + # Split strings into bit arrays (as required by assoc analysis) + reports <- lapply(1:inp$numvars, function(i) { + # apply the following function to each of reports[[1]] and reports[[2]] + lapply(reports[[i]][[1]], function(x) { + # function splits strings and converts them to numeric values + # rev needed for endianness + rev(as.numeric(strsplit(x, split = "")[[1]])) + }) + }) + + params = list(params, params) + joint_dist <- ComputeDistributionEM(reports, cohorts, map, + ignore_other = TRUE, + quick = TRUE, + params, marginals = NULL, + estimate_var = FALSE) + em <- joint_dist$fit + td <- read.csv(file = inp$truefile, header = FALSE) + td <- table(td[,3:4]) + td <- td / sum(td) + time_taken <- proc.time() - ptm + + PrintIfVerbose(TVDistance(td, em, "TV DISTANCE EM"), verbose) + PrintIfVerbose("PROC.TIME", verbose) + PrintIfVerbose(time_taken, verbose) + chisq_td <- chisq.test(td)[1][[1]][[1]] + chisq_ed <- chisq.test(em)[1][[1]][[1]] + if(is.nan(chisq_td)) { + chisq_td <- 0 + } + if(is.nan(chisq_ed)) { + chisq_ed <- 0 + } + + metrics <- list( + td_chisq = chisq_td, + ed_chisq = chisq_ed, + tv = TVDistance(td, em, ""), + time = time_taken[1], + dim1 = dim(em)[[1]], + dim2 = dim(em)[[2]] + ) + + # Write metrics to metrics_filename (default: metrics.csv) + filename <- file.path(inp$outdir, metrics_filename) + write.csv(metrics, file = filename, row.names = FALSE) +} + +main <- function(opts) { + inp <- fromJSON(opts$inp) + verbose_flag <- inp$verbose + # Choose from a set of experiments to run + # direct -> direct simulation of reports (without variances) + # external-counts -> externally supplied counts for 2 way and marginals + # external-reports -> externally supplied reports + + if("direct" %in% inp$expt) { + PrintIfVerbose("Running Experiment Direct", verbose_flag) + DirectSimulationOfReports(inp, verbose = verbose_flag) + } + if ("external-counts" %in% inp$expt) { + PrintIfVerbose("Running Experiment Ext Counts", verbose_flag) + if ("direct" %in% inp$expt) { + # external-counts expt is run to compare results + ExternalCounts(inp, verbose = verbose_flag, metrics_filename = "metrics_2.csv") + } else { + ExternalCounts(inp, verbose = verbose_flag) + } + } + if ("external-reports-em" %in% inp$expt) { + PrintIfVerbose("Running Experiment Ext Reports", verbose_flag) + if (("direct" %in% inp$expt)||("external-counts" %in% inp$expt)) { + # external-reports-em expt is run to compare results + ExternalReportsEM(inp, verbose = verbose_flag, metrics_filename = "metrics_2.csv") + } else { + ExternalReportsEM(inp, verbose = verbose_flag) + } + } +} + +if(!interactive()) { + main(opts) +} diff --git a/tests/gen_true_values_assoc.R b/tests/gen_true_values_assoc.R new file mode 100755 index 00000000..779fe398 --- /dev/null +++ b/tests/gen_true_values_assoc.R @@ -0,0 +1,99 @@ +#!/usr/bin/env Rscript +# +# Copyright 2015 Google Inc. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +source('tests/gen_counts.R') + +# Usage: +# +# $ ./gen_true_values_assoc.R 100 20 10000 foo.csv +# +# Inputs: +# size of the distribution's support for var 1 +# size of the distribution's support for var 2 +# number of clients +# name of the output file +# Output: +# csv file with reports sampled according to the specified distribution. + +GenerateTrueValuesAssoc <- function(n, N, num_cohorts) { + # Inputs: n, a list of supports for vars 1, 2 + # N, the number of reports/clients + # num_cohorts, the number of cohorts + # Output: tuples of values sampled according to a zipf x zipf distr + # with support n[[1]] and n[[2]] respectively + + # Sample values to compute partition + # Resulting distribution is a correlated zipf x zipf + # distribution over 2 variables + PartitionWithCorrelation <- function(size, support, index) { + part <- RandomPartition(size, ComputePdf("zipf1.5", support)) + if (index %% 2 == 0) {part} else {rev(part)} + } + + # Zipfian over n[[1]] strings + part <- RandomPartition(N, ComputePdf("zipf1.5", n[[1]])) + # Zipfian over n[[2]] strings for each of variable 1 + final_part <- as.vector(sapply(1:n[[1]], + function(i) PartitionWithCorrelation(part[[i]], n[[2]], i))) + + final_part <- matrix(final_part, nrow = n[[1]], byrow = TRUE) + rownames(final_part) <- sapply(1:n[[1]], function(x) paste("str", x, sep = "")) + colnames(final_part) <- sapply(1:n[[2]], function(x) paste("opt", x, sep = "")) + distr <- final_part/sum(final_part) + print("DISTRIBUTION") + print(distr) + + print('PARTITION') + print(final_part) + + # Expand partition + values <- list( + rep(1:n[[1]], rowSums(final_part)), + unlist(sapply(1:n[[1]], function(x) rep(1:n[[2]], final_part[x, ])))) + + stopifnot((length(values[[1]]) == N) & + (length(values[[2]]) == N)) + + # Shuffle values randomly (may take a few sec for > 10^8 inputs) + perm <- sample(N) + values <- list(values[[1]][perm], values[[2]][perm]) + cohorts <- rep(1:N) %% num_cohorts + list(cohorts = cohorts, values = values) +} + +main <- function(argv) { + n <- list(as.integer(argv[[1]]), as.integer(argv[[2]])) + N <- as.integer(argv[[3]]) + num_cohorts <- as.integer(argv[[4]]) + out_file <- argv[[5]] + + res <- GenerateTrueValuesAssoc(n, N, num_cohorts) + # Prepend with str and opt + reports <- list(sprintf("str%d", res$values[[1]]), + sprintf("opt%d", res$values[[2]])) + + + # Paste together client name, cohort input, report1, report2 + reports <- cbind(sprintf("cli%d", 1:N), + res$cohorts, reports[[1]], reports[[2]]) + colnames(reports) <- c("client", "cohort", "value1", "value2") + write.table(reports, file = out_file, row.names = FALSE, col.names = TRUE, + sep = ",", quote = FALSE) +} + +if (length(sys.frames()) == 0) { + main(commandArgs(TRUE)) +} diff --git a/tests/gen_true_values_assoc_test.R b/tests/gen_true_values_assoc_test.R new file mode 100755 index 00000000..ebef1e77 --- /dev/null +++ b/tests/gen_true_values_assoc_test.R @@ -0,0 +1,37 @@ +#!/usr/bin/Rscript +# +# gen_reports_test.R + 
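# For orientation (a sketch of the module under test, based on its code, not
# an additional check): GenerateTrueValuesAssoc(n, N, num_cohorts) returns a
# list with
#   $cohorts -- length-N vector of cohort labels in 0..(num_cohorts - 1)
#   $values  -- list of two length-N integer vectors, bounded by n[[1]], n[[2]]
# e.g. res <- GenerateTrueValuesAssoc(list(20, 5), 1000, 32) gives
# length(res$values[[1]]) == 1000 and max(res$values[[2]]) <= 5.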
+source('analysis/R/util.R') # Log() + +source('tests/gen_true_values_assoc.R') # module under test + +library(RUnit) + +TestGenerateTrueValuesAssoc <- function() { + # list for support of var1, var2, + # total number of reports + # num_cohorts + res <- GenerateTrueValuesAssoc(list(20, 5), 1000, 32) + # print(res$values) + + # 1000 reports + checkEquals(1000, length(res$values[[1]])) + + # support(var1) <= 20 + # support(var2) <= 5 + checkTrue(max(res$values[[1]]) <= 20) + checkTrue(max(res$values[[2]]) <= 5) + + # Ensure cohorts are filled up + checkEquals(32, length(unique(res$cohort))) + + # TODO: Add tests to confirm (w.h.p.?) that certain distribution aspects are + # as expected (such as the zipfian on marginals) +} + +TestAll <- function(){ + TestGenerateTrueValuesAssoc() +} + +TestAll() diff --git a/tests/make_summary_assoc.py b/tests/make_summary_assoc.py new file mode 100755 index 00000000..0f1af04d --- /dev/null +++ b/tests/make_summary_assoc.py @@ -0,0 +1,377 @@ +#!/usr/bin/python +"""Given a regtest result tree, prints an HTML summary on stdout. + +See HTML skeleton in tests/assoctest.html. +""" + +import os +import re +import sys + + +SUMMARY_ROW = """\ + + + + + + + + + + + + + + + + + + + + + + + + + +""" + +# Navigation and links to plot. +DETAILS = """\ +

+ Up
+ %(name)s files

+""" + + +def FormatFloat(x, percent): + """Formats a floating-point number.""" + if percent: + return '{:.1f}%'.format(x * 100.0) + else: + return '{:.3f}'.format(x) + + +def FormatMeanWithSem(m_std_error, percent=False): + """Formats an estimate with standard error.""" + if m_std_error is None: + return '' + m, std_error = m_std_error + if std_error is None: + return FormatFloat(m, percent) + else: + return '{}±{}'.format( + FormatFloat(m, percent), + FormatFloat(std_error, percent)) + + +def Mean(l): + """Computes the mean (average) for a list of numbers.""" + if l: + return float(sum(l)) / len(l) + else: + return None + + +def SampleVar(l): + """Computes the sample variance for a list of numbers.""" + if len(l) > 1: + mean = Mean(l) + var = sum([(x - mean) ** 2 for x in l]) / (len(l) - 1) + return var + else: + return None + + +def StandardErrorEstimate(l): + """Returns the standard error estimate for a list of numbers. + + For a singleton the standard error is assumed to be 10% of its value. + """ + if len(l) > 1: + return (SampleVar(l) / len(l)) ** .5 + elif l: + return l[0] / 10.0 + else: + return None + + +def MeanOfMeans(dict_of_lists): + """Returns the average of averages with the standard error of the estimate. + """ + means = [Mean(dict_of_lists[key]) for key in dict_of_lists + if dict_of_lists[key]] + if means: + # Compute variances of the estimate for each sublist. + se = [StandardErrorEstimate(dict_of_lists[key]) ** 2 for key + in dict_of_lists if dict_of_lists[key]] + return (Mean(means), # Mean over all sublists + sum(se) ** .5 / len(se)) # Standard deviation of the mean + else: + return None + + +def ParseSpecFile(spec_filename, empty = False): + #Parses the spec (parameters) file. + + with open(spec_filename) as s: + spec_row = s.readline().split() + + spec_in_html = ' '.join('' % cell for cell in spec_row[1:len(spec_row)-1]) + if empty == True: + spec_in_html = ' '.join('' for cell in spec_row[1:len(spec_row)-1]) + + return spec_in_html + + +def ExtractTime(log_filename): + """Extracts the elapsed time information from the log file. + + Returns: + Elapsed time (in seconds) or None in case of failure. + """ + if os.path.isfile(log_filename): + with open(log_filename) as log: + log_str = log.read() + # Matching a line output by analyze.R. + match = re.search(r'Inference took ([0-9.]+) seconds', log_str) + if match: + return float(match.group(1)) + return None + + +def ParseMetrics(metrics_file, log_file, italics = False): + """Processes the metrics file. + + Args: + report_dir: A directory name containing metrics.csv and log.txt. + num_additional: A number of bogus candidates added to the candidate list. + + Returns a pair: + - A dictionary of metrics (some can be []). + - An HTML-formatted portion of the report row. 
+ """ + with open(metrics_file) as m: + m.readline() + metrics_row = m.readline().split(',') + + (td_chisq, ed_chisq, l1d, rtime, d1, d2) = metrics_row + + td_chisq = float(td_chisq) + ed_chisq = float(ed_chisq) + + l1d = float(l1d) + rtime = float(rtime) + + elapsed_time = ExtractTime(log_file) + + metrics_row_str = [ + '%s' % d1, + '%s' % d2, + '%.3f' % l1d, + str(rtime), + ] + + metrics_row_dict = { + 'd1': [d1], + 'd2': [d2], + 'l1d': [l1d], + 'rtime': [rtime], + 'chisqdiff': [abs(td_chisq - ed_chisq)], + } + + # return metrics formatted as HTML table entries + if(italics == True): + return (metrics_row_dict, + ' '.join('' % cell for cell in metrics_row_str)) + else: + return (metrics_row_dict, + ' '.join('' % cell for cell in metrics_row_str)) + + +def FormatCell1(test_case, test_instance, metrics_file, log_file, plot_file, + link_to_plots): + """Outputs an HTML table entry for the first cell of the row. + + The row is filled if the metrics file exist. The first cell contains a link + that for short tables points to a plot file inline, for large tables to an + external file. + + If the metrics file is missing, the link points to the log file (if one + exists) + """ + relpath_report = '{}/{}_report'.format(test_case, test_instance) + if os.path.isfile(metrics_file): + external_file = plot_file + if link_to_plots: + link = '#{}_{}'.format(test_case, test_instance) # anchor + else: + link = os.path.join(relpath_report, 'dist.png') + else: # no results likely due to an error, puts a link to the log file + external_file = log_file + link = os.path.join(relpath_report, 'log.txt') + + if os.path.isfile(external_file): + return ''.format(link, test_case) + else: # if no file to link to + return ''.format(test_case) + + +def FormatSummaryRow(metrics_lists): + """Outputs an HTML-formatted summary row.""" + means_with_sem = {} # SEM - standard error of the mean + + for key in metrics_lists: + means_with_sem[key] = MeanOfMeans(metrics_lists[key]) + # If none of the lists is longer than one element, drop the SEM component. + if means_with_sem[key] and max([len(l) for l in metrics_lists[key]]) < 2: + means_with_sem[key] = [means_with_sem[key][0], None] + + summary = { + 'name': 'Means', + 'mean_l1d': FormatMeanWithSem(means_with_sem['l1d'], percent=False), + # 'mean_chisqdiff': FormatMeanWithSem(means_with_sem['chisqdiff'], percent=False), + 'mean_rtime': FormatMeanWithSem(means_with_sem['rtime']), + } + return SUMMARY_ROW % summary + + +def FormatPlots(base_dir, test_instances): + """Outputs HTML-formatted plots.""" + result = '' + for instance in test_instances: + # A test instance is identified by the test name and the test run. + test_case, test_instance, _ = instance.split(' ') + instance_dir = test_case + '/' + test_instance + '_report' + if os.path.isfile(os.path.join(base_dir, instance_dir, 'dist.png')): + result += DETAILS % {'anchor': test_case + '_' + test_instance, + 'name': '{} (instance {})'.format(test_case, + test_instance), + 'instance_dir': instance_dir} + return result + + +def main(argv): + base_dir = argv[1] + num_instances = int(argv[2]) + + # This file has the test case names, in the order that they should be + # displayed. + path = os.path.join(base_dir, 'test-instances.txt') + with open(path) as f: + test_instances = [line.strip() for line in f] + + # Metrics are assembled into a dictionary of dictionaries. The top-level + # key is the metric name ('tv', 'fpr', etc.), the second level key is + # the test case. These keys reference a list of floats, which can be empty. 
+ metrics = { + 'l1d': {}, # l1 distance + 'chisqdiff': {}, # abs diff in values for the chisq test between true + # distr and estimated distr. + 'rtime': {}, # R run time + } + + # If there are too many tests, the plots are not included in the results + # file. Instead, rows' names are links to the corresponding .png files. + include_plots = len(test_instances) < 20 + include_plots = False + l1d_list = [] + l1d_list2 = [] + + for instance in test_instances: + # A test instance is idenfied by the test name and the test run. + test_case, test_instance = instance.split(' ') + + spec_file = os.path.join(base_dir, test_case, 'spec.txt') + if not os.path.isfile(spec_file): + raise RuntimeError('{} is missing'.format(spec_file)) + + spec_html = ParseSpecFile(spec_file) + metrics_html = '' # will be filled in later on, if metrics exist + + report_dir = os.path.join(base_dir, test_case, test_instance + '_report') + + metrics_file = os.path.join(report_dir, 'metrics.csv') + log_file = os.path.join(report_dir, 'log.txt') + plot_file = os.path.join(report_dir, 'dist.png') + + cell1_html = FormatCell1(test_case, test_instance, metrics_file, log_file, + plot_file, include_plots) + + if(int(test_instance) == 1): + l1d_list = [] + l1d_list2 = [] + + if os.path.isfile(metrics_file): + # ParseMetrics outputs an HTML table row and also updates lists + metrics_dict, metrics_html = ParseMetrics(metrics_file, log_file) + l1d_list += metrics_dict['l1d'] + # Update the metrics structure. Initialize dictionaries if necessary. + for m in metrics: + if not test_case in metrics[m]: + metrics[m][test_case] = metrics_dict[m] + else: + metrics[m][test_case] += metrics_dict[m] + + print '{}{}{}'.format(cell1_html, spec_html, metrics_html) + + # Printing metrics 2 if available + metrics_file = os.path.join(report_dir, 'metrics_2.csv') + if (os.path.isfile(metrics_file)): + metrics_dict, metrics_html = ParseMetrics(metrics_file, log_file, + italics = True) + l1d_list2 += metrics_dict['l1d'] + print '{}{}'.format(ParseSpecFile(spec_file, empty = + True), metrics_html) + + # Print summary of test instances + if(int(test_instance) == num_instances): + row_str = ['', '', + '%.3f±%.3f' % (Mean(l1d_list), StandardErrorEstimate(l1d_list)), + '', + ] + print '{}{}'.format(ParseSpecFile(spec_file, empty = + True), ' '.join('' % cell for cell in + row_str)) + if (os.path.isfile(metrics_file)): + row_str2 = ['', '', + '%.3f±%.3f' % (Mean(l1d_list2), StandardErrorEstimate(l1d_list2)), + '', + ] + print '{}{}'.format(ParseSpecFile(spec_file, empty = + True), ' '.join('' % cell for cell in + row_str2)) + + print FormatSummaryRow(metrics) + + print '' + print '
+ Test Case | Input Params | RAPPOR Params | Result Metrics
+ Input Params: d1: orig. support(var1); d2: orig. support(var2); n: num reports; e: num extras
+ RAPPOR Params: k: report bits; h: hashes; m: cohorts; p, q, f: probabilities
+ Result Metrics: d1: dimension of var1 solutions; d2: dimension of var2 solutions; tv: tot. var. distance; rtime: R runtime
+ Summary row: %(name)s | %(mean_l1d)s | %(mean_rtime)s

' # vertical space + + # Plot links. + if include_plots: + print FormatPlots(base_dir, test_instances) + else: + print ('Too many tests to include plots. ' + 'Click links within rows for details.
') + + +if __name__ == '__main__': + try: + main(sys.argv) + except RuntimeError, e: + print >>sys.stderr, 'FATAL: %s' % e + sys.exit(1) diff --git a/tests/params.csv b/tests/params.csv deleted file mode 100644 index a2114c90..00000000 --- a/tests/params.csv +++ /dev/null @@ -1,2 +0,0 @@ -k, h, m, p, q, f -16, 2, 4, 0.1, 0.9, 0.2 diff --git a/tests/rappor_assoc_sim.py b/tests/rappor_assoc_sim.py new file mode 100755 index 00000000..b46c8436 --- /dev/null +++ b/tests/rappor_assoc_sim.py @@ -0,0 +1,161 @@ +#!/usr/bin/python +# +# Copyright 2015 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tool to run RAPPOR on simulated client input. + +TODO: fill up + +Input columns: client,true_value +Ouput coumns: client,cohort,rappor +""" + +import csv +import collections +import optparse +import os +import random +import sys +import time + +import rappor # client library +try: + import fastrand +except ImportError: + print >>sys.stderr, ( + "Native fastrand module not imported; see README for speedups") + fastrand = None + + +def log(msg, *args): + if args: + msg = msg % args + print >>sys.stderr, msg + + +def CreateOptionsParser(): + p = optparse.OptionParser() + + p.add_option( + '--num-bits', type='int', metavar='INT', dest='num_bits', default=16, + help='Number of bloom filter bits.') + p.add_option( + '--num-hashes', type='int', metavar='INT', dest='num_hashes', default=2, + help='Number of hashes.') + p.add_option( + '--num-cohorts', type='int', metavar='INT', dest='num_cohorts', + default=64, help='Number of cohorts.') + + p.add_option( + '-p', type='float', metavar='FLOAT', dest='prob_p', default=1, + help='Probability p') + p.add_option( + '-q', type='float', metavar='FLOAT', dest='prob_q', default=1, + help='Probability q') + p.add_option( + '-f', type='float', metavar='FLOAT', dest='prob_f', default=1, + help='Probability f') + + choices = ['simple', 'fast'] + p.add_option( + '-r', type='choice', metavar='STR', + dest='random_mode', default='fast', choices=choices, + help='Random algorithm (%s)' % '|'.join(choices)) + + return p + +def main(argv): + (opts, argv) = CreateOptionsParser().parse_args(argv) + + # Copy flags into params + params = rappor.Params() + params.num_bloombits = opts.num_bits + params.num_hashes = opts.num_hashes + params.num_cohorts = opts.num_cohorts + params.prob_p = opts.prob_p + params.prob_q = opts.prob_q + params.prob_f = opts.prob_f + + if opts.random_mode == 'simple': + irr_rand = rappor.SimpleIrrRand(params) + elif opts.random_mode == 'fast': + if fastrand: + log('Using fastrand extension') + # NOTE: This doesn't take 'rand'. It's seeded in C with srand(). + irr_rand = fastrand.FastIrrRand(params) + else: + log('Warning: fastrand module not importable; see README for build ' + 'instructions. 
Falling back to simple randomness.') + irr_rand = rappor.SimpleIrrRand(params) + else: + raise AssertionError + # Other possible implementations: + # - random.SystemRandom (probably uses /dev/urandom on Linux) + # - HMAC-SHA256 with another secret? This could match C++ byte for byte. + # - or srand(0) might do it. + + csv_in = csv.reader(sys.stdin) + csv_out = csv.writer(sys.stdout) + + # NOTE: We can also modify the output to include + # bloombits and the prr for each variable. + # This is useful for debugging purposes + header = ('client', 'cohort', 'irr1', 'irr2') + csv_out.writerow(header) + + # TODO: It would be more instructive/efficient to construct an encoder + # instance up front per client, rather than one per row below. + start_time = time.time() + + for i, (client_str, cohort_str, true_value_1, + true_value_2) in enumerate(csv_in): + if i == 0: + if client_str != 'client': + raise RuntimeError('Expected client header, got %s' % client_str) + if cohort_str != 'cohort': + raise RuntimeError('Expected cohort header, got %s' % cohort_str) + if true_value_1 != 'value1': + raise RuntimeError('Expected value1 header, got %s' % value) + if true_value_2 != 'value2': + raise RuntimeError('Expected value2 header, got %s' % value) + continue # skip header row + + #if i == 30: # EARLY STOP + # break + + if i % 10000 == 0: + elapsed = time.time() - start_time + log('Processed %d inputs in %.2f seconds', i, elapsed) + + cohort = int(cohort_str) % params.num_cohorts + secret = client_str + e = rappor.Encoder(params, cohort, secret, irr_rand) + + # For testing purposes, call e._internal_encode() + irr_1 = e.encode(true_value_1) + irr_2 = e.encode(true_value_2) + + irr_1_str = rappor.bit_string(irr_1, params.num_bloombits) + irr_2_str = rappor.bit_string(irr_2, params.num_bloombits) + + out_row = (client_str, cohort, irr_1_str, irr_2_str) + csv_out.writerow(out_row) + + +if __name__ == "__main__": + try: + main(sys.argv) + except RuntimeError, e: + log('rappor_sim.py: FATAL: %s', e) diff --git a/tests/uvals.csv b/tests/uvals.csv index cebc17ec..18600571 100644 --- a/tests/uvals.csv +++ b/tests/uvals.csv @@ -1,2 +1,2 @@ -google.com,intel.com,yahoo.com -ssl,nossl +str1 +option1
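A minimal standalone R sketch of the pair encoding used by CombineReports() in tests/compare_assoc.R before Decode2Way(); the toy 2-bit reports r1 and r2 below are illustrative assumptions, not values from this change. Each (var1 bit, var2 bit) pair is mapped, in "report1-major" order, to a one-hot block of 4 bits -- (0,0) -> 0001, (0,1) -> 0100, (1,0) -> 0010, (1,1) -> 1000 -- so a k1-bit and a k2-bit report combine into a 4 * k1 * k2 bit vector that can be decoded like a single RAPPOR report.

two_bits <- list(c(0, 0, 0, 1), c(0, 1, 0, 0), c(0, 0, 1, 0), c(1, 0, 0, 0))
r1 <- c(1, 0)  # toy 2-bit report for var1
r2 <- c(0, 1)  # toy 2-bit report for var2
# value of each pair: var2 bit + 2 * var1 bit, with var2 varying fastest
pairs <- as.vector(outer(r2, r1, function(z, t) z + 2 * t))
# expand each pair value into its one-hot 4-bit block
combined <- as.vector(sapply(pairs, function(z) two_bits[[z + 1]]))
length(combined)  # 16 = 4 * k1 * k2, matching params2$k <- (params$k ** 2) * 4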