author     Sadaf Ebrahimi <sadafebrahimi@google.com>  2022-09-19 21:26:21 +0000
committer  Sadaf Ebrahimi <sadafebrahimi@google.com>  2022-10-14 17:51:46 +0000
commit     9d7cfa8217b3d5a501b3cf6b35511cc1f5a36948
tree       5d2b98a63d4f942bd3babd266cb81763bb59e1dc /bin
parent     912f8cc3fe4b53ebe8931f8f593e18a4f07c96b6
Writing METADATA file for rappor
Also adding missing files that are upstream
Test: Treehugger
Change-Id: Ia4275b3a5d4d84196f6efc948bc57caf50151b60
Diffstat (limited to 'bin')

 -rw-r--r--  bin/README.md                |  51
 -rw-r--r--  bin/decode-assoc             |  19
 -rw-r--r--  bin/decode-dist              |  19
 -rwxr-xr-x  bin/decode_assoc.R           | 429
 -rwxr-xr-x  bin/decode_dist.R            | 144
 -rw-r--r--  bin/hash-candidates          |   7
 -rwxr-xr-x  bin/hash_candidates.py       |  64
 -rwxr-xr-x  bin/hash_candidates_test.py  |  59
 -rw-r--r--  bin/sum-bits                 |   7
 -rwxr-xr-x  bin/sum_bits.py              |  86
 -rwxr-xr-x  bin/sum_bits_test.py         |  70
 -rwxr-xr-x  bin/test.sh                  | 261

12 files changed, 1216 insertions(+), 0 deletions(-)
diff --git a/bin/README.md b/bin/README.md
new file mode 100644
index 0000000..f4262bf
--- /dev/null
+++ b/bin/README.md
@@ -0,0 +1,51 @@
+Command Line Tools
+==================
+
+This directory contains command line tools for RAPPOR analysis.
+
+Analysis Tools
+--------------
+
+### decode-dist
+
+Decode a distribution -- requires a "counts" file (summed bits from reports),
+a map file, and a params file. See `test.sh decode-dist` in this dir for an
+example.
+
+### decode-assoc
+
+Decode a joint distribution between 2 variables ("association analysis"). See
+`test.sh decode-assoc-R` or `test.sh decode-assoc-cpp` in this dir for an
+example.
+
+Currently it only supports associating strings with booleans.
+
+### Setup
+
+Both of these tools are written in R and require several R libraries to be
+installed (see `../setup.sh r-packages`).
+
+`decode-assoc` also shells out to a native binary written in C++ if
+`--em-executable` is passed. This requires a C++ compiler (see
+`analysis/cpp/run.sh`). You can run `test.sh decode-assoc-cpp` to test it.
+
+
+Helper Tools
+------------
+
+These are simple Python implementations of tools needed for analysis. At
+Google, Chrome uses alternative C++/Go implementations of these tools.
+
+### sum-bits
+
+Given a CSV file with RAPPOR reports (IRRs), produce a "counts" CSV file on
+stdout. This is the `m x (k+1)` matrix that is used in the R analysis (where
+m = #cohorts and k = report width in bits).
+
+### hash-candidates
+
+Given a list of candidates on stdin, produce a CSV file of hashes (the "map
+file"). Each row has `m x h` cells (where m = #cohorts and h = #hashes).
+
+See the `regtest.sh` script for examples of how these tools are invoked.
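To make the `m x (k+1)` counts layout described in the README concrete, here is a small illustrative sketch in Python (toy values, not part of this change):

```python
# Toy illustration of the "counts" matrix: m = 2 cohorts, k = 4 bloom bits,
# so the file is an m x (k+1) matrix -- each row has 1 total plus k bit-sums.
counts = [
    # total  per-bit sums (reports in the cohort with that bit set)
    [100,    40, 12, 55, 31],  # cohort 0
    [90,     35, 20, 41, 28],  # cohort 1
]
assert all(len(row) == 4 + 1 for row in counts)
```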
diff --git a/bin/decode-assoc b/bin/decode-assoc
new file mode 100644
index 0000000..aaa2050
--- /dev/null
+++ b/bin/decode-assoc
@@ -0,0 +1,19 @@
+#!/bin/bash
+#
+# Decode multidimensional reports.
+#
+# This is a tiny shell wrapper around R.
+
+readonly THIS_DIR=$(dirname "$0")
+
+# NOTE: A trailing / is *required* on RAPPOR_REPO, because we use string
+# concatenation to form the absolute path. (file.path() in R doesn't do what
+# we want.)
+
+readonly RAPPOR_REPO=$THIS_DIR/../
+
+# RAPPOR_REPO is used by source() statements to find .R files.
+export RAPPOR_REPO
+
+# Make sure to reuse the same process so it can be killed easily.
+exec "$THIS_DIR/decode_assoc.R" "$@"
diff --git a/bin/decode-dist b/bin/decode-dist
new file mode 100644
index 0000000..147e41c
--- /dev/null
+++ b/bin/decode-dist
@@ -0,0 +1,19 @@
+#!/bin/bash
+#
+# Decode a distribution from summed RAPPOR reports.
+#
+# This is a tiny shell wrapper around R.
+
+readonly THIS_DIR=$(dirname "$0")
+
+# NOTE: A trailing / is *required* on RAPPOR_REPO, because we use string
+# concatenation to form the absolute path. (file.path() in R doesn't do what
+# we want.)
+
+readonly RAPPOR_REPO=$THIS_DIR/../
+
+# RAPPOR_REPO is used by source() statements to find .R files.
+export RAPPOR_REPO
+
+# Make sure to reuse the same process so it can be killed easily.
+exec "$THIS_DIR/decode_dist.R" "$@"
diff --git a/bin/decode_assoc.R b/bin/decode_assoc.R
new file mode 100755
index 0000000..58e35f2
--- /dev/null
+++ b/bin/decode_assoc.R
@@ -0,0 +1,429 @@
+#!/usr/bin/env Rscript
+#
+# Command line tool to decode multidimensional reports. It's a simple wrapper
+# around functions in association.R.
+
+library(optparse)
+
+#
+# Command line parsing. Do this first before loading libraries to catch errors
+# quickly. Loading libraries in R is slow.
+#
+
+# Display an error string and quit.
+UsageError <- function(...) {
+  cat(sprintf(...))
+  cat('\n')
+  quit(status = 1)
+}
+
+option_list <- list(
+  make_option(
+    "--metric-name", dest="metric_name", default="",
+    help="Name of the metric; metrics contain variables (required)"),
+  make_option(
+    "--reports", default="",
+    help="CSV file with reports; each variable is a column (required)"),
+  make_option(
+    "--schema", default="",
+    help="CSV file with variable types and metadata (required)"),
+  make_option(
+    "--params-dir", dest="params_dir", default="",
+    help="Directory where parameter CSV files are stored (required)"),
+
+  make_option(
+    "--var1", default="",
+    help="Name of first variable (required)"),
+  make_option(
+    "--var2", default="",
+    help="Name of second variable (required)"),
+
+  make_option(
+    "--map1", default="",
+    help="Path to map file, if var1 is a string"),
+  make_option(
+    "--map2", default="",
+    help="Path to map file, if var2 is a string"),
+
+  make_option(
+    "--output-dir", dest="output_dir", default=".",
+    help="Output directory (default .)"),
+
+  make_option(
+    "--create-bool-map", dest="create_bool_map", default=FALSE,
+    action="store_true",
+    help="Hack to use string RAPPOR to analyze boolean variables."),
+  make_option(
+    "--remove-bad-rows", dest="remove_bad_rows", default=FALSE,
+    action="store_true",
+    help="Whether we should remove rows where any value is missing (by
+          default, the program aborts with an error)"),
+
+  # Options that speed it up
+  make_option(
+    "--reports-sample-size", dest="reports_sample_size", default=-1,
+    help="Only analyze a random sample of this size. This is for
+          limiting the execution time at the expense of accuracy."),
+  make_option(
+    "--num-cores", dest="num_cores", default=1,
+    help="Number of cores for mclapply to use. Speeds up the parts
+          of the computation proportional to the number of reports,
+          EXCEPT the EM step, which can be sped up by native code."),
+  make_option(
+    "--max-em-iters", dest="max_em_iters", default=1000,
+    help="Maximum number of EM iterations"),
+  make_option(
+    "--em-executable", dest="em_executable", default="",
+    help="Shell out to this executable for an accelerated implementation
+          of EM."),
+  make_option(
+    "--tmp-dir", dest="tmp_dir", default="/tmp",
+    help="Use this tmp dir to communicate with the EM executable")
+)
+
+ParseOptions <- function() {
+  # NOTE: This API is bad; if you add positional_arguments, the return value
+  # changes!
+  parser <- OptionParser(option_list = option_list)
+  opts <- parse_args(parser)
+
+  if (opts$metric_name == "") {
+    UsageError("--metric-name is required.")
+  }
+  if (opts$reports == "") {
+    UsageError("--reports is required.")
+  }
+  if (opts$schema == "") {
+    UsageError("--schema is required.")
+  }
+  if (opts$params_dir == "") {
+    UsageError("--params-dir is required.")
+  }
+  if (opts$var1 == "") {
+    UsageError("--var1 is required.")
+  }
+  if (opts$var2 == "") {
+    UsageError("--var2 is required.")
+  }
+
+  return(opts)
+}
+
+if (!interactive()) {
+  opts <- ParseOptions()
+}
+
+#
+# Load libraries and source our own code.
+#
+
+library(RJSONIO)  # toJSON()
+
+# So we don't have to change pwd
+source.rappor <- function(rel_path) {
+  abs_path <- paste0(Sys.getenv("RAPPOR_REPO", ""), rel_path)
+  source(abs_path)
+}
+
+source.rappor("analysis/R/association.R")
+source.rappor("analysis/R/fast_em.R")
+source.rappor("analysis/R/read_input.R")
+source.rappor("analysis/R/util.R")
+
+options(stringsAsFactors = FALSE)
+options(max.print = 100)  # So our structure() debug calls look better
+
+CreateAssocStringMap <- function(all_cohorts_map, params) {
+  # Processes the map loaded using ReadMapFile and turns it into something
+  # that association.R can use. Namely, we want a map per cohort.
+  #
+  # Arguments:
+  #   all_cohorts_map: map matrix, as for single variable analysis
+  #   params: encoding parameters
+
+  if (nrow(all_cohorts_map) != (params$m * params$k)) {
+    stop(sprintf(
+        "Map matrix has invalid dimensions: m * k = %d, nrow(map) = %d",
+        params$m * params$k, nrow(all_cohorts_map)))
+  }
+
+  k <- params$k
+  map_by_cohort <- lapply(0 : (params$m-1), function(cohort) {
+    begin <- cohort * k
+    end <- (cohort + 1) * k
+    all_cohorts_map[(begin+1) : end, ]
+  })
+
+  list(all_cohorts_map = all_cohorts_map, map_by_cohort = map_by_cohort)
+}
+
+# Hack to create a map for booleans. We should use closed-form formulas
+# instead.
+CreateAssocBoolMap <- function(params) {
+  names <- c("FALSE", "TRUE")
+
+  map_by_cohort <- lapply(1:params$m, function(unused_cohort) {
+    # The (1,1) cell is false and the (1,2) cell is true.
+    m <- sparseMatrix(c(1), c(2), dims = c(1, 2))
+    colnames(m) <- names
+    m
+  })
+
+  all_cohorts_map <- sparseMatrix(1:params$m, rep(2, params$m))
+  colnames(all_cohorts_map) <- names
+
+  list(map_by_cohort = map_by_cohort, all_cohorts_map = all_cohorts_map)
+}
+
+ResultMatrixToDataFrame <- function(m, string_var_name, bool_var_name) {
+  # Args:
+  #   m: A 2D matrix as output by ComputeDistributionEM, e.g.
+  #            bing.com yahoo.com google.com       Other
+  #     TRUE  0.2718526 0.1873424 0.19637704 0.003208933
+  #     Other 0.1404581 0.1091826 0.08958427 0.001994163
+  # Returns:
+  #   A flattened data frame, e.g.
+
+  # Name the dimensions of the matrix.
+  dim_names <- list()
+  # TODO: generalize this. Right now we're assuming the first dimension is
+  # boolean.
+  dim_names[[bool_var_name]] <- c('TRUE', 'FALSE')
+  dim_names[[string_var_name]] <- dimnames(m)[[2]]
+
+  dimnames(m) <- dim_names
+
+  # http://stackoverflow.com/questions/15885111/create-data-frame-from-a-matrix-in-r
+  fit_df <- as.data.frame(as.table(m))
+
+  # The as.table conversion gives you a Freq column. Call it "proportion" to
+  # be consistent with single variable analysis.
+  colnames(fit_df)[colnames(fit_df) == "Freq"] <- "proportion"
+
+  fit_df
+}
+
+main <- function(opts) {
+  Log("decode-assoc")
+  Log("argv:")
+  print(commandArgs(TRUE))
+
+  schema <- read.csv(opts$schema)
+  Log("Read %d vars from schema", nrow(schema))
+
+  schema1 <- schema[schema$metric == opts$metric_name &
+                    schema$var == opts$var1, ]
+  if (nrow(schema1) == 0) {
+    UsageError("Couldn't find metric '%s', field '%s' in schema",
+               opts$metric_name, opts$var1)
+  }
+  schema2 <- schema[schema$metric == opts$metric_name &
+                    schema$var == opts$var2, ]
+  if (nrow(schema2) == 0) {
+    UsageError("Couldn't find metric '%s', field '%s' in schema",
+               opts$metric_name, opts$var2)
+  }
+
+  if (schema1$params != schema2$params) {
+    UsageError('var1 and var2 should have the same params (%s != %s)',
+               schema1$params, schema2$params)
+  }
+  params_name <- schema1$params
+  params_path <- file.path(opts$params_dir, paste0(params_name, '.csv'))
+  params <- ReadParameterFile(params_path)
+
+  var1_type <- schema1$var_type
+  var2_type <- schema2$var_type
+
+  # Right now we're assuming that --var1 is a string and --var2 is a boolean.
+  # TODO: Remove these limitations.
+  if (var1_type != "string") {
+    UsageError("Variable 1 should be a string (%s is of type %s)", opts$var1,
+               var1_type)
+  }
+  if (var2_type != "boolean") {
+    UsageError("Variable 2 should be a boolean (%s is of type %s)", opts$var2,
+               var2_type)
+  }
+
+  if (opts$map1 == "") {
+    UsageError("--map1 must be provided when --var1 is a string (var = %s)",
+               opts$var1)
+  }
+
+  # Example cache speedup for 100k map file: 31 seconds to load map and write
+  # cache; vs 2.2 seconds to read cache.
+  string_params <- params
+  map <- LoadMapFile(opts$map1, string_params)
+
+  # Important: first column is cohort (integer); the rest are variables, which
+  # are ASCII bit strings.
+  reports <- read.csv(opts$reports, colClasses=c("character"), as.is = TRUE)
+
+  Log("Read %d reports. Preview:", nrow(reports))
+  print(head(reports))
+  cat('\n')
+
+  # Filter bad reports first
+  is_empty1 <- reports[[opts$var1]] == ""
+  is_empty2 <- reports[[opts$var2]] == ""
+  Log('Found %d blank values in %s', sum(is_empty1), opts$var1)
+  Log('Found %d blank values in %s', sum(is_empty2), opts$var2)
+
+  is_empty <- is_empty1 | is_empty2  # boolean vectors
+  Log('%d bad rows', sum(is_empty))
+  if (sum(is_empty) > 0) {
+    if (opts$remove_bad_rows) {
+      reports <- reports[!is_empty, ]
+      Log('Removed %d rows, giving %d rows', sum(is_empty), nrow(reports))
+    } else {
+      stop("Found bad rows and --remove-bad-rows wasn't passed")
+    }
+  }
+
+  N <- nrow(reports)
+
+  if (N == 0) {
+    # Use an arbitrary error code when there is nothing to analyze, so we can
+    # distinguish this from more serious failures.
+    Log("No reports to analyze. Exiting with code 9.")
+    quit(status = 9)
+  }
+
+  # Sample reports if specified.
+  if (opts$reports_sample_size != -1) {
+    if (N > opts$reports_sample_size) {
+      indices <- sample(1:N, opts$reports_sample_size)
+      reports <- reports[indices, ]
+      Log("Created a sample of %d reports", nrow(reports))
+    } else {
+      Log("Got less than %d reports, not sampling", opts$reports_sample_size)
+    }
+  }
+
+  num_vars <- 2  # hard-coded for now, since there is --var1 and --var2.
+
+  # Convert strings to integers
+  cohorts <- as.integer(reports$cohort)
+
+  # Hack for Chrome: like AdjustCounts in decode_dist.R.
+  cohorts <- cohorts %% params$m
+
+  # Assume the input has 0-based cohorts, and change to 1-based cohorts.
+  cohorts <- cohorts + 1
+
+  # i.e. create a list of length 2, with identical cohorts.
+  # NOTE: Basic RAPPOR doesn't need cohorts.
+  cohorts_list <- rep(list(cohorts), num_vars)
+
+  # TODO: We should use the closed-form formulas rather than calling the
+  # solver, and not require this flag.
+  if (!opts$create_bool_map) {
+    stop("ERROR: pass --create-bool-map to analyze booleans.")
+  }
+
+  bool_params <- params
+  # HACK: Make this the boolean. The Decode() step uses k. (Note that R makes
+  # a copy here)
+  bool_params$k <- 1
+
+  params_list <- list(bool_params, string_params)
+
+  Log('CreateAssocStringMap')
+  string_map <- CreateAssocStringMap(map$map, params)
+
+  Log('CreateAssocBoolMap')
+  bool_map <- CreateAssocBoolMap(params)
+
+  map_list <- list(bool_map, string_map)
+
+  string_var <- reports[[opts$var1]]
+  bool_var <- reports[[opts$var2]]
+
+  Log('Preview of string var:')
+  print(head(table(string_var)))
+  cat('\n')
+
+  Log('Preview of bool var:')
+  print(head(table(bool_var)))
+  cat('\n')
+
+  # Split ASCII strings into arrays of numerics (as required by association.R)
+
+  Log('Splitting string reports (%d cores)', opts$num_cores)
+  string_reports <- mclapply(string_var, function(x) {
+    # function splits strings and converts them to numeric values
+    # rev needed for endianness
+    rev(as.integer(strsplit(x, split = "")[[1]]))
+  }, mc.cores = opts$num_cores)
+
+  Log('Splitting bool reports (%d cores)', opts$num_cores)
+  # Has to be a list of length-1 integer vectors
+  bool_reports <- mclapply(bool_var, function(x) {
+    as.integer(x)
+  }, mc.cores = opts$num_cores)
+
+  reports_list <- list(bool_reports, string_reports)
+
+  Log('Association for %d vars', length(reports_list))
+
+  if (opts$em_executable != "") {
+    Log('Will shell out to %s for native EM implementation',
+        opts$em_executable)
+    em_iter_func <- ConstructFastEM(opts$em_executable, opts$tmp_dir)
+  } else {
+    Log('Will use R implementation of EM (slow)')
+    em_iter_func <- EM
+  }
+
+  assoc_result <- ComputeDistributionEM(reports_list, cohorts_list, map_list,
+                                        ignore_other = FALSE,
+                                        params_list = params_list,
+                                        marginals = NULL,
+                                        estimate_var = FALSE,
+                                        num_cores = opts$num_cores,
+                                        em_iter_func = em_iter_func,
+                                        max_em_iters = opts$max_em_iters)
+
+  # This happens if the marginal can't be decoded.
+  if (is.null(assoc_result)) {
+    stop("ComputeDistributionEM failed.")
+  }
+
+  # NOTE: It would be nicer if reports_list, cohorts_list, etc. were indexed
+  # by names like 'domain' rather than numbers, and the resulting
+  # assoc_result$fit matrix had corresponding named dimensions. Instead we
+  # call ResultMatrixToDataFrame to do this.
+
+  fit <- assoc_result$fit
+  fit_df <- ResultMatrixToDataFrame(fit, opts$var1, opts$var2)
+
+  Log("Association results:")
+  print(fit_df)
+  cat('\n')
+
+  results_csv_path <- file.path(opts$output_dir, 'assoc-results.csv')
+  write.csv(fit_df, file = results_csv_path, row.names = FALSE)
+  Log("Wrote %s", results_csv_path)
+
+  # Measure elapsed time as close to the end as possible
+  total_elapsed_time <- proc.time()[['elapsed']]
+
+  metrics <- list(num_reports = N,
+                  reports_sample_size = opts$reports_sample_size,
+                  # fit is a matrix
+                  estimate_dimensions = dim(fit),
+                  # should sum to near 1.0
+                  sum_estimates = sum(fit),
+                  total_elapsed_time = total_elapsed_time,
+                  em_elapsed_time = assoc_result$em_elapsed_time,
+                  num_em_iters = assoc_result$num_em_iters)
+
+  metrics_json_path <- file.path(opts$output_dir, 'assoc-metrics.json')
+  writeLines(toJSON(metrics), con = metrics_json_path)
+  Log("Wrote %s", metrics_json_path)
+
+  Log('DONE decode-assoc')
+}
+
+if (!interactive()) {
+  main(opts)
+}
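A subtle step in decode_assoc.R above is turning each ASCII report into a numeric vector. This Python sketch mirrors the R expression `rev(as.integer(strsplit(x, split = "")[[1]]))` on an invented 4-bit report:

```python
# IRR strings are written high-order bit first, so the reversal puts the
# low-order bloom-filter bit at index 0 of the resulting vector.
def split_report(irr):
    return [int(c) for c in reversed(irr)]

assert split_report('0011') == [1, 1, 0, 0]  # invented 4-bit report
```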
diff --git a/bin/decode_dist.R b/bin/decode_dist.R
new file mode 100755
index 0000000..5c83f74
--- /dev/null
+++ b/bin/decode_dist.R
@@ -0,0 +1,144 @@
+#!/usr/bin/env Rscript
+#
+# Command line tool to decode a RAPPOR data set. It is a simple wrapper for
+# Decode() in decode.R.
+
+library(optparse)
+
+#
+# Command line parsing. Do this first before loading libraries to catch errors
+# quickly. Loading libraries in R is slow.
+#
+
+# For command line error checking.
+UsageError <- function(...) {
+  cat(sprintf(...))
+  cat('\n')
+  quit(status = 1)
+}
+
+option_list <- list(
+  # Inputs
+  make_option("--map", default="", help="Map file (required)"),
+  make_option("--counts", default="", help="Counts file (required)"),
+  make_option("--params", default="", help="Params file (required)"),
+  make_option("--output-dir", dest="output_dir", default=".",
+              help="Output directory (default .)"),
+
+  make_option("--correction", default="FDR", help="Correction method"),
+  make_option("--alpha", default=.05, help="Alpha level"),
+
+  make_option("--adjust-counts-hack", dest="adjust_counts_hack",
+              default=FALSE, action="store_true",
+              help="Allow the counts file to have more rows than cohorts.
+                    Most users should not use this.")
+)
+
+ParseOptions <- function() {
+  # NOTE: This API is bad; if you add positional_arguments, the return value
+  # changes!
+  parser <- OptionParser(option_list = option_list)
+  opts <- parse_args(parser)
+
+  if (opts$map == "") {
+    UsageError("--map is required.")
+  }
+  if (opts$counts == "") {
+    UsageError("--counts is required.")
+  }
+  if (opts$params == "") {
+    UsageError("--params is required.")
+  }
+  return(opts)
+}
+
+if (!interactive()) {
+  opts <- ParseOptions()
+}
+
+#
+# Load libraries and source our own code.
+#
+
+library(RJSONIO)
+
+# So we don't have to change pwd
+source.rappor <- function(rel_path) {
+  abs_path <- paste0(Sys.getenv("RAPPOR_REPO", ""), rel_path)
+  source(abs_path)
+}
+
+source.rappor("analysis/R/read_input.R")
+source.rappor("analysis/R/decode.R")
+source.rappor("analysis/R/util.R")
+
+source.rappor("analysis/R/alternative.R")
+
+options(stringsAsFactors = FALSE)
+
+
+main <- function(opts) {
+  Log("decode-dist")
+  Log("argv:")
+  print(commandArgs(TRUE))
+
+  Log("Loading inputs")
+
+  # Run a single model if all inputs are specified.
+  params <- ReadParameterFile(opts$params)
+  counts <- ReadCountsFile(opts$counts, params,
+                           adjust_counts = opts$adjust_counts_hack)
+  counts <- AdjustCounts(counts, params)
+
+
+  # The left-most column has totals.
+  num_reports <- sum(counts[, 1])
+
+  map <- LoadMapFile(opts$map, params)
+
+  Log("Decoding %d reports", num_reports)
+  res <- Decode(counts, map$map, params, correction = opts$correction,
+                alpha = opts$alpha)
+  Log("Done decoding")
+
+  if (nrow(res$fit) == 0) {
+    Log("FATAL: Analysis returned no strings.")
+    quit(status = 1)
+  }
+
+  # Write analysis results as CSV.
+  results_csv_path <- file.path(opts$output_dir, 'results.csv')
+  write.csv(res$fit, file = results_csv_path, row.names = FALSE)
+
+  # Write residual histogram as a png.
+  results_png_path <- file.path(opts$output_dir, 'residual.png')
+  png(results_png_path)
+  breaks <- pretty(res$residual, n = 200)
+  histogram <- hist(res$residual, breaks, plot = FALSE)
+  # Convert the histogram to frequencies.
+  histogram$counts <- histogram$counts / sum(histogram$counts)
+  plot(histogram, main = "Histogram of the residual",
+       xlab = sprintf("Residual (observed - explained, %d x %d values)",
+                      params$m, params$k))
+  dev.off()
+
+  res$metrics$total_elapsed_time <- proc.time()[['elapsed']]
+
+  # Write summary as JSON (scalar values).
+  metrics_json_path <- file.path(opts$output_dir, 'metrics.json')
+  m <- toJSON(res$metrics)
+  writeLines(m, con = metrics_json_path)
+  Log("Wrote %s, %s, and %s", results_csv_path, results_png_path,
+      metrics_json_path)
+
+  # TODO:
+  # - These are in a 2-column 'parameters' and 'values' format. Should these
+  #   just be a plain list?
+  # - Should any of these privacy params be in metrics.json?
+
+  Log("Privacy summary:")
+  print(res$privacy)
+  cat("\n")
+
+  Log('DONE')
+}
+
+if (!interactive()) {
+  main(opts)
+}
diff --git a/bin/hash-candidates b/bin/hash-candidates
new file mode 100644
index 0000000..ed65fcb
--- /dev/null
+++ b/bin/hash-candidates
@@ -0,0 +1,7 @@
+#!/bin/bash
+#
+# Shell wrapper around hash_candidates.py.
+
+readonly THIS_DIR=$(dirname "$0")
+
+PYTHONPATH=$THIS_DIR/../client/python $THIS_DIR/hash_candidates.py "$@"
diff --git a/bin/hash_candidates.py b/bin/hash_candidates.py
new file mode 100755
index 0000000..e59295e
--- /dev/null
+++ b/bin/hash_candidates.py
@@ -0,0 +1,64 @@
+#!/usr/bin/python
+#
+# Copyright 2014 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Given a list of candidates on stdin, produce a file of hashes ("map file").
+"""
+
+import csv
+import sys
+
+import rappor
+
+
+def HashCandidates(params, stdin, stdout):
+  num_bloombits = params.num_bloombits
+  csv_out = csv.writer(stdout)
+
+  for line in stdin:
+    word = line.strip()
+    row = [word]
+    for cohort in xrange(params.num_cohorts):
+      bloom_bits = rappor.get_bloom_bits(word, cohort, params.num_hashes,
+                                         num_bloombits)
+      for bit_to_set in bloom_bits:
+        # bits are indexed from 1. Add a fixed offset for each cohort.
+        # NOTE: This detail could be omitted from the map file format, and
+        # done in R.
+        row.append(cohort * num_bloombits + (bit_to_set + 1))
+    csv_out.writerow(row)
+
+
+def main(argv):
+  try:
+    filename = argv[1]
+  except IndexError:
+    raise RuntimeError('Usage: hash_candidates.py <params file>')
+  with open(filename) as f:
+    try:
+      params = rappor.Params.from_csv(f)
+    except rappor.Error as e:
+      raise RuntimeError(e)
+
+  HashCandidates(params, sys.stdin, sys.stdout)
+
+
+if __name__ == '__main__':
+  try:
+    main(sys.argv)
+  except RuntimeError, e:
+    print >>sys.stderr, e.args[0]
+    sys.exit(1)
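To spell out the indexing in `HashCandidates` above: with k bloom bits, 0-based bit b of cohort c is stored as the 1-based global index `c*k + b + 1`. A sketch with hypothetical hash outputs standing in for the real ones from `rappor.get_bloom_bits`:

```python
# Mirrors the row construction in hash_candidates.py, with hypothetical
# precomputed bloom bits in place of rappor.get_bloom_bits().
k = 16                    # num_bloombits
bloom_bits = {0: [4, 0],  # cohort -> 0-based bits (invented values)
              1: [9, 9]}

row = ['apple']
for cohort in sorted(bloom_bits):
    for bit in bloom_bits[cohort]:
        row.append(cohort * k + bit + 1)  # 1-based, cohort-offset index

assert row == ['apple', 5, 1, 26, 26]  # cf. hash_candidates_test.py below
```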
diff --git a/bin/hash_candidates_test.py b/bin/hash_candidates_test.py
new file mode 100755
index 0000000..2d0c4f1
--- /dev/null
+++ b/bin/hash_candidates_test.py
@@ -0,0 +1,59 @@
+#!/usr/bin/python -S
+#
+# Copyright 2014 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+hash_candidates_test.py: Tests for hash_candidates.py
+"""
+
+import cStringIO
+import unittest
+
+import rappor
+import hash_candidates  # module under test
+
+
+STDIN = """\
+apple
+banana
+carrot
+"""
+
+EXPECTED_CSV_OUT = """\
+apple,5,1,26,26,38,34,63,62\r
+banana,12,14,28,24,37,34,62,49\r
+carrot,4,12,25,21,48,38,61,54\r
+"""
+
+
+class HashCandidatesTest(unittest.TestCase):
+
+  def setUp(self):
+    self.params = rappor.Params()
+    self.params.num_bloombits = 16
+    self.params.num_cohorts = 4
+    self.params.num_hashes = 2
+
+  def testHash(self):
+    stdin = cStringIO.StringIO(STDIN)
+    stdout = cStringIO.StringIO()
+
+    hash_candidates.HashCandidates(self.params, stdin, stdout)
+
+    self.assertMultiLineEqual(EXPECTED_CSV_OUT, stdout.getvalue())
+
+
+if __name__ == '__main__':
+  unittest.main()
diff --git a/bin/sum-bits b/bin/sum-bits
new file mode 100644
index 0000000..bfa9b44
--- /dev/null
+++ b/bin/sum-bits
@@ -0,0 +1,7 @@
+#!/bin/bash
+#
+# Shell wrapper around sum_bits.py.
+
+readonly THIS_DIR=$(dirname "$0")
+
+PYTHONPATH=$THIS_DIR/../client/python $THIS_DIR/sum_bits.py "$@"
diff --git a/bin/sum_bits.py b/bin/sum_bits.py
new file mode 100755
index 0000000..f211656
--- /dev/null
+++ b/bin/sum_bits.py
@@ -0,0 +1,86 @@
+#!/usr/bin/python
+#
+# Copyright 2014 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Read the RAPPOR'd values on stdin, and sum the bits to produce a Counting
+Bloom filter by cohort. This can then be analyzed by R.
+"""
+
+import csv
+import sys
+
+import rappor
+
+
+def SumBits(params, stdin, stdout):
+  csv_in = csv.reader(stdin)
+  csv_out = csv.writer(stdout)
+
+  num_cohorts = params.num_cohorts
+  num_bloombits = params.num_bloombits
+
+  sums = [[0] * num_bloombits for _ in xrange(num_cohorts)]
+  num_reports = [0] * num_cohorts
+
+  for i, row in enumerate(csv_in):
+    try:
+      (user_id, cohort, unused_bloom, unused_prr, irr) = row
+    except ValueError:
+      raise RuntimeError('Error parsing row %r' % row)
+
+    if i == 0:
+      continue  # skip header
+
+    cohort = int(cohort)
+    num_reports[cohort] += 1
+
+    if not len(irr) == params.num_bloombits:
+      raise RuntimeError(
+          'Expected %d bits, got %r' % (params.num_bloombits, len(irr)))
+    for i, c in enumerate(irr):
+      bit_num = num_bloombits - i - 1  # e.g. char 0 = bit 15, char 15 = bit 0
+      if c == '1':
+        sums[cohort][bit_num] += 1
+      else:
+        if c != '0':
+          raise RuntimeError('Invalid IRR -- digits should be 0 or 1')
+
+  for cohort in xrange(num_cohorts):
+    # First column is the total number of reports in the cohort.
+    row = [num_reports[cohort]] + sums[cohort]
+    csv_out.writerow(row)
+
+
+def main(argv):
+  try:
+    filename = argv[1]
+  except IndexError:
+    raise RuntimeError('Usage: sum_bits.py <params file>')
+  with open(filename) as f:
+    try:
+      params = rappor.Params.from_csv(f)
+    except rappor.Error as e:
+      raise RuntimeError(e)
+
+  SumBits(params, sys.stdin, sys.stdout)
+
+
+if __name__ == '__main__':
+  try:
+    main(sys.argv)
+  except RuntimeError, e:
+    print >>sys.stderr, e.args[0]
+    sys.exit(1)
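The bit-numbering detail in `SumBits` above deserves a worked example; this sketch (invented report) shows how character positions map to bit numbers:

```python
# The IRR string is written high-order bit first: for k = 16, char 0 is
# bit 15 and char 15 is bit 0, hence bit_num = num_bloombits - i - 1.
irr = '0000000000000001'  # invented report with only bit 0 set
k = len(irr)
set_bits = [k - i - 1 for i, c in enumerate(irr) if c == '1']
assert set_bits == [0]
```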
+""" + +import csv +import sys + +import rappor + + +def SumBits(params, stdin, stdout): + csv_in = csv.reader(stdin) + csv_out = csv.writer(stdout) + + num_cohorts = params.num_cohorts + num_bloombits = params.num_bloombits + + sums = [[0] * num_bloombits for _ in xrange(num_cohorts)] + num_reports = [0] * num_cohorts + + for i, row in enumerate(csv_in): + try: + (user_id, cohort, unused_bloom, unused_prr, irr) = row + except ValueError: + raise RuntimeError('Error parsing row %r' % row) + + if i == 0: + continue # skip header + + cohort = int(cohort) + num_reports[cohort] += 1 + + if not len(irr) == params.num_bloombits: + raise RuntimeError( + "Expected %d bits, got %r" % (params.num_bloombits, len(irr))) + for i, c in enumerate(irr): + bit_num = num_bloombits - i - 1 # e.g. char 0 = bit 15, char 15 = bit 0 + if c == '1': + sums[cohort][bit_num] += 1 + else: + if c != '0': + raise RuntimeError('Invalid IRR -- digits should be 0 or 1') + + for cohort in xrange(num_cohorts): + # First column is the total number of reports in the cohort. + row = [num_reports[cohort]] + sums[cohort] + csv_out.writerow(row) + + +def main(argv): + try: + filename = argv[1] + except IndexError: + raise RuntimeError('Usage: sum_bits.py <params file>') + with open(filename) as f: + try: + params = rappor.Params.from_csv(f) + except rappor.Error as e: + raise RuntimeError(e) + + SumBits(params, sys.stdin, sys.stdout) + + +if __name__ == '__main__': + try: + main(sys.argv) + except RuntimeError, e: + print >>sys.stderr, e.args[0] + sys.exit(1) diff --git a/bin/sum_bits_test.py b/bin/sum_bits_test.py new file mode 100755 index 0000000..91c109f --- /dev/null +++ b/bin/sum_bits_test.py @@ -0,0 +1,70 @@ +#!/usr/bin/python -S +# +# Copyright 2014 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +sum_bits_test.py: Tests for sum_bits.py +""" + +import cStringIO +import unittest + +import rappor +import sum_bits # module under test + + +CSV_IN = """\ +user_id,cohort,bloom,prr,rappor +5,1,dummy,dummy,0000111100001111 +5,1,dummy,dummy,0000000000111100 +""" + +# NOTE: bit order is reversed. 
diff --git a/bin/test.sh b/bin/test.sh
new file mode 100755
index 0000000..6b0381e
--- /dev/null
+++ b/bin/test.sh
@@ -0,0 +1,261 @@
+#!/bin/bash
+usage() {
+echo "
+
+  Simple smoke tests for the decode-dist and decode-assoc tools. These will
+  fail if your machine doesn't have the right R libraries.
+
+  Usage:
+    ./test.sh <function name>
+
+  Example:
+    ./test.sh decode-assoc-R-smoke       # test pure R implementation
+    ./test.sh decode-assoc-cpp-smoke     # test with analysis/cpp/fast_em.cc
+    ./test.sh decode-assoc-cpp-converge  # run for longer with C++
+    ./test.sh decode-assoc-tensorflow
+"
+}
+
+set -o nounset
+set -o pipefail
+set -o errexit
+
+readonly THIS_DIR=$(dirname "$0")
+readonly RAPPOR_SRC=$(cd $THIS_DIR/.. && pwd)
+readonly EM_CPP_EXECUTABLE=$RAPPOR_SRC/analysis/cpp/_tmp/fast_em
+
+source $RAPPOR_SRC/util.sh
+
+readonly ASSOC_TESTDATA_DIR=_tmp/decode-assoc-test
+readonly DIST_TESTDATA_DIR=_tmp/decode-dist-test
+
+# Clear the R cache for the map files.
+clear-cached-files() {
+  local dir=$1
+  find $dir -name '*.rda' | xargs --no-run-if-empty -- rm --verbose
+}
+
+write-dist-testdata() {
+  local input_dir=$DIST_TESTDATA_DIR/input
+
+  mkdir -p $input_dir
+
+  clear-cached-files $DIST_TESTDATA_DIR
+
+  # Right now, we copy a case from regtest.sh. (./demo.sh quick-python
+  # creates just this case.)
+  local case_dir=$RAPPOR_SRC/_tmp/python/demo3
+
+  cp --verbose $case_dir/1/case_counts.csv $input_dir/counts.csv
+  cp --verbose $case_dir/case_map.csv $input_dir/map.csv
+  cp --verbose $case_dir/case_params.csv $input_dir/params.csv
+}
+
+decode-dist() {
+  write-dist-testdata
+
+  local output_dir=$DIST_TESTDATA_DIR
+
+  local input_dir=$DIST_TESTDATA_DIR/input
+
+  # Uses the ./demo.sh regtest files
+  time $RAPPOR_SRC/bin/decode-dist \
+    --counts $input_dir/counts.csv \
+    --map $input_dir/map.csv \
+    --params $input_dir/params.csv \
+    --output-dir $output_dir
+
+  echo
+  head $output_dir/results.csv
+  echo
+  cat $output_dir/metrics.json
+}
+
+write-assoc-testdata() {
+  # 'build' has intermediate build files; 'input' is the final input to the
+  # decode-assoc tool.
+  local build_dir=$ASSOC_TESTDATA_DIR/build
+  local input_dir=$ASSOC_TESTDATA_DIR/input
+
+  mkdir -p $build_dir $input_dir
+
+  clear-cached-files $ASSOC_TESTDATA_DIR
+
+  cat >$build_dir/true_values.csv <<EOF
+domain,flag..HTTPS
+google.com,1
+google.com,1
+google.com,1
+google.com,1
+google.com,0
+yahoo.com,1
+yahoo.com,0
+bing.com,1
+bing.com,1
+bing.com,0
+EOF
+
+  local num_bits=8
+  local num_hashes=1
+  local num_cohorts=128
+
+  local prob_p=0.25
+  local prob_q=0.75
+  local prob_f=0.5
+
+  # There are 10 rows of true values above; each one is repeated this many
+  # times, and 10 * 5000 = 50,000 reports is enough to eyeball the accuracy
+  # of the results.
+  local assoc_testdata_count=5000
+
+  PYTHONPATH=$RAPPOR_SRC/client/python \
+    $RAPPOR_SRC/tests/rappor_sim.py \
+    --assoc-testdata $assoc_testdata_count \
+    --num-bits $num_bits \
+    --num-hashes $num_hashes \
+    --num-cohorts $num_cohorts \
+    -p $prob_p \
+    -q $prob_q \
+    -f $prob_f \
+    < $build_dir/true_values.csv \
+    > $input_dir/reports.csv
+
+  # Output two bad rows: each row is missing one of the columns.
+  cat >$build_dir/bad_rows.txt <<EOF
+c0,0,10101010,
+c0,0,,0
+EOF
+
+  # Make CSV file with the header
+  cat - $build_dir/bad_rows.txt > $input_dir/bad_rows.csv <<EOF
+client,cohort,domain,flag..HTTPS
+EOF
+
+  # Make reports file with bad rows
+  cat $input_dir/reports.csv $build_dir/bad_rows.txt \
+    > $input_dir/reports_bad_rows.csv
+
+  # Define a string variable and a boolean variable.
+  cat >$input_dir/rappor-vars.csv <<EOF
+metric, var, var_type, params
+m,domain,string,m_params
+m,flag..HTTPS,boolean,m_params
+EOF
+
+  cat >$input_dir/m_params.csv <<EOF
+k,h,m,p,q,f
+$num_bits,$num_hashes,$num_cohorts,$prob_p,$prob_q,$prob_f
+EOF
+
+  # Add a string with a double quote to test quoting behavior
+  cat >$build_dir/domain_candidates.csv <<EOF
+google.com
+yahoo.com
+bing.com
+q"q
+EOF
+
+  # Hash candidates to create map.
+  $RAPPOR_SRC/bin/hash-candidates $input_dir/m_params.csv \
+    < $build_dir/domain_candidates.csv \
+    > $input_dir/domain_map.csv
+
+  banner "Wrote testdata in $input_dir (intermediate files in $build_dir)"
+}
+
+# Helper function to run decode-assoc with testdata.
+decode-assoc-helper() {
+  write-assoc-testdata
+
+  local output_dir=$1
+  shift
+
+  local build_dir=$ASSOC_TESTDATA_DIR/build
+  local input_dir=$ASSOC_TESTDATA_DIR/input
+
+  time $RAPPOR_SRC/bin/decode-assoc \
+    --metric-name m \
+    --schema $input_dir/rappor-vars.csv \
+    --reports $input_dir/reports.csv \
+    --params-dir $input_dir \
+    --var1 domain \
+    --var2 flag..HTTPS \
+    --map1 $input_dir/domain_map.csv \
+    --create-bool-map \
+    --max-em-iters 10 \
+    --num-cores 2 \
+    --output-dir $output_dir \
+    --tmp-dir $output_dir \
+    "$@"
+
+  head $output_dir/assoc-*
+
+  # Print true values for comparison
+  echo
+  echo "$build_dir/true_values.csv:"
+  cat "$build_dir/true_values.csv"
+}
+
+# Quick smoke test for R version.
+decode-assoc-R-smoke() {
+  local output_dir=_tmp/R
+  mkdir -p $output_dir
+  decode-assoc-helper $output_dir
+}
+
+# Test what happens when there are bad rows.
+decode-assoc-bad-rows() {
+  local output_dir=_tmp/bad
+  mkdir -p $output_dir
+
+  # Later flags override earlier ones
+
+  # Reports + bad rows
+  decode-assoc-helper $output_dir \
+    --reports $ASSOC_TESTDATA_DIR/input/reports_bad_rows.csv \
+    --remove-bad-rows \
+    "$@"
+
+  # ONLY bad rows
+  decode-assoc-helper $output_dir \
+    --reports $ASSOC_TESTDATA_DIR/input/bad_rows.csv \
+    --remove-bad-rows \
+    "$@"
+}
+
+build-em-executable() {
+  pushd $RAPPOR_SRC/analysis/cpp >/dev/null
+  ./run.sh build-fast-em
+  popd >/dev/null
+}
+
+decode-assoc-cpp-smoke() {
+  local output_dir=_tmp/cpp
+  mkdir -p $output_dir
+
+  build-em-executable
+
+  decode-assoc-helper $output_dir \
+    --em-executable "$EM_CPP_EXECUTABLE" "$@"
+}
+
+decode-assoc-cpp-converge() {
+  # With the data we have, this converges and exits before 1000 iterations.
+  decode-assoc-cpp-smoke --max-em-iters 1000
+}
+
+decode-assoc-tensorflow() {
+  local output_dir=_tmp/tensorflow
+  mkdir -p $output_dir
+
+  decode-assoc-helper $output_dir \
+    --em-executable $RAPPOR_SRC/analysis/tensorflow/fast_em.sh "$@"
+}
+
+decode-assoc-tensorflow-converge() {
+  decode-assoc-tensorflow --max-em-iters 1000
+}
+
+if test $# -eq 0; then
+  usage
+else
+  "$@"
+fi
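For orientation, the smoke tests above chain the tools in a fixed order: hash the candidates into a map, sum the report bits into counts, then decode. A hedged Python sketch of that pipeline (file names are made up; the flags are the ones the scripts accept):

```python
import subprocess

def run(cmd, stdin=None, stdout=None):
    subprocess.check_call(cmd, stdin=stdin, stdout=stdout)

# 1. Hash candidate strings into a map file (reads candidates on stdin).
with open('candidates.txt') as fin, open('map.csv', 'w') as fout:
    run(['bin/hash-candidates', 'params.csv'], stdin=fin, stdout=fout)

# 2. Sum report bits into a counts file (reads a reports CSV on stdin).
with open('reports.csv') as fin, open('counts.csv', 'w') as fout:
    run(['bin/sum-bits', 'params.csv'], stdin=fin, stdout=fout)

# 3. Decode the distribution from the summed counts.
run(['bin/decode-dist', '--counts', 'counts.csv', '--map', 'map.csv',
     '--params', 'params.csv', '--output-dir', '.'])
```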