author     Sadaf Ebrahimi <sadafebrahimi@google.com>  2022-09-19 21:26:21 +0000
committer  Sadaf Ebrahimi <sadafebrahimi@google.com>  2022-10-14 17:51:46 +0000
commit     9d7cfa8217b3d5a501b3cf6b35511cc1f5a36948
tree       5d2b98a63d4f942bd3babd266cb81763bb59e1dc /bin
parent     912f8cc3fe4b53ebe8931f8f593e18a4f07c96b6
Writing METADATA file for rappor
Also adding missing files that are upstream
Test: Treehugger
Change-Id: Ia4275b3a5d4d84196f6efc948bc57caf50151b60
Diffstat (limited to 'bin')

 -rw-r--r--  bin/README.md                |  51
 -rw-r--r--  bin/decode-assoc             |  19
 -rw-r--r--  bin/decode-dist              |  19
 -rwxr-xr-x  bin/decode_assoc.R           | 429
 -rwxr-xr-x  bin/decode_dist.R            | 144
 -rw-r--r--  bin/hash-candidates          |   7
 -rwxr-xr-x  bin/hash_candidates.py       |  64
 -rwxr-xr-x  bin/hash_candidates_test.py  |  59
 -rw-r--r--  bin/sum-bits                 |   7
 -rwxr-xr-x  bin/sum_bits.py              |  86
 -rwxr-xr-x  bin/sum_bits_test.py         |  70
 -rwxr-xr-x  bin/test.sh                  | 261

12 files changed, 1216 insertions(+), 0 deletions(-)
diff --git a/bin/README.md b/bin/README.md
new file mode 100644
index 0000000..f4262bf
--- /dev/null
+++ b/bin/README.md
@@ -0,0 +1,51 @@
+Command Line Tools
+==================
+
+This directory contains command line tools for RAPPOR analysis.
+
+Analysis Tools
+--------------
+
+### decode-dist
+
+Decode a distribution -- requires a "counts" file (summed bits from reports),
+a map file, and a params file. See `test.sh decode-dist` in this dir for an
+example.
+
+### decode-assoc
+
+Decode a joint distribution between 2 variables ("association analysis"). See
+`test.sh decode-assoc-R` or `test.sh decode-assoc-cpp` in this dir for an
+example.
+
+Currently it only supports associating strings with booleans.
+
+### Setup
+
+Both of these tools are written in R and require several R libraries to be
+installed (see `../setup.sh r-packages`).
+
+`decode-assoc` also shells out to a native binary written in C++ if
+`--em-executable` is passed. This requires a C++ compiler (see
+`analysis/cpp/run.sh`). You can run `test.sh decode-assoc-cpp` to test it.
+
+
+Helper Tools
+------------
+
+These are simple Python implementations of tools needed for analysis. At
+Google, Chrome uses alternative C++/Go implementations of these tools.
+
+### sum-bits
+
+Given a CSV file with RAPPOR reports (IRRs), produce a "counts" CSV file on
+stdout. This is the `m x (k+1)` matrix that is used in the R analysis (where
+m = #cohorts and k = report width in bits).
+
+### hash-candidates
+
+Given a list of candidates on stdin, produce a CSV file of hashes (the "map
+file"). Each row has `m x h` cells (where m = #cohorts and h = #hashes).
+
+See the `regtest.sh` script for examples of how these tools are invoked.
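To make the `m x (k+1)` counts layout described in the README concrete, here is a small illustrative sketch in Python (toy values, not part of this change):

```python
# Toy illustration of the "counts" matrix: m = 2 cohorts, k = 4 bloom bits,
# so the file is an m x (k+1) matrix -- each row has 1 total plus k bit-sums.
counts = [
    # total  per-bit sums (reports in the cohort with that bit set)
    [100,    40, 12, 55, 31],  # cohort 0
    [90,     35, 20, 41, 28],  # cohort 1
]
assert all(len(row) == 4 + 1 for row in counts)
```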
diff --git a/bin/decode-assoc b/bin/decode-assoc
new file mode 100644
index 0000000..aaa2050
--- /dev/null
+++ b/bin/decode-assoc
@@ -0,0 +1,19 @@
+#!/bin/bash
+#
+# Decode multidimensional reports.
+#
+# This is a tiny shell wrapper around R.
+
+readonly THIS_DIR=$(dirname "$0")
+
+# NOTE: A trailing / is *required* on RAPPOR_REPO, because we use string
+# concatenation to form the absolute path. (file.path() in R doesn't do what
+# we want.)
+
+readonly RAPPOR_REPO=$THIS_DIR/../
+
+# RAPPOR_REPO is used by source() statements to find .R files.
+export RAPPOR_REPO
+
+# Make sure to reuse the same process so it can be killed easily.
+exec "$THIS_DIR/decode_assoc.R" "$@"
diff --git a/bin/decode-dist b/bin/decode-dist
new file mode 100644
index 0000000..147e41c
--- /dev/null
+++ b/bin/decode-dist
@@ -0,0 +1,19 @@
+#!/bin/bash
+#
+# Decode a distribution from summed RAPPOR reports.
+#
+# This is a tiny shell wrapper around R.
+
+readonly THIS_DIR=$(dirname "$0")
+
+# NOTE: A trailing / is *required* on RAPPOR_REPO, because we use string
+# concatenation to form the absolute path. (file.path() in R doesn't do what
+# we want.)
+
+readonly RAPPOR_REPO=$THIS_DIR/../
+
+# RAPPOR_REPO is used by source() statements to find .R files.
+export RAPPOR_REPO
+
+# Make sure to reuse the same process so it can be killed easily.
+exec "$THIS_DIR/decode_dist.R" "$@"
diff --git a/bin/decode_assoc.R b/bin/decode_assoc.R
new file mode 100755
index 0000000..58e35f2
--- /dev/null
+++ b/bin/decode_assoc.R
@@ -0,0 +1,429 @@
+#!/usr/bin/env Rscript
+#
+# Command line tool to decode multidimensional reports. It's a simple wrapper
+# around functions in association.R.
+
+library(optparse)
+
+#
+# Command line parsing. Do this first before loading libraries to catch errors
+# quickly. Loading libraries in R is slow.
+#
+
+# Display an error string and quit.
+UsageError <- function(...) {
+  cat(sprintf(...))
+  cat('\n')
+  quit(status = 1)
+}
+
+option_list <- list(
+  make_option(
+    "--metric-name", dest="metric_name", default="",
+    help="Name of the metric; metrics contain variables (required)"),
+  make_option(
+    "--reports", default="",
+    help="CSV file with reports; each variable is a column (required)"),
+  make_option(
+    "--schema", default="",
+    help="CSV file with variable types and metadata (required)"),
+  make_option(
+    "--params-dir", dest="params_dir", default="",
+    help="Directory where parameter CSV files are stored (required)"),
+
+  make_option(
+    "--var1", default="",
+    help="Name of first variable (required)"),
+  make_option(
+    "--var2", default="",
+    help="Name of second variable (required)"),
+
+  make_option(
+    "--map1", default="",
+    help="Path to map file, if var1 is a string"),
+  make_option(
+    "--map2", default="",
+    help="Path to map file, if var2 is a string"),
+
+  make_option(
+    "--output-dir", dest="output_dir", default=".",
+    help="Output directory (default .)"),
+
+  make_option(
+    "--create-bool-map", dest="create_bool_map", default=FALSE,
+    action="store_true",
+    help="Hack to use string RAPPOR to analyze boolean variables."),
+  make_option(
+    "--remove-bad-rows", dest="remove_bad_rows", default=FALSE,
+    action="store_true",
+    help="Whether we should remove rows where any value is missing (by
+          default, the program aborts with an error)"),
+
+  # Options that speed it up
+  make_option(
+    "--reports-sample-size", dest="reports_sample_size", default=-1,
+    help="Only analyze a random sample of this size. This is for
+          limiting the execution time at the expense of accuracy."),
+  make_option(
+    "--num-cores", dest="num_cores", default=1,
+    help="Number of cores for mclapply to use. Speeds up the parts
+          of the computation proportional to the number of reports,
+          EXCEPT the EM step, which can be sped up by native code."),
+  make_option(
+    "--max-em-iters", dest="max_em_iters", default=1000,
+    help="Maximum number of EM iterations"),
+  make_option(
+    "--em-executable", dest="em_executable", default="",
+    help="Shell out to this executable for an accelerated implementation
+          of EM."),
+  make_option(
+    "--tmp-dir", dest="tmp_dir", default="/tmp",
+    help="Use this tmp dir to communicate with the EM executable")
+)
+
+ParseOptions <- function() {
+  # NOTE: This API is bad; if you add positional_arguments, the return value
+  # changes!
+  parser <- OptionParser(option_list = option_list)
+  opts <- parse_args(parser)
+
+  if (opts$metric_name == "") {
+    UsageError("--metric-name is required.")
+  }
+  if (opts$reports == "") {
+    UsageError("--reports is required.")
+  }
+  if (opts$schema == "") {
+    UsageError("--schema is required.")
+  }
+  if (opts$params_dir == "") {
+    UsageError("--params-dir is required.")
+  }
+  if (opts$var1 == "") {
+    UsageError("--var1 is required.")
+  }
+  if (opts$var2 == "") {
+    UsageError("--var2 is required.")
+  }
+
+  return(opts)
+}
+
+if (!interactive()) {
+  opts <- ParseOptions()
+}
+
+#
+# Load libraries and source our own code.
+#
+
+library(RJSONIO)  # toJSON()
+
+# So we don't have to change pwd
+source.rappor <- function(rel_path) {
+  abs_path <- paste0(Sys.getenv("RAPPOR_REPO", ""), rel_path)
+  source(abs_path)
+}
+
+source.rappor("analysis/R/association.R")
+source.rappor("analysis/R/fast_em.R")
+source.rappor("analysis/R/read_input.R")
+source.rappor("analysis/R/util.R")
+
+options(stringsAsFactors = FALSE)
+options(max.print = 100)  # So our structure() debug calls look better
+
+CreateAssocStringMap <- function(all_cohorts_map, params) {
+  # Processes the map loaded using ReadMapFile and turns it into something
+  # that association.R can use. Namely, we want a map per cohort.
+  #
+  # Arguments:
+  #   all_cohorts_map: map matrix, as for single variable analysis
+  #   params: encoding parameters
+
+  if (nrow(all_cohorts_map) != (params$m * params$k)) {
+    stop(sprintf(
+        "Map matrix has invalid dimensions: m * k = %d, nrow(map) = %d",
+        params$m * params$k, nrow(all_cohorts_map)))
+  }
+
+  k <- params$k
+  map_by_cohort <- lapply(0 : (params$m-1), function(cohort) {
+    begin <- cohort * k
+    end <- (cohort + 1) * k
+    all_cohorts_map[(begin+1) : end, ]
+  })
+
+  list(all_cohorts_map = all_cohorts_map, map_by_cohort = map_by_cohort)
+}
+
+# Hack to create a map for booleans. We should use closed-form formulas
+# instead.
+CreateAssocBoolMap <- function(params) {
+  names <- c("FALSE", "TRUE")
+
+  map_by_cohort <- lapply(1:params$m, function(unused_cohort) {
+    # The (1,1) cell is false and the (1,2) cell is true.
+    m <- sparseMatrix(c(1), c(2), dims = c(1, 2))
+    colnames(m) <- names
+    m
+  })
+
+  all_cohorts_map <- sparseMatrix(1:params$m, rep(2, params$m))
+  colnames(all_cohorts_map) <- names
+
+  list(map_by_cohort = map_by_cohort, all_cohorts_map = all_cohorts_map)
+}
+
+ResultMatrixToDataFrame <- function(m, string_var_name, bool_var_name) {
+  # Args:
+  #   m: A 2D matrix as output by ComputeDistributionEM, e.g.
+  #            bing.com yahoo.com google.com       Other
+  #     TRUE  0.2718526 0.1873424 0.19637704 0.003208933
+  #     Other 0.1404581 0.1091826 0.08958427 0.001994163
+  # Returns:
+  #   A flattened data frame, e.g.
+
+  # Name the dimensions of the matrix.
+  dim_names <- list()
+  # TODO: generalize this. Right now we're assuming the first dimension is
+  # boolean.
+  dim_names[[bool_var_name]] <- c('TRUE', 'FALSE')
+  dim_names[[string_var_name]] <- dimnames(m)[[2]]
+
+  dimnames(m) <- dim_names
+
+  # http://stackoverflow.com/questions/15885111/create-data-frame-from-a-matrix-in-r
+  fit_df <- as.data.frame(as.table(m))
+
+  # The as.table conversion gives you a Freq column. Call it "proportion" to
+  # be consistent with single variable analysis.
+  colnames(fit_df)[colnames(fit_df) == "Freq"] <- "proportion"
+
+  fit_df
+}
+
+main <- function(opts) {
+  Log("decode-assoc")
+  Log("argv:")
+  print(commandArgs(TRUE))
+
+  schema <- read.csv(opts$schema)
+  Log("Read %d vars from schema", nrow(schema))
+
+  schema1 <- schema[schema$metric == opts$metric_name &
+                    schema$var == opts$var1, ]
+  if (nrow(schema1) == 0) {
+    UsageError("Couldn't find metric '%s', field '%s' in schema",
+               opts$metric_name, opts$var1)
+  }
+  schema2 <- schema[schema$metric == opts$metric_name &
+                    schema$var == opts$var2, ]
+  if (nrow(schema2) == 0) {
+    UsageError("Couldn't find metric '%s', field '%s' in schema",
+               opts$metric_name, opts$var2)
+  }
+
+  if (schema1$params != schema2$params) {
+    UsageError('var1 and var2 should have the same params (%s != %s)',
+               schema1$params, schema2$params)
+  }
+  params_name <- schema1$params
+  params_path <- file.path(opts$params_dir, paste0(params_name, '.csv'))
+  params <- ReadParameterFile(params_path)
+
+  var1_type <- schema1$var_type
+  var2_type <- schema2$var_type
+
+  # Right now we're assuming that --var1 is a string and --var2 is a boolean.
+  # TODO: Remove these limitations.
+  if (var1_type != "string") {
+    UsageError("Variable 1 should be a string (%s is of type %s)", opts$var1,
+               var1_type)
+  }
+  if (var2_type != "boolean") {
+    UsageError("Variable 2 should be a boolean (%s is of type %s)", opts$var2,
+               var2_type)
+  }
+
+  if (opts$map1 == "") {
+    UsageError("--map1 must be provided when --var1 is a string (var = %s)",
+               opts$var1)
+  }
+
+  # Example cache speedup for 100k map file: 31 seconds to load map and write
+  # cache; vs 2.2 seconds to read cache.
+  string_params <- params
+  map <- LoadMapFile(opts$map1, string_params)
+
+  # Important: first column is cohort (integer); the rest are variables, which
+  # are ASCII bit strings.
+  reports <- read.csv(opts$reports, colClasses=c("character"), as.is = TRUE)
+
+  Log("Read %d reports. Preview:", nrow(reports))
+  print(head(reports))
+  cat('\n')
+
+  # Filter bad reports first
+  is_empty1 <- reports[[opts$var1]] == ""
+  is_empty2 <- reports[[opts$var2]] == ""
+  Log('Found %d blank values in %s', sum(is_empty1), opts$var1)
+  Log('Found %d blank values in %s', sum(is_empty2), opts$var2)
+
+  is_empty <- is_empty1 | is_empty2  # boolean vectors
+  Log('%d bad rows', sum(is_empty))
+  if (sum(is_empty) > 0) {
+    if (opts$remove_bad_rows) {
+      reports <- reports[!is_empty, ]
+      Log('Removed %d rows, giving %d rows', sum(is_empty), nrow(reports))
+    } else {
+      stop("Found bad rows and --remove-bad-rows wasn't passed")
+    }
+  }
+
+  N <- nrow(reports)
+
+  if (N == 0) {
+    # Use an arbitrary error code when there is nothing to analyze, so we can
+    # distinguish this from more serious failures.
+    Log("No reports to analyze. Exiting with code 9.")
+    quit(status = 9)
+  }
+
+  # Sample reports if specified.
+  if (opts$reports_sample_size != -1) {
+    if (N > opts$reports_sample_size) {
+      indices <- sample(1:N, opts$reports_sample_size)
+      reports <- reports[indices, ]
+      Log("Created a sample of %d reports", nrow(reports))
+    } else {
+      Log("Got less than %d reports, not sampling", opts$reports_sample_size)
+    }
+  }
+
+  num_vars <- 2  # hard-coded for now, since there is --var1 and --var2.
+
+  # Convert strings to integers
+  cohorts <- as.integer(reports$cohort)
+
+  # Hack for Chrome: like AdjustCounts in decode_dist.R.
+  cohorts <- cohorts %% params$m
+
+  # Assume the input has 0-based cohorts, and change to 1-based cohorts.
+  cohorts <- cohorts + 1
+
+  # i.e. create a list of length 2, with identical cohorts.
+  # NOTE: Basic RAPPOR doesn't need cohorts.
+  cohorts_list <- rep(list(cohorts), num_vars)
+
+  # TODO: We should use the closed-form formulas rather than calling the
+  # solver, and not require this flag.
+  if (!opts$create_bool_map) {
+    stop("ERROR: pass --create-bool-map to analyze booleans.")
+  }
+
+  bool_params <- params
+  # HACK: Make this the boolean. The Decode() step uses k. (Note that R makes
+  # a copy here)
+  bool_params$k <- 1
+
+  params_list <- list(bool_params, string_params)
+
+  Log('CreateAssocStringMap')
+  string_map <- CreateAssocStringMap(map$map, params)
+
+  Log('CreateAssocBoolMap')
+  bool_map <- CreateAssocBoolMap(params)
+
+  map_list <- list(bool_map, string_map)
+
+  string_var <- reports[[opts$var1]]
+  bool_var <- reports[[opts$var2]]
+
+  Log('Preview of string var:')
+  print(head(table(string_var)))
+  cat('\n')
+
+  Log('Preview of bool var:')
+  print(head(table(bool_var)))
+  cat('\n')
+
+  # Split ASCII strings into arrays of numerics (as required by association.R)
+
+  Log('Splitting string reports (%d cores)', opts$num_cores)
+  string_reports <- mclapply(string_var, function(x) {
+    # function splits strings and converts them to numeric values
+    # rev needed for endianness
+    rev(as.integer(strsplit(x, split = "")[[1]]))
+  }, mc.cores = opts$num_cores)
+
+  Log('Splitting bool reports (%d cores)', opts$num_cores)
+  # Has to be a list of length-1 integer vectors
+  bool_reports <- mclapply(bool_var, function(x) {
+    as.integer(x)
+  }, mc.cores = opts$num_cores)
+
+  reports_list <- list(bool_reports, string_reports)
+
+  Log('Association for %d vars', length(reports_list))
+
+  if (opts$em_executable != "") {
+    Log('Will shell out to %s for native EM implementation',
+        opts$em_executable)
+    em_iter_func <- ConstructFastEM(opts$em_executable, opts$tmp_dir)
+  } else {
+    Log('Will use R implementation of EM (slow)')
+    em_iter_func <- EM
+  }
+
+  assoc_result <- ComputeDistributionEM(reports_list, cohorts_list, map_list,
+                                        ignore_other = FALSE,
+                                        params_list = params_list,
+                                        marginals = NULL,
+                                        estimate_var = FALSE,
+                                        num_cores = opts$num_cores,
+                                        em_iter_func = em_iter_func,
+                                        max_em_iters = opts$max_em_iters)
+
+  # This happens if the marginal can't be decoded.
+  if (is.null(assoc_result)) {
+    stop("ComputeDistributionEM failed.")
+  }
+
+  # NOTE: It would be nicer if reports_list, cohorts_list, etc. were indexed
+  # by names like 'domain' rather than numbers, and the resulting
+  # assoc_result$fit matrix had corresponding named dimensions. Instead we
+  # call ResultMatrixToDataFrame to do this.
+
+  fit <- assoc_result$fit
+  fit_df <- ResultMatrixToDataFrame(fit, opts$var1, opts$var2)
+
+  Log("Association results:")
+  print(fit_df)
+  cat('\n')
+
+  results_csv_path <- file.path(opts$output_dir, 'assoc-results.csv')
+  write.csv(fit_df, file = results_csv_path, row.names = FALSE)
+  Log("Wrote %s", results_csv_path)
+
+  # Measure elapsed time as close to the end as possible
+  total_elapsed_time <- proc.time()[['elapsed']]
+
+  metrics <- list(num_reports = N,
+                  reports_sample_size = opts$reports_sample_size,
+                  # fit is a matrix
+                  estimate_dimensions = dim(fit),
+                  # should sum to near 1.0
+                  sum_estimates = sum(fit),
+                  total_elapsed_time = total_elapsed_time,
+                  em_elapsed_time = assoc_result$em_elapsed_time,
+                  num_em_iters = assoc_result$num_em_iters)
+
+  metrics_json_path <- file.path(opts$output_dir, 'assoc-metrics.json')
+  writeLines(toJSON(metrics), con = metrics_json_path)
+  Log("Wrote %s", metrics_json_path)
+
+  Log('DONE decode-assoc')
+}
+
+if (!interactive()) {
+  main(opts)
+}
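A subtle step in decode_assoc.R above is turning each ASCII report into a numeric vector. This Python sketch mirrors the R expression `rev(as.integer(strsplit(x, split = "")[[1]]))` on an invented 4-bit report:

```python
# IRR strings are written high-order bit first, so the reversal puts the
# low-order bloom-filter bit at index 0 of the resulting vector.
def split_report(irr):
    return [int(c) for c in reversed(irr)]

assert split_report('0011') == [1, 1, 0, 0]  # invented 4-bit report
```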
diff --git a/bin/decode_dist.R b/bin/decode_dist.R
new file mode 100755
index 0000000..5c83f74
--- /dev/null
+++ b/bin/decode_dist.R
@@ -0,0 +1,144 @@
+#!/usr/bin/env Rscript
+#
+# Command line tool to decode a RAPPOR data set. It is a simple wrapper for
+# Decode() in decode.R.
+
+library(optparse)
+
+#
+# Command line parsing. Do this first before loading libraries to catch errors
+# quickly. Loading libraries in R is slow.
+#
+
+# For command line error checking.
+UsageError <- function(...) {
+  cat(sprintf(...))
+  cat('\n')
+  quit(status = 1)
+}
+
+option_list <- list(
+  # Inputs
+  make_option("--map", default="", help="Map file (required)"),
+  make_option("--counts", default="", help="Counts file (required)"),
+  make_option("--params", default="", help="Params file (required)"),
+  make_option("--output-dir", dest="output_dir", default=".",
+              help="Output directory (default .)"),
+
+  make_option("--correction", default="FDR", help="Correction method"),
+  make_option("--alpha", default=.05, help="Alpha level"),
+
+  make_option("--adjust-counts-hack", dest="adjust_counts_hack",
+              default=FALSE, action="store_true",
+              help="Allow the counts file to have more rows than cohorts.
+                    Most users should not use this.")
+)
+
+ParseOptions <- function() {
+  # NOTE: This API is bad; if you add positional_arguments, the return value
+  # changes!
+  parser <- OptionParser(option_list = option_list)
+  opts <- parse_args(parser)
+
+  if (opts$map == "") {
+    UsageError("--map is required.")
+  }
+  if (opts$counts == "") {
+    UsageError("--counts is required.")
+  }
+  if (opts$params == "") {
+    UsageError("--params is required.")
+  }
+  return(opts)
+}
+
+if (!interactive()) {
+  opts <- ParseOptions()
+}
+
+#
+# Load libraries and source our own code.
+#
+
+library(RJSONIO)
+
+# So we don't have to change pwd
+source.rappor <- function(rel_path) {
+  abs_path <- paste0(Sys.getenv("RAPPOR_REPO", ""), rel_path)
+  source(abs_path)
+}
+
+source.rappor("analysis/R/read_input.R")
+source.rappor("analysis/R/decode.R")
+source.rappor("analysis/R/util.R")
+
+source.rappor("analysis/R/alternative.R")
+
+options(stringsAsFactors = FALSE)
+
+
+main <- function(opts) {
+  Log("decode-dist")
+  Log("argv:")
+  print(commandArgs(TRUE))
+
+  Log("Loading inputs")
+
+  # Run a single model if all inputs are specified.
+  params <- ReadParameterFile(opts$params)
+  counts <- ReadCountsFile(opts$counts, params,
+                           adjust_counts = opts$adjust_counts_hack)
+  counts <- AdjustCounts(counts, params)
+
+
+  # The left-most column has totals.
+  num_reports <- sum(counts[, 1])
+
+  map <- LoadMapFile(opts$map, params)
+
+  Log("Decoding %d reports", num_reports)
+  res <- Decode(counts, map$map, params, correction = opts$correction,
+                alpha = opts$alpha)
+  Log("Done decoding")
+
+  if (nrow(res$fit) == 0) {
+    Log("FATAL: Analysis returned no strings.")
+    quit(status = 1)
+  }
+
+  # Write analysis results as CSV.
+  results_csv_path <- file.path(opts$output_dir, 'results.csv')
+  write.csv(res$fit, file = results_csv_path, row.names = FALSE)
+
+  # Write residual histogram as a png.
+  results_png_path <- file.path(opts$output_dir, 'residual.png')
+  png(results_png_path)
+  breaks <- pretty(res$residual, n = 200)
+  histogram <- hist(res$residual, breaks, plot = FALSE)
+  # Convert the histogram to frequencies.
+  histogram$counts <- histogram$counts / sum(histogram$counts)
+  plot(histogram, main = "Histogram of the residual",
+       xlab = sprintf("Residual (observed - explained, %d x %d values)",
+                      params$m, params$k))
+  dev.off()
+
+  res$metrics$total_elapsed_time <- proc.time()[['elapsed']]
+
+  # Write summary as JSON (scalar values).
+  metrics_json_path <- file.path(opts$output_dir, 'metrics.json')
+  m <- toJSON(res$metrics)
+  writeLines(m, con = metrics_json_path)
+  Log("Wrote %s, %s, and %s", results_csv_path, results_png_path,
+      metrics_json_path)
+
+  # TODO:
+  # - These are in a 2-column 'parameters' and 'values' format. Should these
+  #   just be a plain list?
+  # - Should any of these privacy params be in metrics.json?
+
+  Log("Privacy summary:")
+  print(res$privacy)
+  cat("\n")
+
+  Log('DONE')
+}
+
+if (!interactive()) {
+  main(opts)
+}
diff --git a/bin/hash-candidates b/bin/hash-candidates
new file mode 100644
index 0000000..ed65fcb
--- /dev/null
+++ b/bin/hash-candidates
@@ -0,0 +1,7 @@
+#!/bin/bash
+#
+# Shell wrapper around hash_candidates.py.
+
+readonly THIS_DIR=$(dirname "$0")
+
+PYTHONPATH=$THIS_DIR/../client/python $THIS_DIR/hash_candidates.py "$@"
diff --git a/bin/hash_candidates.py b/bin/hash_candidates.py
new file mode 100755
index 0000000..e59295e
--- /dev/null
+++ b/bin/hash_candidates.py
@@ -0,0 +1,64 @@
+#!/usr/bin/python
+#
+# Copyright 2014 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Given a list of candidates on stdin, produce a file of hashes ("map file").
+"""
+
+import csv
+import sys
+
+import rappor
+
+
+def HashCandidates(params, stdin, stdout):
+  num_bloombits = params.num_bloombits
+  csv_out = csv.writer(stdout)
+
+  for line in stdin:
+    word = line.strip()
+    row = [word]
+    for cohort in xrange(params.num_cohorts):
+      bloom_bits = rappor.get_bloom_bits(word, cohort, params.num_hashes,
+                                         num_bloombits)
+      for bit_to_set in bloom_bits:
+        # bits are indexed from 1. Add a fixed offset for each cohort.
+        # NOTE: This detail could be omitted from the map file format, and
+        # done in R.
+        row.append(cohort * num_bloombits + (bit_to_set + 1))
+    csv_out.writerow(row)
+
+
+def main(argv):
+  try:
+    filename = argv[1]
+  except IndexError:
+    raise RuntimeError('Usage: hash_candidates.py <params file>')
+  with open(filename) as f:
+    try:
+      params = rappor.Params.from_csv(f)
+    except rappor.Error as e:
+      raise RuntimeError(e)
+
+  HashCandidates(params, sys.stdin, sys.stdout)
+
+
+if __name__ == '__main__':
+  try:
+    main(sys.argv)
+  except RuntimeError, e:
+    print >>sys.stderr, e.args[0]
+    sys.exit(1)
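To spell out the indexing in `HashCandidates` above: with k bloom bits, 0-based bit b of cohort c is stored as the 1-based global index `c*k + b + 1`. A sketch with hypothetical hash outputs standing in for the real ones from `rappor.get_bloom_bits`:

```python
# Mirrors the row construction in hash_candidates.py, with hypothetical
# precomputed bloom bits in place of rappor.get_bloom_bits().
k = 16                    # num_bloombits
bloom_bits = {0: [4, 0],  # cohort -> 0-based bits (invented values)
              1: [9, 9]}

row = ['apple']
for cohort in sorted(bloom_bits):
    for bit in bloom_bits[cohort]:
        row.append(cohort * k + bit + 1)  # 1-based, cohort-offset index

assert row == ['apple', 5, 1, 26, 26]  # cf. hash_candidates_test.py below
```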
diff --git a/bin/hash_candidates_test.py b/bin/hash_candidates_test.py
new file mode 100755
index 0000000..2d0c4f1
--- /dev/null
+++ b/bin/hash_candidates_test.py
@@ -0,0 +1,59 @@
+#!/usr/bin/python -S
+#
+# Copyright 2014 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+hash_candidates_test.py: Tests for hash_candidates.py
+"""
+
+import cStringIO
+import unittest
+
+import rappor
+import hash_candidates  # module under test
+
+
+STDIN = """\
+apple
+banana
+carrot
+"""
+
+EXPECTED_CSV_OUT = """\
+apple,5,1,26,26,38,34,63,62\r
+banana,12,14,28,24,37,34,62,49\r
+carrot,4,12,25,21,48,38,61,54\r
+"""
+
+
+class HashCandidatesTest(unittest.TestCase):
+
+  def setUp(self):
+    self.params = rappor.Params()
+    self.params.num_bloombits = 16
+    self.params.num_cohorts = 4
+    self.params.num_hashes = 2
+
+  def testHash(self):
+    stdin = cStringIO.StringIO(STDIN)
+    stdout = cStringIO.StringIO()
+
+    hash_candidates.HashCandidates(self.params, stdin, stdout)
+
+    self.assertMultiLineEqual(EXPECTED_CSV_OUT, stdout.getvalue())
+
+
+if __name__ == '__main__':
+  unittest.main()
diff --git a/bin/sum-bits b/bin/sum-bits
new file mode 100644
index 0000000..bfa9b44
--- /dev/null
+++ b/bin/sum-bits
@@ -0,0 +1,7 @@
+#!/bin/bash
+#
+# Shell wrapper around sum_bits.py.
+
+readonly THIS_DIR=$(dirname "$0")
+
+PYTHONPATH=$THIS_DIR/../client/python $THIS_DIR/sum_bits.py "$@"
diff --git a/bin/sum_bits.py b/bin/sum_bits.py
new file mode 100755
index 0000000..f211656
--- /dev/null
+++ b/bin/sum_bits.py
@@ -0,0 +1,86 @@
+#!/usr/bin/python
+#
+# Copyright 2014 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Read the RAPPOR'd values on stdin, and sum the bits to produce a Counting
+Bloom filter by cohort. This can then be analyzed by R.
+"""
+
+import csv
+import sys
+
+import rappor
+
+
+def SumBits(params, stdin, stdout):
+  csv_in = csv.reader(stdin)
+  csv_out = csv.writer(stdout)
+
+  num_cohorts = params.num_cohorts
+  num_bloombits = params.num_bloombits
+
+  sums = [[0] * num_bloombits for _ in xrange(num_cohorts)]
+  num_reports = [0] * num_cohorts
+
+  for i, row in enumerate(csv_in):
+    try:
+      (user_id, cohort, unused_bloom, unused_prr, irr) = row
+    except ValueError:
+      raise RuntimeError('Error parsing row %r' % row)
+
+    if i == 0:
+      continue  # skip header
+
+    cohort = int(cohort)
+    num_reports[cohort] += 1
+
+    if not len(irr) == params.num_bloombits:
+      raise RuntimeError(
+          'Expected %d bits, got %r' % (params.num_bloombits, len(irr)))
+    for i, c in enumerate(irr):
+      bit_num = num_bloombits - i - 1  # e.g. char 0 = bit 15, char 15 = bit 0
+      if c == '1':
+        sums[cohort][bit_num] += 1
+      else:
+        if c != '0':
+          raise RuntimeError('Invalid IRR -- digits should be 0 or 1')
+
+  for cohort in xrange(num_cohorts):
+    # First column is the total number of reports in the cohort.
+    row = [num_reports[cohort]] + sums[cohort]
+    csv_out.writerow(row)
+
+
+def main(argv):
+  try:
+    filename = argv[1]
+  except IndexError:
+    raise RuntimeError('Usage: sum_bits.py <params file>')
+  with open(filename) as f:
+    try:
+      params = rappor.Params.from_csv(f)
+    except rappor.Error as e:
+      raise RuntimeError(e)
+
+  SumBits(params, sys.stdin, sys.stdout)
+
+
+if __name__ == '__main__':
+  try:
+    main(sys.argv)
+  except RuntimeError, e:
+    print >>sys.stderr, e.args[0]
+    sys.exit(1)
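The bit-numbering detail in `SumBits` above deserves a worked example; this sketch (invented report) shows how character positions map to bit numbers:

```python
# The IRR string is written high-order bit first: for k = 16, char 0 is
# bit 15 and char 15 is bit 0, hence bit_num = num_bloombits - i - 1.
irr = '0000000000000001'  # invented report with only bit 0 set
k = len(irr)
set_bits = [k - i - 1 for i, c in enumerate(irr) if c == '1']
assert set_bits == [0]
```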
+""" + +import csv +import sys + +import rappor + + +def SumBits(params, stdin, stdout): + csv_in = csv.reader(stdin) + csv_out = csv.writer(stdout) + + num_cohorts = params.num_cohorts + num_bloombits = params.num_bloombits + + sums = [[0] * num_bloombits for _ in xrange(num_cohorts)] + num_reports = [0] * num_cohorts + + for i, row in enumerate(csv_in): + try: + (user_id, cohort, unused_bloom, unused_prr, irr) = row + except ValueError: + raise RuntimeError('Error parsing row %r' % row) + + if i == 0: + continue # skip header + + cohort = int(cohort) + num_reports[cohort] += 1 + + if not len(irr) == params.num_bloombits: + raise RuntimeError( + "Expected %d bits, got %r" % (params.num_bloombits, len(irr))) + for i, c in enumerate(irr): + bit_num = num_bloombits - i - 1 # e.g. char 0 = bit 15, char 15 = bit 0 + if c == '1': + sums[cohort][bit_num] += 1 + else: + if c != '0': + raise RuntimeError('Invalid IRR -- digits should be 0 or 1') + + for cohort in xrange(num_cohorts): + # First column is the total number of reports in the cohort. + row = [num_reports[cohort]] + sums[cohort] + csv_out.writerow(row) + + +def main(argv): + try: + filename = argv[1] + except IndexError: + raise RuntimeError('Usage: sum_bits.py <params file>') + with open(filename) as f: + try: + params = rappor.Params.from_csv(f) + except rappor.Error as e: + raise RuntimeError(e) + + SumBits(params, sys.stdin, sys.stdout) + + +if __name__ == '__main__': + try: + main(sys.argv) + except RuntimeError, e: + print >>sys.stderr, e.args[0] + sys.exit(1) diff --git a/bin/sum_bits_test.py b/bin/sum_bits_test.py new file mode 100755 index 0000000..91c109f --- /dev/null +++ b/bin/sum_bits_test.py @@ -0,0 +1,70 @@ +#!/usr/bin/python -S +# +# Copyright 2014 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +sum_bits_test.py: Tests for sum_bits.py +""" + +import cStringIO +import unittest + +import rappor +import sum_bits # module under test + + +CSV_IN = """\ +user_id,cohort,bloom,prr,rappor +5,1,dummy,dummy,0000111100001111 +5,1,dummy,dummy,0000000000111100 +""" + +# NOTE: bit order is reversed. 
diff --git a/bin/test.sh b/bin/test.sh
new file mode 100755
index 0000000..6b0381e
--- /dev/null
+++ b/bin/test.sh
@@ -0,0 +1,261 @@
+#!/bin/bash
+usage() {
+echo "
+
+  Simple smoke tests for the decode-dist and decode-assoc tools. These will
+  fail if your machine doesn't have the right R libraries.
+
+  Usage:
+    ./test.sh <function name>
+
+  Example:
+    ./test.sh decode-assoc-R-smoke       # test pure R implementation
+    ./test.sh decode-assoc-cpp-smoke     # test with analysis/cpp/fast_em.cc
+    ./test.sh decode-assoc-cpp-converge  # run for longer with C++
+    ./test.sh decode-assoc-tensorflow
+"
+}
+
+set -o nounset
+set -o pipefail
+set -o errexit
+
+readonly THIS_DIR=$(dirname "$0")
+readonly RAPPOR_SRC=$(cd $THIS_DIR/.. && pwd)
+readonly EM_CPP_EXECUTABLE=$RAPPOR_SRC/analysis/cpp/_tmp/fast_em
+
+source $RAPPOR_SRC/util.sh
+
+readonly ASSOC_TESTDATA_DIR=_tmp/decode-assoc-test
+readonly DIST_TESTDATA_DIR=_tmp/decode-dist-test
+
+# Clear the R cache for the map files.
+clear-cached-files() {
+  local dir=$1
+  find $dir -name '*.rda' | xargs --no-run-if-empty -- rm --verbose
+}
+
+write-dist-testdata() {
+  local input_dir=$DIST_TESTDATA_DIR/input
+
+  mkdir -p $input_dir
+
+  clear-cached-files $DIST_TESTDATA_DIR
+
+  # Right now, we copy a case from regtest.sh. (./demo.sh quick-python
+  # creates just this case.)
+  local case_dir=$RAPPOR_SRC/_tmp/python/demo3
+
+  cp --verbose $case_dir/1/case_counts.csv $input_dir/counts.csv
+  cp --verbose $case_dir/case_map.csv $input_dir/map.csv
+  cp --verbose $case_dir/case_params.csv $input_dir/params.csv
+}
+
+decode-dist() {
+  write-dist-testdata
+
+  local output_dir=$DIST_TESTDATA_DIR
+
+  local input_dir=$DIST_TESTDATA_DIR/input
+
+  # Uses the ./demo.sh regtest files
+  time $RAPPOR_SRC/bin/decode-dist \
+    --counts $input_dir/counts.csv \
+    --map $input_dir/map.csv \
+    --params $input_dir/params.csv \
+    --output-dir $output_dir
+
+  echo
+  head $output_dir/results.csv
+  echo
+  cat $output_dir/metrics.json
+}
+
+write-assoc-testdata() {
+  # 'build' has intermediate build files; 'input' is the final input to the
+  # decode-assoc tool.
+  local build_dir=$ASSOC_TESTDATA_DIR/build
+  local input_dir=$ASSOC_TESTDATA_DIR/input
+
+  mkdir -p $build_dir $input_dir
+
+  clear-cached-files $ASSOC_TESTDATA_DIR
+
+  cat >$build_dir/true_values.csv <<EOF
+domain,flag..HTTPS
+google.com,1
+google.com,1
+google.com,1
+google.com,1
+google.com,0
+yahoo.com,1
+yahoo.com,0
+bing.com,1
+bing.com,1
+bing.com,0
+EOF
+
+  local num_bits=8
+  local num_hashes=1
+  local num_cohorts=128
+
+  local prob_p=0.25
+  local prob_q=0.75
+  local prob_f=0.5
+
+  # There are 10 rows of true values above; each one is repeated this many
+  # times, and 10 * 5000 = 50,000 reports is enough to eyeball the accuracy
+  # of the results.
+  local assoc_testdata_count=5000
+
+  PYTHONPATH=$RAPPOR_SRC/client/python \
+    $RAPPOR_SRC/tests/rappor_sim.py \
+    --assoc-testdata $assoc_testdata_count \
+    --num-bits $num_bits \
+    --num-hashes $num_hashes \
+    --num-cohorts $num_cohorts \
+    -p $prob_p \
+    -q $prob_q \
+    -f $prob_f \
+    < $build_dir/true_values.csv \
+    > $input_dir/reports.csv
+
+  # Output two bad rows: each row is missing one of the columns.
+  cat >$build_dir/bad_rows.txt <<EOF
+c0,0,10101010,
+c0,0,,0
+EOF
+
+  # Make CSV file with the header
+  cat - $build_dir/bad_rows.txt > $input_dir/bad_rows.csv <<EOF
+client,cohort,domain,flag..HTTPS
+EOF
+
+  # Make reports file with bad rows
+  cat $input_dir/reports.csv $build_dir/bad_rows.txt \
+    > $input_dir/reports_bad_rows.csv
+
+  # Define a string variable and a boolean variable.
+  cat >$input_dir/rappor-vars.csv <<EOF
+metric, var, var_type, params
+m,domain,string,m_params
+m,flag..HTTPS,boolean,m_params
+EOF
+
+  cat >$input_dir/m_params.csv <<EOF
+k,h,m,p,q,f
+$num_bits,$num_hashes,$num_cohorts,$prob_p,$prob_q,$prob_f
+EOF
+
+  # Add a string with a double quote to test quoting behavior
+  cat >$build_dir/domain_candidates.csv <<EOF
+google.com
+yahoo.com
+bing.com
+q"q
+EOF
+
+  # Hash candidates to create map.
+  $RAPPOR_SRC/bin/hash-candidates $input_dir/m_params.csv \
+    < $build_dir/domain_candidates.csv \
+    > $input_dir/domain_map.csv
+
+  banner "Wrote testdata in $input_dir (intermediate files in $build_dir)"
+}
+
+# Helper function to run decode-assoc with testdata.
+decode-assoc-helper() {
+  write-assoc-testdata
+
+  local output_dir=$1
+  shift
+
+  local build_dir=$ASSOC_TESTDATA_DIR/build
+  local input_dir=$ASSOC_TESTDATA_DIR/input
+
+  time $RAPPOR_SRC/bin/decode-assoc \
+    --metric-name m \
+    --schema $input_dir/rappor-vars.csv \
+    --reports $input_dir/reports.csv \
+    --params-dir $input_dir \
+    --var1 domain \
+    --var2 flag..HTTPS \
+    --map1 $input_dir/domain_map.csv \
+    --create-bool-map \
+    --max-em-iters 10 \
+    --num-cores 2 \
+    --output-dir $output_dir \
+    --tmp-dir $output_dir \
+    "$@"
+
+  head $output_dir/assoc-*
+
+  # Print true values for comparison
+  echo
+  echo "$build_dir/true_values.csv:"
+  cat "$build_dir/true_values.csv"
+}
+
+# Quick smoke test for R version.
+decode-assoc-R-smoke() {
+  local output_dir=_tmp/R
+  mkdir -p $output_dir
+  decode-assoc-helper $output_dir
+}
+
+# Test what happens when there are bad rows.
+decode-assoc-bad-rows() {
+  local output_dir=_tmp/bad
+  mkdir -p $output_dir
+
+  # Later flags override earlier ones
+
+  # Reports + bad rows
+  decode-assoc-helper $output_dir \
+    --reports $ASSOC_TESTDATA_DIR/input/reports_bad_rows.csv \
+    --remove-bad-rows \
+    "$@"
+
+  # ONLY bad rows
+  decode-assoc-helper $output_dir \
+    --reports $ASSOC_TESTDATA_DIR/input/bad_rows.csv \
+    --remove-bad-rows \
+    "$@"
+}
+
+build-em-executable() {
+  pushd $RAPPOR_SRC/analysis/cpp >/dev/null
+  ./run.sh build-fast-em
+  popd >/dev/null
+}
+
+decode-assoc-cpp-smoke() {
+  local output_dir=_tmp/cpp
+  mkdir -p $output_dir
+
+  build-em-executable
+
+  decode-assoc-helper $output_dir \
+    --em-executable "$EM_CPP_EXECUTABLE" "$@"
+}
+
+decode-assoc-cpp-converge() {
+  # With the data we have, this converges and exits before 1000 iterations.
+  decode-assoc-cpp-smoke --max-em-iters 1000
+}
+
+decode-assoc-tensorflow() {
+  local output_dir=_tmp/tensorflow
+  mkdir -p $output_dir
+
+  decode-assoc-helper $output_dir \
+    --em-executable $RAPPOR_SRC/analysis/tensorflow/fast_em.sh "$@"
+}
+
+decode-assoc-tensorflow-converge() {
+  decode-assoc-tensorflow --max-em-iters 1000
+}
+
+if test $# -eq 0; then
+  usage
+else
+  "$@"
+fi
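For orientation, the smoke tests above chain the tools in a fixed order: hash the candidates into a map, sum the report bits into counts, then decode. A hedged Python sketch of that pipeline (file names are made up; the flags are the ones the scripts accept):

```python
import subprocess

def run(cmd, stdin=None, stdout=None):
    subprocess.check_call(cmd, stdin=stdin, stdout=stdout)

# 1. Hash candidate strings into a map file (reads candidates on stdin).
with open('candidates.txt') as fin, open('map.csv', 'w') as fout:
    run(['bin/hash-candidates', 'params.csv'], stdin=fin, stdout=fout)

# 2. Sum report bits into a counts file (reads a reports CSV on stdin).
with open('reports.csv') as fin, open('counts.csv', 'w') as fout:
    run(['bin/sum-bits', 'params.csv'], stdin=fin, stdout=fout)

# 3. Decode the distribution from the summed counts.
run(['bin/decode-dist', '--counts', 'counts.csv', '--map', 'map.csv',
     '--params', 'params.csv', '--output-dir', '.'])
```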