1 files changed, 343 insertions, 0 deletions
diff --git a/pipeline/metric_status.R b/pipeline/metric_status.R
new file mode 100755
index 0000000..0774423
--- /dev/null
+++ b/pipeline/metric_status.R
@@ -0,0 +1,343 @@
+#!/usr/bin/Rscript
+#
+# Write an overview of task status, per-metric task status, task histograms.
+
+library(data.table)
+library(ggplot2)
+
+options(stringsAsFactors = FALSE)  # keep strings as character, not factor
+
+Log <- function(fmt, ...) {
+  # Print a sprintf()-formatted message on stdout, followed by a newline.
+  cat(paste(sprintf(fmt, ...), collapse = ' '), '\n', sep = '')
+}
+
+# Largest non-NA value, as a double; NA when no non-NA values remain.
+MaybeMax <- function(values) {
+  present <- values[!is.na(values)]
+
+  # With nothing left, report NA rather than calling max() on an empty vector.
+  largest <- if (length(present) > 0) max(present) else NA
+
+  # data.table requires a numeric here; otherwise we get type errors
+  as.numeric(largest)
+}
+
+# Average of the non-NA values, as a double; NA when no non-NA values remain.
+MaybeMean <- function(values) {
+  present <- values[!is.na(values)]
+
+  # With nothing left, report NA rather than the NaN that mean() would give.
+  average <- if (length(present) > 0) mean(present) else NA
+
+  # data.table requires a numeric here; otherwise we get type errors
+  as.numeric(average)
+}
+
+WriteDistOverview <- function(summary, output_dir) {
+  # Roll the per-task rows up by metric, save the result as overview.csv in
+  # output_dir, and return the aggregated table.
+  tasks <- data.table(summary)  # data.table syntax is easier here
+  per_metric <- tasks[, list(
+      params_file = unique(params_file),
+      map_file = unique(map_file),
+      days = length(date),
+      max_num_reports = MaybeMax(num_reports),
+
+      # number of tasks in each status
+      ok = sum(status == 'OK'),
+      fail = sum(status == 'FAIL'),
+      timeout = sum(status == 'TIMEOUT'),
+      skipped = sum(status == 'SKIPPED'),
+
+      # TODO: Need to document the meaning of these metrics.
+      # All could be NA
+      # KiB -> MB
+      #max_vm5_peak_mb = MaybeMax(vm5_peak_kib * 1024 / 1e6),
+      #mean_vm5_mean_mb = MaybeMean(vm5_mean_kib * 1024 / 1e6),
+
+      mean_secs = MaybeMean(seconds),
+      mean_allocated_mass = MaybeMean(allocated_mass)
+
+      # unique failure reasons
+      # This can be used when there are different call stacks.
+      #fail_reasons = length(unique(fail_reason[fail_reason != ""]))
+      ), by = metric]
+
+  # Case insensitive sort by metric name
+  per_metric <- per_metric[order(tolower(per_metric$metric)), ]
+
+  csv_path <- file.path(output_dir, 'overview.csv')
+  write.csv(per_metric, file = csv_path, row.names = FALSE)
+  Log("Wrote %s", csv_path)
+  per_metric
+}
+
+WriteDistMetricStatus <- function(summary, output_dir) {
+  # For every metric, write status.csv, num_reports.csv and mass.csv into the
+  # directory <output_dir>/<metric>.
+  s <- data.table(summary)
+  metric_names <- unique(s$metric)
+
+  # First pass: the full per-task status table of each metric.
+  for (m in metric_names) {
+    # Keep only the per-task columns (the unit conversions are commented out).
+    status_rows <- s[s$metric == m,
+                     list(job_id, date, status,
+                          #vm5_peak_mb = vm5_peak_kib * 1024 / 1e6,
+                          #vm5_mean_mb = vm5_mean_kib * 1024 / 1e6,
+                          num_reports,
+                          seconds,
+                          allocated_mass, num_rappor)]
+
+    # Newest first.  Alphabetical sort works fine for YYYY-MM-DD.
+    status_rows <- status_rows[order(status_rows$date, decreasing = TRUE), ]
+
+    status_path <- file.path(output_dir, m, 'status.csv')
+    write.csv(status_rows, file = status_path, row.names = FALSE)
+    Log("Wrote %s", status_path)
+  }
+
+  # Second pass: two-column files just for plotting with dygraphs.  TODO: can
+  # dygraphs do something smarter?  Maybe you need to select the column in
+  # JavaScript, and pass it an array, rather than CSV text.
+  for (m in metric_names) {
+    reports_path <- file.path(output_dir, m, 'num_reports.csv')
+    reports_by_date <- s[s$metric == m, list(date, num_reports)]
+    # NOTE: dygraphs (only in Firefox?) doesn't like the quotes around
+    # "2015-04-03".  In general, we can't turn off quotes, because strings with
+    # double quotes will be invalid CSV files.  But in this case, we only have
+    # date and number columns, so we can.  dygraphs is mistaken here.
+    write.csv(reports_by_date, file = reports_path, row.names = FALSE,
+              quote = FALSE)
+    Log("Wrote %s", reports_path)
+
+    # Write unallocated mass.  TODO: Write the other 2 vars too?
+    mass_path <- file.path(output_dir, m, 'mass.csv')
+    mass_by_date <- s[s$metric == m,
+                      list(date, unallocated_mass = 1.0 - allocated_mass)]
+    write.csv(mass_by_date, file = mass_path, row.names = FALSE, quote = FALSE)
+    Log("Wrote %s", mass_path)
+  }
+}
+
+WritePlot <- function(p, outdir, filename, width = 800, height = 600) {
+  # Render plot p as a PNG at outdir/filename (size in pixels) and log the path.
+  filename <- file.path(outdir, filename)
+  png(filename, width = width, height = height)
+  # Close the device even when plotting fails, so it is not left open.
+  tryCatch(plot(p), finally = dev.off())
+  Log('Wrote %s', filename)
+}
+
+# Abort the script when a histogram input has no usable values.  If we don't
+# do this, ggplot blows up with an unintuitive error message.
+CheckHistogramInput <- function(v) {
+  if (!any(!is.na(v))) {
+    label <- deparse(substitute(v))  # source text of the caller's argument
+    Log('FATAL: All values in %s are NA (no successful runs?)', label)
+    quit(status = 1)
+  }
+}
+
+WriteDistHistograms <- function(s, output_dir) {
+  # Write one histogram PNG per task-level quantity into output_dir.  Each
+  # required column is checked right before it is plotted, so a column that is
+  # entirely NA stops the script at that point.
+
+  # Shared recipe for every plot below: a histogram of `values`, titled
+  # `title`, with `x_label` on the x axis and the task count on the y axis,
+  # written to output_dir/filename via WritePlot.
+  SaveHistogram <- function(values, title, x_label, filename) {
+    histogram <- qplot(values, geom = "histogram") +
+      ggtitle(title) +
+      xlab(x_label) +
+      ylab("number of tasks")
+    WritePlot(histogram, output_dir, filename)
+  }
+
+  CheckHistogramInput(s$allocated_mass)
+  SaveHistogram(s$allocated_mass,
+                "Allocated Mass by Task", "allocated mass",
+                'allocated_mass.png')
+
+  CheckHistogramInput(s$num_rappor)
+  SaveHistogram(s$num_rappor,
+                "Detected Strings by Task", "detected strings",
+                'num_rappor.png')
+
+  CheckHistogramInput(s$num_reports)
+  SaveHistogram(s$num_reports / 1e6,
+                "Raw Reports by Task", "millions of reports",
+                'num_reports.png')
+
+  CheckHistogramInput(s$seconds)
+  SaveHistogram(s$seconds,
+                "Analysis Duration by Task", "seconds",
+                'seconds.png')
+
+  # NOTE: Skipping this for 'series' jobs.
+  if (sum(!is.na(s$vm5_peak_kib)) > 0) {
+    SaveHistogram(s$vm5_peak_kib * 1024 / 1e6, "Peak Memory Usage by Task",
+                  "Peak megabytes (1e6 bytes) of memory", 'memory.png')
+  }
+}
+# Run the three 'dist' report steps on the task summary s, in order.
+ProcessAllDist <- function(s, output_dir) {
+  Log('dist: Writing per-metric status.csv')
+  WriteDistMetricStatus(s, output_dir)
+  # NOTE: this step quit()s the script if a checked column is entirely NA.
+  Log('dist: Writing histograms')
+  WriteDistHistograms(s, output_dir)
+  # The table returned by this last step is also this function's result.
+  Log('dist: Writing aggregated overview.csv')
+  WriteDistOverview(s, output_dir)
+}
+
+# Write the single CSV file loaded by assoc-overview.html: one row of status
+# counts and mean timings per metric.  Returns the aggregated table.
+WriteAssocOverview <- function(summary, output_dir) {
+  tasks <- data.table(summary)  # data.table syntax is easier here
+  per_metric <- tasks[, list(
+      #params_file = unique(params_file),
+      #map_file = unique(map_file),
+
+      days = length(date),
+      max_num_reports = MaybeMax(num_reports),
+
+      # number of tasks in each status
+      ok = sum(status == 'OK'),
+      fail = sum(status == 'FAIL'),
+      timeout = sum(status == 'TIMEOUT'),
+      skipped = sum(status == 'SKIPPED'),
+
+      mean_total_secs = MaybeMean(total_elapsed_seconds),
+      mean_em_secs = MaybeMean(em_elapsed_seconds)
+
+      ), by = list(metric)]
+
+  # Case insensitive sort by metric name
+  per_metric <- per_metric[order(tolower(per_metric$metric)), ]
+
+  csv_path <- file.path(output_dir, 'assoc-overview.csv')
+  write.csv(per_metric, file = csv_path, row.names = FALSE)
+  Log("Wrote %s", csv_path)
+
+  per_metric
+}
+
+# Write the CSV files loaded by assoc-metric.html -- that is, one
+# metric-status.csv for each metric name, with one row per (var1, var2) pair.
+WriteAssocMetricStatus <- function(summary, output_dir) {
+  s <- data.table(summary)
+  csv_list <- unique(s[, list(metric)])
+  # seq_len() rather than 1:nrow(): an empty summary must write nothing.
+  for (i in seq_len(nrow(csv_list))) {
+    u <- csv_list[i, ]
+    by_pair <- s[s$metric == u$metric,
+                 list(days = length(date),
+                      max_num_reports = MaybeMax(num_reports),
+
+                      # summarize status
+                      ok = sum(status == 'OK'),
+                      fail = sum(status == 'FAIL'),
+                      timeout = sum(status == 'TIMEOUT'),
+                      skipped = sum(status == 'SKIPPED'),
+
+                      mean_total_secs = MaybeMean(total_elapsed_seconds),
+                      mean_em_secs = MaybeMean(em_elapsed_seconds)
+                      ),
+                 by=list(var1, var2)]
+
+    # Case insensitive sort by var1 name
+    by_pair <- by_pair[order(tolower(by_pair$var1)), ]
+
+    csv_path <- file.path(output_dir, u$metric, 'metric-status.csv')
+    write.csv(by_pair, file = csv_path, row.names = FALSE)
+    Log("Wrote %s", csv_path)
+  }
+}
+
+# Relative output directory "<metric>/<var1>_X_<var2>" for one variable pair,
+# with every '..' in var2 replaced by '_'.  See task_spec.py AssocTaskSpec.
+FormatAssocRelPath <- function(metric, var1, var2) {
+  pair_dir <- sprintf('%s_X_%s', var1, gsub('..', '_', var2, fixed = TRUE))
+  file.path(metric, pair_dir)
+}
+
+# Write the CSV files loaded by assoc-pair.html -- that is, one pair-status.csv
+# for each (metric, var1, var2) pair, plus a pair-metadata.txt next to it.
+WriteAssocPairStatus <- function(summary, output_dir) {
+
+  s <- data.table(summary)
+
+  csv_list <- unique(s[, list(metric, var1, var2)])
+  Log('CSV list:')
+  print(csv_list)
+
+  # One pass per unique pair; seq_len() (unlike 1:n) is empty for 0 rows.
+  for (i in seq_len(nrow(csv_list))) {
+    u <- csv_list[i, ]
+
+    # Select the per-task columns.  Don't need metric / var1 / var2.
+    subframe <- s[s$metric == u$metric & s$var1 == u$var1 & s$var2 == u$var2,
+                  list(job_id, date, status,
+                       num_reports, d1, d2,
+                       total_elapsed_seconds,
+                       em_elapsed_seconds)]
+
+    # Sort by descending date.  Alphabetical sort works fine for YYYY-MM-DD.
+    subframe <- subframe[order(subframe$date, decreasing = TRUE), ]
+
+    pair_rel_path <- FormatAssocRelPath(u$metric, u$var1, u$var2)
+
+    csv_path <- file.path(output_dir, pair_rel_path, 'pair-status.csv')
+    write.csv(subframe, file = csv_path, row.names = FALSE)
+    Log("Wrote %s", csv_path)
+
+    # Write a file with the raw variable names.  Parsed by ui.sh, to pass to
+    # csv_to_html.py.
+    meta_path <- file.path(output_dir, pair_rel_path, 'pair-metadata.txt')
+
+    # NOTE: The conversion from data.table to character vector requires
+    # stringsAsFactors to work correctly!
+    lines <- as.character(u)
+    writeLines(lines, con = meta_path)
+    Log("Wrote %s", meta_path)
+  }
+}
+# Run the three 'assoc' report steps on the task summary s, in order.
+ProcessAllAssoc <- function(s, output_dir) {
+  Log('assoc: Writing pair-status.csv for each variable pair in each metric')
+  WriteAssocPairStatus(s, output_dir)
+  # Next: one metric-status.csv per metric, grouped by (var1, var2).
+  Log('assoc: Writing metric-status.csv for each metric')
+  WriteAssocMetricStatus(s, output_dir)
+  # Writes assoc-overview.csv; its table is also this function's result.
+  Log('assoc: Writing aggregated overview.csv')
+  WriteAssocOverview(s, output_dir)
+}
+
+main <- function(argv) {
+  # Entry point.  argv: action ('dist' or 'assoc'), input CSV, output dir.
+  if (length(argv) < 3) {
+    stop('Usage: metric_status.R <dist|assoc> <input.csv> <output_dir>')
+  }
+  action <- argv[[1]]
+  input <- argv[[2]]
+  output_dir <- argv[[3]]
+  if (!(action %in% c('dist', 'assoc'))) {
+    stop(sprintf('Invalid action %s', action))
+  }
+  theme_set(theme_grey(base_size = 16))  # bigger ggplot fonts globally
+  summary <- read.csv(input)
+  if (action == 'dist') {
+    ProcessAllDist(summary, output_dir)
+  } else {
+    ProcessAllAssoc(summary, output_dir)
+  }
+  Log('Done')
+}
+# Only run main at top level, i.e. when no function frames are active.
+if (length(sys.frames()) == 0) {
+  main(commandArgs(TRUE))
+}