about | summary | refs | log | tree | commit | diff
path: root/pipeline/metric_status.R
diff options
context:
space:
mode:
Diffstat (limited to 'pipeline/metric_status.R')
-rwxr-xr-x pipeline/metric_status.R 343
1 files changed, 343 insertions, 0 deletions
diff --git a/pipeline/metric_status.R b/pipeline/metric_status.R
new file mode 100755
index 0000000..0774423
--- /dev/null
+++ b/pipeline/metric_status.R
@@ -0,0 +1,343 @@
+#!/usr/bin/Rscript
+#
+# Write an overview of task status, per-metric task status, task histograms.
+
+library(data.table)
+library(ggplot2)
+
+# Keep strings as character vectors instead of converting them to factors
+# (conversion was the default before R 4.0). WriteAssocPairStatus notes that
+# its as.character() conversion depends on this setting.
+options(stringsAsFactors = FALSE) # get rid of annoying behavior
+
+# Print a sprintf()-formatted message to stdout, terminated by a newline.
+#
+# Args:
+#   fmt: format string passed to sprintf().
+#   ...: values substituted into fmt.
+Log <- function(fmt, ...) {
+  # Multiple formatted strings are separated by a single space, which is what
+  # cat() does by default.
+  cat(paste(sprintf(fmt, ...), collapse = " "), "\n", sep = "")
+}
+
+# Largest non-NA element of `values`, always returned as a double.
+# The result is NA when `values` is empty or holds nothing but NA.
+MaybeMax <- function(values) {
+  present <- values[!is.na(values)]
+  if (length(present) > 0) {
+    # data.table requires a numeric result here; otherwise we get type errors
+    return(as.numeric(max(present)))
+  }
+  NA_real_
+}
+
+# Arithmetic mean of the non-NA elements of `values`, returned as a double.
+# The result is NA when `values` is empty or holds nothing but NA.
+MaybeMean <- function(values) {
+  # all() of an empty logical vector is TRUE, so empty input lands here too.
+  if (all(is.na(values))) {
+    return(NA_real_)
+  }
+  # data.table requires a numeric result here; otherwise we get type errors
+  as.numeric(mean(values, na.rm = TRUE))
+}
+
+# Aggregate the 'dist' task summary into one overview per metric and save it
+# as overview.csv.
+#
+# Args:
+#   summary: task summary data frame. Columns read here: metric, params_file,
+#     map_file, date, num_reports, status, seconds and allocated_mass.
+#   output_dir: directory in which overview.csv is written.
+#
+# Returns:
+#   The aggregated data.table, ordered case-insensitively by metric name.
+WriteDistOverview <- function(summary, output_dir) {
+  tasks <- data.table(summary)  # grouped aggregation is simpler in data.table
+
+  overview <- tasks[, list(
+    params_file = unique(params_file),
+    map_file = unique(map_file),
+    days = length(date),
+    max_num_reports = MaybeMax(num_reports),
+
+    # how many rows ended in each status
+    ok = sum(status == 'OK'),
+    fail = sum(status == 'FAIL'),
+    timeout = sum(status == 'TIMEOUT'),
+    skipped = sum(status == 'SKIPPED'),
+
+    # TODO: Need to document the meaning of these metrics.
+    # All could be NA
+    # KiB -> MB
+    #max_vm5_peak_mb = MaybeMax(vm5_peak_kib * 1024 / 1e6),
+    #mean_vm5_mean_mb = MaybeMean(vm5_mean_kib * 1024 / 1e6),
+
+    mean_secs = MaybeMean(seconds),
+    mean_allocated_mass = MaybeMean(allocated_mass)
+
+    # unique failure reasons
+    # This can be used when there are different call stacks.
+    #fail_reasons = length(unique(fail_reason[fail_reason != ""]))
+  ), by = metric]
+
+  # Order rows by metric name, ignoring case.
+  overview <- overview[order(tolower(overview$metric)), ]
+
+  csv_path <- file.path(output_dir, 'overview.csv')
+  write.csv(overview, file = csv_path, row.names = FALSE)
+  Log("Wrote %s", csv_path)
+
+  overview
+}
+
+# Write status.csv, num_reports.csv and mass.csv for every metric, each under
+# <output_dir>/<metric>/.
+#
+# Args:
+#   summary: task summary data frame. Columns read here: metric, job_id, date,
+#     status, num_reports, seconds, allocated_mass and num_rappor.
+#   output_dir: root directory that holds one subdirectory per metric.
+WriteDistMetricStatus <- function(summary, output_dir) {
+  tasks <- data.table(summary)
+  metric_names <- unique(tasks$metric)
+
+  # Pass 1: one status.csv per metric.
+  for (metric_name in metric_names) {
+    # Only the columns below are kept; params / map / metric are not needed.
+    status_rows <- tasks[tasks$metric == metric_name,
+                         list(job_id, date, status,
+                              #vm5_peak_mb = vm5_peak_kib * 1024 / 1e6,
+                              #vm5_mean_mb = vm5_mean_kib * 1024 / 1e6,
+                              num_reports,
+                              seconds,
+                              allocated_mass, num_rappor)]
+
+    # Newest first. Alphabetical order is enough for YYYY-MM-DD dates.
+    status_rows <- status_rows[order(status_rows$date, decreasing = TRUE), ]
+
+    status_path <- file.path(output_dir, metric_name, 'status.csv')
+    write.csv(status_rows, file = status_path, row.names = FALSE)
+    Log("Wrote %s", status_path)
+  }
+
+  # Pass 2: narrow CSVs that exist only for plotting with dygraphs. TODO: can
+  # dygraphs do something smarter? Maybe you need to select the column in
+  # JavaScript, and pass it an array, rather than CSV text.
+  for (metric_name in metric_names) {
+    reports <- tasks[tasks$metric == metric_name, list(date, num_reports)]
+    reports_path <- file.path(output_dir, metric_name, 'num_reports.csv')
+    # NOTE: dygraphs (only in Firefox?) doesn't like the quotes around
+    # "2015-04-03". In general, we can't turn off quotes, because strings with
+    # double quotes will be invalid CSV files. But in this case, we only have
+    # date and number columns, so we can. dygraphs is mistaken here.
+    write.csv(reports, file = reports_path, row.names = FALSE, quote = FALSE)
+    Log("Wrote %s", reports_path)
+
+    # Unallocated mass is the complement of allocated mass.
+    # TODO: Write the other 2 vars too?
+    mass <- tasks[tasks$metric == metric_name,
+                  list(date,
+                       unallocated_mass = 1.0 - allocated_mass)]
+
+    mass_path <- file.path(output_dir, metric_name, 'mass.csv')
+    write.csv(mass, file = mass_path, row.names = FALSE, quote = FALSE)
+    Log("Wrote %s", mass_path)
+  }
+}
+
+# Render plot object `p` into a PNG file.
+#
+# Args:
+#   p: plot object handed to plot().
+#   outdir: directory that receives the image.
+#   filename: file name of the image inside outdir.
+#   width, height: image size passed to png().
+WritePlot <- function(p, outdir, filename, width = 800, height = 600) {
+  filename <- file.path(outdir, filename)
+  png(filename, width = width, height = height)
+  # Close the device even when plotting fails; otherwise the png device stays
+  # open and later plots would be drawn into the wrong file.
+  tryCatch(plot(p), finally = dev.off())
+  Log('Wrote %s', filename)
+}
+
+# Abort the whole script when a histogram input has no usable values. Without
+# this check ggplot blows up with an unintuitive error message.
+#
+# Args:
+#   v: vector about to be plotted. If every element is NA (or there are no
+#     elements), a FATAL line is logged and R exits with status 1.
+CheckHistogramInput <- function(v) {
+  if (any(!is.na(v))) {
+    return(invisible(NULL))
+  }
+  # deparse(substitute(v)) recovers the expression the caller passed, so the
+  # message names the offending column.
+  Log('FATAL: All values in %s are NA (no successful runs?)',
+      deparse(substitute(v)))
+  quit(status = 1)
+}
+
+# Draw one histogram of `values` and save it as <output_dir>/<filename>.
+# The y axis is always labelled "number of tasks".
+WriteOneHistogram <- function(values, title, x_label, output_dir, filename) {
+  # ggplot() + geom_histogram() replaces qplot(), which is deprecated as of
+  # ggplot2 3.4.0 and emits a warning on every call.
+  p <- ggplot(data.frame(value = values), aes(x = value)) +
+    geom_histogram() +
+    ggtitle(title) +
+    xlab(x_label) +
+    ylab("number of tasks")
+  WritePlot(p, output_dir, filename)
+}
+
+# Write the task histograms for 'dist' jobs as PNG files in output_dir:
+# allocated_mass.png, num_rappor.png, num_reports.png, seconds.png and, when
+# memory data is present, memory.png.
+#
+# Args:
+#   s: task summary data frame. Columns read here: allocated_mass, num_rappor,
+#     num_reports, seconds and vm5_peak_kib.
+#   output_dir: directory that receives the images.
+#
+# The first four inputs are validated with CheckHistogramInput, which exits
+# the script when a column has no non-NA value.
+WriteDistHistograms <- function(s, output_dir) {
+  # CheckHistogramInput is called with the s$... expression directly because
+  # it logs the text of its argument.
+  CheckHistogramInput(s$allocated_mass)
+  WriteOneHistogram(s$allocated_mass, "Allocated Mass by Task",
+                    "allocated mass", output_dir, 'allocated_mass.png')
+
+  CheckHistogramInput(s$num_rappor)
+  WriteOneHistogram(s$num_rappor, "Detected Strings by Task",
+                    "detected strings", output_dir, 'num_rappor.png')
+
+  CheckHistogramInput(s$num_reports)
+  WriteOneHistogram(s$num_reports / 1e6, "Raw Reports by Task",
+                    "millions of reports", output_dir, 'num_reports.png')
+
+  CheckHistogramInput(s$seconds)
+  WriteOneHistogram(s$seconds, "Analysis Duration by Task",
+                    "seconds", output_dir, 'seconds.png')
+
+  # NOTE: Skipping this for 'series' jobs.
+  # Memory data is optional, so a missing column is skipped rather than fatal.
+  if (sum(!is.na(s$vm5_peak_kib)) > 0) {
+    # KiB -> MB (1e6 bytes)
+    WriteOneHistogram(s$vm5_peak_kib * 1024 / 1e6, "Peak Memory Usage by Task",
+                      "Peak megabytes (1e6 bytes) of memory",
+                      output_dir, 'memory.png')
+  }
+}
+
+# Produce every 'dist' output, in this order: per-metric CSV files,
+# histograms, then the aggregated overview.csv.
+#
+# Args:
+#   s: task summary data frame, passed unchanged to each writer.
+#   output_dir: root output directory, passed unchanged to each writer.
+#
+# Returns:
+#   The value of WriteDistOverview (the per-metric overview table).
+#
+# Note: WriteDistHistograms validates its inputs with CheckHistogramInput,
+# which quits the script with status 1 when a column contains only NA; in that
+# case overview.csv is never written.
+ProcessAllDist <- function(s, output_dir) {
+ Log('dist: Writing per-metric status.csv')
+ WriteDistMetricStatus(s, output_dir)
+
+ Log('dist: Writing histograms')
+ WriteDistHistograms(s, output_dir)
+
+ Log('dist: Writing aggregated overview.csv')
+ WriteDistOverview(s, output_dir)
+}
+
+# Write assoc-overview.csv, the single CSV file loaded by assoc-overview.html,
+# with one aggregated row per metric.
+#
+# Args:
+#   summary: task summary data frame. Columns read here: metric, date,
+#     num_reports, status, total_elapsed_seconds and em_elapsed_seconds.
+#   output_dir: directory in which assoc-overview.csv is written.
+#
+# Returns:
+#   The aggregated data.table, ordered case-insensitively by metric name.
+WriteAssocOverview <- function(summary, output_dir) {
+  tasks <- data.table(summary)  # grouped aggregation is simpler in data.table
+
+  overview <- tasks[, list(
+    #params_file = unique(params_file),
+    #map_file = unique(map_file),
+
+    days = length(date),
+    max_num_reports = MaybeMax(num_reports),
+
+    # how many rows ended in each status
+    ok = sum(status == 'OK'),
+    fail = sum(status == 'FAIL'),
+    timeout = sum(status == 'TIMEOUT'),
+    skipped = sum(status == 'SKIPPED'),
+
+    mean_total_secs = MaybeMean(total_elapsed_seconds),
+    mean_em_secs = MaybeMean(em_elapsed_seconds)
+
+  ), by = list(metric)]
+
+  # Order rows by metric name, ignoring case.
+  overview <- overview[order(tolower(overview$metric)), ]
+
+  csv_path <- file.path(output_dir, 'assoc-overview.csv')
+  write.csv(overview, file = csv_path, row.names = FALSE)
+  Log("Wrote %s", csv_path)
+
+  overview
+}
+
+# Write the CSV files loaded by assoc-metric.html -- that is, one
+# metric-status.csv for each metric name, stored under <output_dir>/<metric>/.
+# Each file has one aggregated row per (var1, var2) pair of that metric.
+#
+# Args:
+#   summary: task summary data frame. Columns read here: metric, var1, var2,
+#     date, num_reports, status, total_elapsed_seconds and em_elapsed_seconds.
+#   output_dir: root directory that holds one subdirectory per metric.
+WriteAssocMetricStatus <- function(summary, output_dir) {
+  s <- data.table(summary)
+  csv_list <- unique(s[, list(metric)])
+  # seq_len() gives an empty sequence when there are no metrics; 1:nrow()
+  # would iterate over c(1, 0) and process a nonexistent row.
+  for (i in seq_len(nrow(csv_list))) {
+    u <- csv_list[i, ]
+    # Aggregate this metric's rows into one line per (var1, var2) pair.
+    by_pair <- s[s$metric == u$metric,
+                 list(days = length(date),
+                      max_num_reports = MaybeMax(num_reports),
+
+                      # summarize status
+                      ok = sum(status == 'OK'),
+                      fail = sum(status == 'FAIL'),
+                      timeout = sum(status == 'TIMEOUT'),
+                      skipped = sum(status == 'SKIPPED'),
+
+                      mean_total_secs = MaybeMean(total_elapsed_seconds),
+                      mean_em_secs = MaybeMean(em_elapsed_seconds)
+                 ),
+                 by = list(var1, var2)]
+
+    # Case insensitive sort by var1 name
+    by_pair <- by_pair[order(tolower(by_pair$var1)), ]
+
+    csv_path <- file.path(output_dir, u$metric, 'metric-status.csv')
+    write.csv(by_pair, file = csv_path, row.names = FALSE)
+    Log("Wrote %s", csv_path)
+  }
+}
+
+# Relative output directory of one association pair:
+# <metric>/<var1>_X_<var2>, where every literal '..' in var2 is replaced by
+# '_'. This naming convention is in task_spec.py AssocTaskSpec.
+FormatAssocRelPath <- function(metric, var1, var2) {
+  # fixed = TRUE: '..' is matched literally, not as a regular expression.
+  file.path(metric,
+            sprintf('%s_X_%s', var1, gsub('..', '_', var2, fixed = TRUE)))
+}
+
+# Write the CSV files loaded by assoc-pair.html -- that is, one pair-status.csv
+# for each (metric, var1, var2) pair. A pair-metadata.txt holding the raw
+# metric / var1 / var2 values, one per line, is written next to each CSV.
+#
+# Args:
+#   summary: task summary data frame. Columns read here: metric, var1, var2,
+#     job_id, date, status, num_reports, d1, d2, total_elapsed_seconds and
+#     em_elapsed_seconds.
+#   output_dir: root output directory; files go into the subdirectory given by
+#     FormatAssocRelPath.
+WriteAssocPairStatus <- function(summary, output_dir) {
+
+  s <- data.table(summary)
+
+  csv_list <- unique(s[, list(metric, var1, var2)])
+  Log('CSV list:')
+  print(csv_list)
+
+  # Loop over the unique (metric, var1, var2) triples and write a CSV for each
+  # one. seq_len() gives an empty sequence when there are none; 1:nrow() would
+  # iterate over c(1, 0) and process a nonexistent row.
+  for (i in seq_len(nrow(csv_list))) {
+    u <- csv_list[i, ]
+
+    # Select the columns shown for this pair. Don't need params / map / metric.
+    subframe <- s[s$metric == u$metric & s$var1 == u$var1 & s$var2 == u$var2,
+                  list(job_id, date, status,
+                       num_reports, d1, d2,
+                       total_elapsed_seconds,
+                       em_elapsed_seconds)]
+
+    # Sort by descending date. Alphabetical sort works fine for YYYY-MM-DD.
+    subframe <- subframe[order(subframe$date, decreasing = TRUE), ]
+
+    pair_rel_path <- FormatAssocRelPath(u$metric, u$var1, u$var2)
+
+    csv_path <- file.path(output_dir, pair_rel_path, 'pair-status.csv')
+    write.csv(subframe, file = csv_path, row.names = FALSE)
+    Log("Wrote %s", csv_path)
+
+    # Write a file with the raw variable names. Parsed by ui.sh, to pass to
+    # csv_to_html.py.
+    meta_path <- file.path(output_dir, pair_rel_path, 'pair-metadata.txt')
+
+    # NOTE: The conversion from data.table to character vector requires
+    # stringsAsFactors to work correctly!
+    lines <- as.character(u)
+    writeLines(lines, con = meta_path)
+    Log("Wrote %s", meta_path)
+  }
+}
+
+# Produce every 'assoc' output, in this order: pair-status.csv (plus
+# pair-metadata.txt) for each variable pair, metric-status.csv for each
+# metric, then the aggregated overview.
+#
+# Args:
+#   s: task summary data frame, passed unchanged to each writer.
+#   output_dir: root output directory, passed unchanged to each writer.
+#
+# Returns:
+#   The value of WriteAssocOverview (the per-metric overview table).
+ProcessAllAssoc <- function(s, output_dir) {
+ Log('assoc: Writing pair-status.csv for each variable pair in each metric')
+ WriteAssocPairStatus(s, output_dir)
+
+ Log('assoc: Writing metric-status.csv for each metric')
+ WriteAssocMetricStatus(s, output_dir)
+
+ # The file written by WriteAssocOverview is named assoc-overview.csv.
+ Log('assoc: Writing aggregated overview.csv')
+ WriteAssocOverview(s, output_dir)
+}
+
+# Script entry point.
+#
+# Args:
+#   argv: character vector of command-line arguments:
+#     argv[[1]] action, either 'dist' or 'assoc';
+#     argv[[2]] path of the input summary CSV;
+#     argv[[3]] output directory.
+#
+# Stops with a usage message when fewer than three arguments are given, and
+# with 'Invalid action ...' for an unknown action. Both checks run before any
+# input is read.
+main <- function(argv) {
+  if (length(argv) < 3) {
+    stop('Usage: metric_status.R <dist|assoc> <input.csv> <output_dir>')
+  }
+
+  action <- argv[[1]]
+  input <- argv[[2]]
+  output_dir <- argv[[3]]
+
+  if (!(action %in% c('dist', 'assoc'))) {
+    stop(sprintf('Invalid action %s', action))
+  }
+
+  # increase ggplot font size globally
+  theme_set(theme_grey(base_size = 16))
+
+  summary <- read.csv(input)
+  if (action == 'dist') {
+    ProcessAllDist(summary, output_dir)
+  } else {
+    ProcessAllAssoc(summary, output_dir)
+  }
+
+  Log('Done')
+}
+
+# Run main() only when this file is evaluated at top level, i.e. when no
+# function call frames are active (sys.frames() is then empty). Loading the
+# file through a function call such as source() therefore only defines the
+# functions. commandArgs(TRUE) passes just the trailing command-line
+# arguments to main().
+if (length(sys.frames()) == 0) {
+ main(commandArgs(TRUE))
+}