1 files changed, 218 insertions, 0 deletions
diff --git a/pipeline/csv_to_html.py b/pipeline/csv_to_html.py
new file mode 100755
index 0000000..e4d76ae
--- /dev/null
+++ b/pipeline/csv_to_html.py
@@ -0,0 +1,218 @@
+#!/usr/bin/python
+"""Reads a CSV file on stdin, and prints an HTML table on stdout.
+
+The static HTML can then be made dynamic with JavaScript, e.g. jQuery
+DataTable.
+
+Use Cases:
+
+  - overview.csv -- each row is a metric
+    - links: to metric page
+
+  - status.csv -- each row is a day
+    - links: to log.txt, to results.html
+"""
+
+import cgi
+import csv
+import optparse
+import sys
+
+import util
+
+
+def CreateOptionsParser():
+  """Build the optparse.OptionParser that defines this tool's flags."""
+  p = optparse.OptionParser()
+
+  p.add_option(
+      '--col-format', dest='col_formats', metavar="'COLNAME FMT'", type='str',
+      default=[], action='append',
+      help='Add HTML links to the named column, using the given Python '
+           '.format() string')
+
+  p.add_option(
+      '--def', dest='defs', metavar="'NAME VALUE'", type='str',
+      default=[], action='append',
+      help='Define variables for use in format strings')
+
+  p.add_option(
+      '--as-percent', dest='percent_cols', metavar="COLNAME", type='str',
+      default=[], action='append',
+      help='Format this floating point column as a percentage string')
+
+  # TODO: We could include this by default, and then change all the HTML to
+  # have <div> placeholders instead of <table>.
+  p.add_option(
+      '--table', dest='table', default=False, action='store_true',
+      help='Add <table></table> tags (useful for testing)')
+
+  return p
+
+
+def ParseSpec(arg_list):
+  """Turn a list of 'NAME VALUE' strings into a NAME -> VALUE dictionary."""
+  # Only the first space separates, so VALUE may itself contain spaces.  A
+  # string with no space at all is rejected with RuntimeError.
+  pairs = {}
+  for spec in arg_list:
+    key, sep, rest = spec.partition(' ')
+    if not sep:
+      raise RuntimeError('Invalid column format %r' % spec)
+    pairs[key] = rest
+  return pairs
+
+
+def PrintRow(row, col_names, col_formats, defs, percent_cols):
+  """Print a CSV row as HTML <td> cells, using the given formatting.
+
+  Args:
+    row: list of cell strings for one CSV record.
+    col_names: header names; row[i] is treated as being under col_names[i].
+    col_formats: dict of column name -> .format() template for its cells.
+    defs: dict of extra variables made available to the templates.
+    percent_cols: names of columns whose numbers are shown as percentages.
+
+  Returns:
+    An array of booleans indicating whether each cell is a number.
+
+  Raises:
+    RuntimeError: if the row has more cells than there are column names.
+  """
+  if len(row) > len(col_names):
+    raise RuntimeError('Row has %d cells but the header has %d: %r' %
+                       (len(row), len(col_names), row))
+  is_number_flags = [False] * len(col_names)
+  # Escape quotes too, since templates may place values inside attributes.
+  escaped_row = [cgi.escape(c, True) for c in row]
+
+  for i, cell in enumerate(row):
+    cell_str = cell  # text to display; replaced below for numeric cells
+    css_class = ''  # CSS class for the cell.
+    col_name = col_names[i]  # column that the cell is under
+
+    # Percent columns always use the percentage form, even for cells that
+    # look like an int (e.g. "1" -> "100.0%").  Other columns prefer the int
+    # form and fall back to a float with 3 digits of precision.
+    try:
+      if col_name in percent_cols:
+        cell_str = '{:.1f}%'.format(float(cell) * 100)
+      else:
+        try:
+          cell_str = '{:,}'.format(int(cell))
+        except ValueError:
+          cell_str = '{:.3f}'.format(float(cell))
+      css_class = 'num'
+      is_number_flags[i] = True
+    except ValueError:
+      pass
+
+    # Special CSS class for R NA values.
+    if cell_str.strip() == 'NA':
+      css_class = 'num na'  # num should right justify; na should make it red
+      is_number_flags[i] = True
+
+    if css_class:
+      print '    <td class="{}">'.format(css_class),
+    else:
+      print '    <td>',
+
+    cell_safe = cgi.escape(cell_str, True)
+    # If the column has a format string, render the cell through it.
+    fmt = col_formats.get(col_name)  # e.g. "../{date}.html"
+    if fmt:
+      # Templates may use the defs variables and any escaped cell of this row.
+      bindings = dict(defs)
+      bindings.update(zip(col_names, escaped_row))
+      bindings[col_name] = cell_safe
+      print fmt.format(**bindings),  # no newline
+    else:
+      print cell_safe,  # no newline
+
+    print '</td>'
+
+  return is_number_flags
+
+
+def ReadCsv(f):
+  """Parse CSV from the file object f into (col_names, rows).
+
+  The first record is taken to be the header and every later record is data.
+  Input with no records at all yields two empty lists.
+  """
+  # Read every record up front so the header can be split from the data.
+  records = list(csv.reader(f))
+  if not records:
+    return [], []
+  header = records[0]
+  body = records[1:]
+  return header, body
+
+
+def PrintColGroup(col_names, col_is_numeric):
+  """Print HTML colgroup element, used for JavaScript sorting.
+
+  col_is_numeric[i] selects the sort type emitted for col_names[i].
+  """
+  print '<colgroup>'
+  for i, col in enumerate(col_names):
+    sort_type = 'number' if col_is_numeric[i] else 'case-insensitive'
+    # NOTE: id is a comment only; not used.  Column names come from the CSV,
+    # so escape them (quotes included) before placing them in an attribute.
+    col_id = cgi.escape(col, True)
+    print '  <col id="{}" type="{}" />'.format(col_id, sort_type)
+  print '</colgroup>'
+
+
+def main(argv):
+  """Convert the CSV on stdin to HTML on stdout, as directed by argv flags."""
+  (opts, argv) = CreateOptionsParser().parse_args(argv)
+
+  col_formats = ParseSpec(opts.col_formats)
+  defs = ParseSpec(opts.defs)
+
+  col_names, rows = ReadCsv(sys.stdin)
+
+  for col in opts.percent_cols:
+    if col not in col_names:
+      raise RuntimeError('--as-percent %s is not a valid column' % col)
+
+  # By default, we don't print the <table> bit -- that's up to the host page
+  if opts.table:
+    print '<table>'
+
+  print '<thead>'
+  for col in col_names:
+    # change _ to space so long column names can wrap
+    print '  <td>%s</td>' % cgi.escape(col.replace('_', ' '))
+  print '</thead>'
+
+  # Assume all columns are numeric at first.  Look at each row for non-numeric
+  # values.
+  col_is_numeric = [True] * len(col_names)
+
+  print '<tbody>'
+  for row in rows:
+    print '  <tr>'
+    is_number_flags = PrintRow(row, col_names, col_formats, defs,
+                               opts.percent_cols)
+
+    # If one cell in a column is not a number, then the whole column isn't.
+    for (i, is_number) in enumerate(is_number_flags):
+      if not is_number:
+        col_is_numeric[i] = False
+    print '  </tr>'
+  print '</tbody>'
+
+  PrintColGroup(col_names, col_is_numeric)
+
+  if opts.table:
+    print '</table>'
+
+
+if __name__ == '__main__':
+  try:
+    main(sys.argv)
+  except RuntimeError as err:  # user-facing failure: report it and exit 1
+    sys.stderr.write('FATAL: %s\n' % err)
+    sys.exit(1)