diff options
Diffstat (limited to 'pipeline/csv_to_html.py')
-rwxr-xr-x | pipeline/csv_to_html.py | 218 |
1 files changed, 218 insertions, 0 deletions
diff --git a/pipeline/csv_to_html.py b/pipeline/csv_to_html.py new file mode 100755 index 0000000..e4d76ae --- /dev/null +++ b/pipeline/csv_to_html.py @@ -0,0 +1,218 @@ +#!/usr/bin/python +"""Reads a CSV file on stdin, and prints an an HTML table on stdout. + +The static HTML can then be made made dynamic with JavaScript, e.g. jQuery +DataTable. + +Use Cases: + + - overview.csv -- each row is a metric + - links: to metric page + + - status.csv -- each row is a day + - links: to log.txt, to results.html +""" + +import cgi +import csv +import optparse +import sys + +import util + + +def CreateOptionsParser(): + p = optparse.OptionParser() + + # We are taking a path, and not using stdin, because we read it twice. + p.add_option( + '--col-format', dest='col_formats', metavar="'COLNAME FMT'", type='str', + default=[], action='append', + help='Add HTML links to the named column, using the given Python ' + '.format() string') + + p.add_option( + '--def', dest='defs', metavar="'NAME VALUE'", type='str', + default=[], action='append', + help='Define varaibles for use in format strings') + + p.add_option( + '--as-percent', dest='percent_cols', metavar="COLNAME", type='str', + default=[], action='append', + help='Format this floating point column as a percentage string') + + # TODO: We could include this by default, and then change all the HTML to + # have <div> placeholders instead of <table>. + p.add_option( + '--table', dest='table', default=False, action='store_true', + help='Add <table></table> tags (useful for testing)') + + return p + + +def ParseSpec(arg_list): + """Given an argument list, return a string -> string dictionary.""" + # The format string is passed the cell value. Escaped as HTML? + d = {} + for s in arg_list: + try: + name, value = s.split(' ', 1) + except ValueError: + raise RuntimeError('Invalid column format %r' % s) + d[name] = value + return d + + +def PrintRow(row, col_names, col_formats, defs, percent_cols): + """Print a CSV row as HTML, using the given formatting. + + Returns: + An array of booleans indicating whether each cell is a number. + """ + is_number_flags = [False] * len(col_names) + + for i, cell in enumerate(row): + # The cell as a string. By default we leave it as is; it may be mutated + # below. + cell_str = cell + css_class = '' # CSS class for the cell. + col_name = col_names[i] # column that the cell is under + + # Does the cell look like a float? + try: + cell_float = float(cell) + if col_name in percent_cols: # Floats can be formatted as percentages. + cell_str = '{:.1f}%'.format(cell_float * 100) + else: + # Arbitrarily use 3 digits of precision for display + cell_str = '{:.3f}'.format(cell_float) + css_class = 'num' + is_number_flags[i] = True + except ValueError: + pass + + # Does it look lik an int? + try: + cell_int = int(cell) + cell_str = '{:,}'.format(cell_int) + css_class = 'num' + is_number_flags[i] = True + except ValueError: + pass + + # Special CSS class for R NA values. + if cell_str.strip() == 'NA': + css_class = 'num na' # num should right justify; na should make it red + is_number_flags[i] = True + + if css_class: + print ' <td class="{}">'.format(css_class), + else: + print ' <td>', + + cell_safe = cgi.escape(cell_str) + + # If the cell has a format string, print it this way. + + fmt = col_formats.get(col_name) # e.g. "../{date}.html" + if fmt: + # Copy variable bindings + bindings = dict(defs) + + # Also let the format string use other column names. TODO: Is there a + # more efficient way? + bindings.update(zip(col_names, [cgi.escape(c) for c in row])) + + bindings[col_name] = cell_safe + + print fmt.format(**bindings), # no newline + else: + print cell_safe, # no newline + + print '</td>' + + return is_number_flags + + +def ReadCsv(f): + """Read the CSV file, returning the column names and rows.""" + c = csv.reader(f) + + # The first row of the CSV is assumed to be a header. The rest are data. + col_names = [] + rows = [] + for i, row in enumerate(c): + if i == 0: + col_names = row + continue + rows.append(row) + return col_names, rows + + +def PrintColGroup(col_names, col_is_numeric): + """Print HTML colgroup element, used for JavaScript sorting.""" + print '<colgroup>' + for i, col in enumerate(col_names): + # CSS class is used for sorting + if col_is_numeric[i]: + css_class = 'number' + else: + css_class = 'case-insensitive' + + # NOTE: id is a comment only; not used + print ' <col id="{}" type="{}" />'.format(col, css_class) + print '</colgroup>' + + +def main(argv): + (opts, argv) = CreateOptionsParser().parse_args(argv) + + col_formats = ParseSpec(opts.col_formats) + defs = ParseSpec(opts.defs) + + col_names, rows = ReadCsv(sys.stdin) + + for col in opts.percent_cols: + if col not in col_names: + raise RuntimeError('--percent-col %s is not a valid column' % col) + + # By default, we don't print the <table> bit -- that's up to the host page + if opts.table: + print '<table>' + + print '<thead>' + for col in col_names: + # change _ to space so long column names can wrap + print ' <td>%s</td>' % cgi.escape(col.replace('_', ' ')) + print '</thead>' + + # Assume all columns are numeric at first. Look at each row for non-numeric + # values. + col_is_numeric = [True] * len(col_names) + + print '<tbody>' + for row in rows: + print ' <tr>' + is_number_flags = PrintRow(row, col_names, col_formats, defs, + opts.percent_cols) + + # If one cell in a column is not a number, then the whole cell isn't. + for (i, is_number) in enumerate(is_number_flags): + if not is_number: + col_is_numeric[i] = False + + print ' </tr>' + print '</tbody>' + + PrintColGroup(col_names, col_is_numeric) + + if opts.table: + print '</table>' + + +if __name__ == '__main__': + try: + main(sys.argv) + except RuntimeError, e: + print >>sys.stderr, 'FATAL: %s' % e + sys.exit(1) |