pipeline/csv_to_html.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218

#!/usr/bin/python
"""Reads a CSV file on stdin, and prints an an HTML table on stdout.

The static HTML can then be made dynamic with JavaScript, e.g. jQuery
DataTable.

Use Cases:

  - overview.csv -- each row is a metric
    - links: to metric page

  - status.csv -- each row is a day
    - links: to log.txt, to results.html
"""

import cgi
import csv
import optparse
import sys

import util


def CreateOptionsParser():
  """Build the command-line option parser for this script.

  Returns:
    An optparse.OptionParser with three repeatable string options
    (--col-format, --def, --as-percent), each of which accumulates its values
    in a list, and the boolean --table flag.
  """
  parser = optparse.OptionParser()

  # The repeatable options differ only in flag, destination, metavar and help
  # text, so they are declared as data and registered in one loop.
  repeatable_options = [
      ('--col-format', 'col_formats', "'COLNAME FMT'",
       'Add HTML links to the named column, using the given Python '
       '.format() string'),
      ('--def', 'defs', "'NAME VALUE'",
       'Define varaibles for use in format strings'),
      ('--as-percent', 'percent_cols', "COLNAME",
       'Format this floating point column as a percentage string'),
  ]
  for flag, dest, metavar, help_text in repeatable_options:
    # A fresh [] is created on every iteration, so no two options share a
    # default list.
    parser.add_option(
        flag, dest=dest, metavar=metavar, type='str', default=[],
        action='append', help=help_text)

  # TODO: We could include this by default, and then change all the HTML to
  # have <div> placeholders instead of <table>.
  parser.add_option(
      '--table', dest='table', default=False, action='store_true',
      help='Add <table></table> tags (useful for testing)')

  return parser


def ParseSpec(arg_list):
  """Turn a list of 'NAME VALUE' strings into a {NAME: VALUE} dictionary.

  Each entry is split at its first space, so VALUE may itself contain spaces.
  When a name appears more than once, the last entry wins.

  Raises:
    RuntimeError: if an entry contains no space at all.
  """
  spec = {}
  for entry in arg_list:
    key, separator, rest = entry.partition(' ')
    if not separator:
      # No space means there is no VALUE part to bind.
      raise RuntimeError('Invalid column format %r' % entry)
    spec[key] = rest
  return spec


def _FormatCell(cell, as_percent):
  """Return (display string, CSS class) for one raw CSV cell.

  The CSS class is '' for cells that are not treated as numbers.
  """
  if not as_percent:
    # Integers get thousands separators.  This is skipped for percentage
    # columns, where a whole number such as '1' must still become '100.0%'.
    try:
      return '{:,}'.format(int(cell)), 'num'
    except ValueError:
      pass

  try:
    cell_float = float(cell)
  except ValueError:
    cell_float = None

  if cell_float is not None:
    if as_percent:
      return '{:.1f}%'.format(cell_float * 100), 'num'
    # Arbitrarily use 3 digits of precision for display.
    return '{:.3f}'.format(cell_float), 'num'

  # R writes missing values as NA.  'num' right-justifies the cell and 'na'
  # lets the stylesheet highlight it.
  if cell.strip() == 'NA':
    return cell, 'num na'

  return cell, ''


def PrintRow(row, col_names, col_formats, defs, percent_cols):
  """Print a CSV row as HTML <td> cells, using the given formatting.

  The surrounding <tr> tags are not printed here.

  Args:
    row: list of cell strings for one data row; row[i] belongs to
      col_names[i].
    col_names: list of column names from the CSV header.
    col_formats: dict mapping a column name to a str.format() template used
      to render cells of that column.  The template may refer to any column
      name or to any name in defs.
    defs: dict of extra name -> value bindings for the templates.  These are
      inserted as-is (not HTML-escaped).
    percent_cols: column names whose numeric cells are shown as percentages.

  Returns:
    A list of booleans, one per column, indicating whether each cell is a
    number.  'NA' cells count as numbers; columns for which the row has no
    cell are reported as False.

  Raises:
    RuntimeError: if the row has more cells than there are column names.
  """
  # Function-scope import so this block carries its own dependency.  escape()
  # replaces &, < and > and can also be given extra entities to replace.
  from xml.sax.saxutils import escape

  if len(row) > len(col_names):
    raise RuntimeError(
        'Row has %d cells but the header only has %d columns: %r' %
        (len(row), len(col_names), row))

  # Values substituted into a template may land inside an attribute such as
  # href="...", so quotes are escaped there in addition to &, < and >.
  attr_entities = {'"': '&quot;', "'": '&#x27;'}

  # Bindings shared by every template in this row, built once per row.
  # Column values take precedence over defs of the same name.
  row_bindings = None
  if col_formats:
    row_bindings = dict(defs)
    row_bindings.update(
        zip(col_names, [escape(c, attr_entities) for c in row]))

  is_number_flags = [False] * len(col_names)
  write = sys.stdout.write

  for i, cell in enumerate(row):
    col_name = col_names[i]  # column that the cell is under
    cell_str, css_class = _FormatCell(cell, col_name in percent_cols)
    is_number_flags[i] = bool(css_class)

    if css_class:
      open_tag = '    <td class="{}">'.format(css_class)
    else:
      open_tag = '    <td>'

    fmt = col_formats.get(col_name)  # e.g. "../{date}.html"
    if fmt:
      bindings = dict(row_bindings)
      # Within its own column, the template sees the formatted cell text
      # rather than the raw value.
      bindings[col_name] = escape(cell_str, attr_entities)
      content = fmt.format(**bindings)
    else:
      content = escape(cell_str)

    # One space on each side of the content, written explicitly.
    write('%s %s </td>\n' % (open_tag, content))

  return is_number_flags


def ReadCsv(f):
  """Parse CSV data from f into a header and a list of data rows.

  Args:
    f: anything csv.reader() accepts, e.g. an open file.

  Returns:
    A (col_names, rows) pair.  col_names is the first CSV row, taken to be the
    header, and rows is a list of all remaining rows.  Both are empty lists
    when the input has no rows at all.
  """
  reader = csv.reader(f)
  # Pull off the header; fall back to an empty header for empty input.
  header = next(reader, [])
  return header, list(reader)


def PrintColGroup(col_names, col_is_numeric):
  """Print an HTML <colgroup> element, used for JavaScript sorting.

  One <col> is printed per column.  Its type attribute is 'number' when the
  matching entry of col_is_numeric is true and 'case-insensitive' otherwise.

  Args:
    col_names: list of column names.
    col_is_numeric: list of booleans, indexed like col_names.
  """
  # Function-scope import so this block carries its own dependency.
  from xml.sax.saxutils import escape

  write = sys.stdout.write
  write('<colgroup>\n')
  for i, col in enumerate(col_names):
    # The type attribute is what selects the sort order.
    sort_type = 'number' if col_is_numeric[i] else 'case-insensitive'

    # NOTE: id is a comment only; not used.  It is still escaped, including
    # double quotes, so an unusual column name cannot break the attribute.
    col_id = escape(col, {'"': '&quot;'})
    write('  <col id="{}" type="{}" />\n'.format(col_id, sort_type))
  write('</colgroup>\n')


def main(argv):
  (opts, argv) = CreateOptionsParser().parse_args(argv)

  col_formats = ParseSpec(opts.col_formats)
  defs = ParseSpec(opts.defs)

  col_names, rows = ReadCsv(sys.stdin)

  for col in opts.percent_cols:
    if col not in col_names:
      raise RuntimeError('--percent-col %s is not a valid column' % col)

  # By default, we don't print the <table> bit -- that's up to the host page
  if opts.table:
    print '<table>'

  print '<thead>'
  for col in col_names:
    # change _ to space so long column names can wrap
    print '  <td>%s</td>' % cgi.escape(col.replace('_', ' '))
  print '</thead>'

  # Assume all columns are numeric at first.  Look at each row for non-numeric
  # values.
  col_is_numeric = [True] * len(col_names)

  print '<tbody>'
  for row in rows:
    print '  <tr>'
    is_number_flags = PrintRow(row, col_names, col_formats, defs,
                               opts.percent_cols)

    # If one cell in a column is not a number, then the whole cell isn't.
    for (i, is_number) in enumerate(is_number_flags):
      if not is_number:
        col_is_numeric[i] = False

    print '  </tr>'
  print '</tbody>'

  PrintColGroup(col_names, col_is_numeric)

  if opts.table:
    print '</table>'


if __name__ == '__main__':
  # RuntimeError is how this script reports bad input.  Turn it into a
  # one-line message on stderr and exit status 1 instead of a traceback.
  try:
    main(sys.argv)
  except RuntimeError as err:
    sys.stderr.write('FATAL: %s' % err + '\n')
    sys.exit(1)