Package qcsv ::
Module qcsv
|
|
1 import csv
2
3
def read(fname, delimiter=',', skip_header=False):
    """
    read loads cell data, column headers and type information for each column
    given a file path to a CSV formatted file.

    All cells have left and right whitespace trimmed.

    All rows MUST be the same length.

    delimiter is the string that separates each field in a row.

    If skip_header is set, then no column headers are read, and column names
    are set to their corresponding indices (as strings).

    Returns a triple (types, names, rows): the mapping of column name to
    inferred type, the list of column names, and the rows with every cell
    cast to its column's type.
    """
    # Pass the fname parameter through; referring to any other name here
    # would only resolve if a same-named global happened to exist.
    names, rows = data(fname, delimiter, skip_header)
    types = column_types(names, rows)
    rows = cast(types, names, rows)
    return types, names, rows
22
23
def data(fname, delimiter=',', skip_header=False):
    """
    data loads cell data and column headers.

    All cells have left and right whitespace trimmed.

    All rows MUST be the same length; an AssertionError is raised for the
    first row whose length differs from the number of column names.

    delimiter and skip_header are described in read.

    Returns a pair (names, rows) where names is a list of strings and rows
    is a list of lists of strings. An empty file yields ([], []).
    """
    names = []
    rows = []
    # The with block guarantees the file is closed, even if a row fails the
    # length check below.
    with open(fname) as handle:
        reader = csv.reader(handle, delimiter=delimiter)
        if not skip_header:
            # next(reader, None) works on both Python 2 and 3 and lets an
            # empty file fall through instead of raising StopIteration.
            header = next(reader, None)
            if header is None:
                return names, rows
            names = [cell.strip() for cell in header]

        for i, row in enumerate(reader):
            # With no header (or an empty header line), name the columns
            # after their indices, using the first row to learn the width.
            if len(names) == 0:
                names = [str(index) for index in range(len(row))]
            assert len(row) == len(names), \
                'The length of row %d is %d, but others rows have length %d' \
                % (i, len(row), len(names))

            rows.append([cell.strip() for cell in row])

    return names, rows
53
54
def _infer_cell_type(col):
    """
    _infer_cell_type returns the type a single cell's text can be read as:
    None for an empty cell, int if int() accepts it, float if only float()
    accepts it, and str otherwise.
    """
    if len(col) == 0:
        return None
    # Try the narrowest conversion first so "3" is an int, not a float.
    for typ in (int, float):
        try:
            typ(col)
        except ValueError:
            continue
        return typ
    return str


def column_types(names, rows):
    """
    column_types infers type information from the columns in rows. Types are
    stored as either a Python type conversion function (str, int or float) or
    as a None value.

    A column has type None if and only if all cells in the column are empty.
    (Cells are empty if the length of its value is zero after left and right
    whitespace has been trimmed.)

    A column has type float if and only if all cells in the column are empty,
    integers or floats AND at least one value is a float.

    A column has type int if and only if all cells in the column are empty or
    integers AND at least one value is an int.

    A column has type string in any other case.

    Returns a dict mapping each column name in names to its inferred type.
    """
    types = {name: None for name in names}
    for row in rows:
        for i, col in enumerate(row):
            name = names[i]
            prev_typ = types[name]
            next_typ = _infer_cell_type(col)

            # Types only ever widen: None -> int -> float -> str.
            if prev_typ == str or next_typ == str:
                # One non-numeric cell makes the whole column a string.
                types[name] = str
            elif next_typ == float and prev_typ == int:
                # A float among ints promotes the column to float.
                types[name] = float
            elif prev_typ is None and next_typ is not None:
                # First non-empty cell seen in this column.
                types[name] = next_typ
    return types
120
121
def cast(types, names, rows):
    """
    cast type casts all of the values in 'rows' to their corresponding types
    in types.

    The only special case here is missing values or NULL columns. If a value
    is missing or a column has type NULL (i.e., all values are missing), then
    the value is replaced with None, which is Python's version of a NULL value.

    N.B. cast is idempotent. i.e., cast(x) = cast(cast(x)).

    Returns a new list of rows; the rows passed in are not modified.
    """
    new_rows = []
    for row in rows:
        new_row = []
        for i, col in enumerate(row):
            typ = types[names[i]]
            # Comparing against '' identifies empty strings without naming
            # a string base class, so the check runs on Python 2 and 3.
            if typ is None or col is None or col == '':
                new_row.append(None)
            else:
                new_row.append(typ(col))
        new_rows.append(new_row)
    return new_rows
145
146
def convert_missing_cells(types, names, rows, dstr="", dint=0, dfloat=0.0):
    """
    convert_missing_cells changes the values of all NULL cells to the values
    specified by dstr, dint and dfloat. For example, all NULL cells in columns
    with type "string" will be replaced with the value given to dstr.

    Cells in columns whose type is None are left as None.

    NOTE(review): the default values of dstr, dint and dfloat are assumed
    ("", 0 and 0.0); confirm them against the original definition.

    Returns a new list of rows; the rows passed in are not modified.
    """
    new_rows = []
    for row in rows:
        new_row = []
        for i, col in enumerate(row):
            typ = types[names[i]]
            if col is not None or typ is None:
                # Either the cell has a value, or the whole column is NULL
                # and there is no typed replacement to choose from.
                new_row.append(col)
            elif typ == str:
                new_row.append(dstr)
            elif typ == int:
                new_row.append(dint)
            elif typ == float:
                new_row.append(dfloat)
            else:
                assert False, "Unknown type: %s" % typ
        new_rows.append(new_row)
    return new_rows
172
173
def convert_columns(names, rows, **kwargs):
    """
    convert_columns executes converter functions on specific columns, where
    the parameter names for kwargs are the column names, and the parameter
    values are functions of one parameter that return a single value.

    e.g., convert_columns(names, rows, colname=lambda s: s.lower()) would
    convert all values in the column with name 'colname' to lowercase.

    Columns without a converter are copied through unchanged. Returns a new
    list of rows; the rows passed in are not modified.
    """
    new_rows = []
    for row in rows:
        new_row = []
        for i, col in enumerate(row):
            converter = kwargs.get(names[i])
            if converter is None:
                new_row.append(col)
            else:
                new_row.append(converter(col))
        new_rows.append(new_row)
    return new_rows
194
195
def convert_types(types, names, rows, fstr=None, fint=None, ffloat=None):
    """
    convert_types works just like convert_columns, but on types instead of
    specific columns. This function will likely be more useful, since
    sanitization functions are typically type oriented rather than column
    oriented.

    However, when there are specific kinds of columns that need special
    sanitization, convert_columns should be used.

    fstr, fint and ffloat are functions of one parameter applied to every
    cell in columns of type str, int and float respectively; a converter
    left as None leaves those columns unchanged.

    Returns a new list of rows; the rows passed in are not modified.
    """
    converters = {str: fstr, int: fint, float: ffloat}
    new_rows = []
    for row in rows:
        new_row = []
        for i, col in enumerate(row):
            # Columns of type None have no entry, so .get yields None and
            # the cell is copied through untouched.
            converter = converters.get(types[names[i]])
            if converter is None:
                new_row.append(col)
            else:
                new_row.append(converter(col))
        new_rows.append(new_row)
    return new_rows
222
223
def column(types, names, rows, colname):
    """
    column returns the column with name "colname", where the column returned
    is a triple of the column type, the column name and a list of cells in the
    column.

    The name is matched case-insensitively and the first match wins. An
    AssertionError is raised if no column has that name.
    """
    colname = colname.lower()
    colindex = -1
    for i, name in enumerate(names):
        if name.lower() == colname:
            colindex = i
            break
    assert colindex > -1, 'Column name %s does not exist' % colname

    # Index each row directly rather than scanning all of its cells. Rows
    # too short to hold the column contribute no cell.
    colcells = [row[colindex] for row in rows if colindex < len(row)]

    return types[names[colindex]], names[colindex], colcells
245
246
def columns(types, names, rows):
    """
    columns returns a list of all columns in the data set, where each column
    is a triple of its type, name and a list of cells in the column.

    Columns are returned in the same order as names.
    """
    # One independent cell list per column. A comprehension is required
    # here; [[]] * n would alias a single shared list.
    colcells = [[] for _ in names]
    for row in rows:
        for i, col in enumerate(row):
            colcells[i].append(col)

    return [(types[name], name, colcells[i]) for i, name in enumerate(names)]
263
264
def type_str(typ):
    """
    type_str returns a string representation of a column type: "None",
    "float", "int" or "str", and "Unknown" for anything else.
    """
    known = ((None, "None"), (float, "float"), (int, "int"), (str, "str"))
    for candidate, label in known:
        # Identity comparison, so subclasses and look-alikes are "Unknown".
        if typ is candidate:
            return label
    return "Unknown"
278
279
def cell_str(cell_contents):
    """
    cell_str is a convenience function for converting cell contents to a string
    when there are still NULL values. None becomes "NULL"; every other value
    is passed through str().

    N.B. If you choose to work with data while keeping NULL values, you will
    likely need to write more functions similar to this one.
    """
    return "NULL" if cell_contents is None else str(cell_contents)
291
292
def print_data_table(types, names, rows):
    """
    print_data_table is a convenience function for pretty-printing the
    data in tabular format, including header names and type annotations.

    Each column is as wide as its longest cell or header plus two spaces of
    padding, and a rule of dashes separates the header line from the rows.
    """
    padding = 2
    headers = ['%s (%s)' % (name, type_str(types[name])) for name in names]

    # Build a real list: the widths are updated in place below.
    maxlens = [len(header) for header in headers]
    for row in rows:
        for i, col in enumerate(row):
            maxlens[i] = max(maxlens[i], len(cell_str(col)))

    def padded_cell(i, text):
        return text + ' ' * (maxlens[i] - len(text) + padding)

    print(''.join(padded_cell(i, header) for i, header in enumerate(headers)))
    # Size the rule from the column widths, not the header widths, so it
    # spans the table even when a cell is wider than its header.
    print('-' * (sum(maxlens) + len(headers) * padding))
    for row in rows:
        print(''.join(padded_cell(i, cell_str(col))
                      for i, col in enumerate(row)))
319
320
if __name__ == '__main__':
    # Example walk-through of the module.
    # NOTE(review): assumes a "sample.csv" in the working directory with
    # columns named "string1" and "mixed" -- confirm against the sample file.
    f = "sample.csv"

    # These three steps are the same ones read() performs in a single call;
    # they are spelled out here to show each stage.
    names, rows = data(f)
    types = column_types(names, rows)
    rows = cast(types, names, rows)

    print("# Raw data.")
    print_data_table(types, names, rows)
    print('\n')

    # Replace NULL cells with the per-type default values.
    rows = convert_missing_cells(types, names, rows)
    print("# Convert missing cells to arbitrary values")
    print_data_table(types, names, rows)
    print('\n')

    # Lowercase a single column, selected by name.
    rows = convert_columns(names, rows, string1=str.lower)
    print("# Sanitize just one column of data")
    print_data_table(types, names, rows)
    print('\n')

    # Lowercase every cell in every string-typed column.
    rows = convert_types(types, names, rows, fstr=str.lower)
    print("# Sanitize all cells that have type string")
    print_data_table(types, names, rows)
    print('\n')

    # Show the data column by column instead of row by row.
    for typ, name, cells in columns(types, names, rows):
        print('(%s, %s) [%s]' % (name, typ, ', '.join(map(cell_str, cells))))
    print('\n')

    # Fetch a single column by name.
    print(column(types, names, rows, "mixed"))
412