-
Notifications
You must be signed in to change notification settings - Fork 0
/
datautils.py
671 lines (578 loc) · 24 KB
/
datautils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
# Python utility functions for reading in data and storing it in something
# approximating an R data frame.
import sys, os, copy, subprocess
import numpy as np
# Shared error messages for the ListDataFrame input checks.
# error1: used (with TypeError) when the container, or a column handed to
# ListDataFrame, is not a list / NumPy array.
error1 = "Input to ListDataFrame should be a list of lists or list of numpy arrays"
# error2: used (with TypeError) by ListDataFrame.AddNewColumn when the new
# column's length differs from that of the first existing column.
error2 = "Input column is not the same length as existing columns in ListDataFrame"
# Fast line-counting approach to deal with really large files: the file is
# read in large binary chunks and newline bytes are counted, which avoids
# decoding the text or building a list of lines (and needs no external
# "wc" program, so it also works on platforms without one).
def CountLinesInFile( fname ):
    """Return the total number of lines in the file fname.

    Every line is counted (blank lines and comments included).  A final
    line which is not terminated by a newline is counted as well, so the
    result agrees with len(open(fname).readlines()) for files using \\n or
    \\r\\n line endings.

    Raises IOError/OSError if the file cannot be opened or read.
    """
    chunkSize = 1 << 20   # 1 MiB per read keeps memory use flat for huge files
    nLines = 0
    lastByte = b"\n"      # so that an empty file yields 0 lines
    with open(fname, 'rb') as inFile:
        while True:
            chunk = inFile.read(chunkSize)
            if not chunk:
                break
            nLines += chunk.count(b"\n")
            lastByte = chunk[-1:]
    # an unterminated last line has no newline of its own to be counted by
    if lastByte != b"\n":
        nLines += 1
    return nLines
# Slower method which counts "data" lines only:
def CountDataLinesInFile( fname, skip="#" ):
    """Return the number of "data" lines in the file fname.

    A data line is one which is not blank (i.e., contains something other
    than whitespace) and whose very first character is not one of the
    characters in skip.  Note that only the first character of the line is
    tested, so a comment character preceded by spaces does not cause the
    line to be skipped.
    """
    # "with" guarantees the file is closed; the generator avoids building
    # a list of every line just to measure its length
    with open(fname) as inFile:
        return sum(1 for line in inFile
                   if len(line.strip()) > 0 and line[0] not in skip)
class ListDataFrame(object):
    """A class designed to hold a 2D list array (a list of lists, each of the
    latter being "columns" and presumably having the same length), corresponding
    to some table of data. Individual columns should have a single data type,
    but different columns can have different data types (e.g., one column can
    be strings, another integers, and a third floating point numbers).
    Columns can also be NumPy arrays.

    Optionally, a list of column names for the array can be supplied, or
    added later; extra column-name lists can also be added after creation.

    When indexed with one of the column names (e.g., obj["radius"]), it acts
    like a dictionary and returns the corresponding column. When indexed
    with an integer or a slice, it acts like the underlying list of lists.

    If the column names are strings, then they also become attributes
    of the object instance and can be accessed as, e.g., obj.radius -- as
    long as the names are valid Python identifiers (must contain only
    alphanumerics or _ and must start with letter or _).  Only the first
    whitespace-separated word of a name is used for the attribute (so
    "radius (kpc)" is reachable as obj.radius).  Names which would hide the
    object's own attributes or methods (e.g., "data", "dict", "SetColumns")
    are still usable as dictionary keys, but do not become attributes.

    Example: if the first column corresponds to column name "radius", then
    it can be accessed as: obj[0], obj["radius"], or obj.radius (and also
    as obj.data[0]).

    The original list-of-lists is always accessible via obj.data (it is the
    same list object that was passed in, not a copy).

    If you have data that are all floating-point, it's probably better to
    turn it into a big NumPy array and use the ArrayDataFrame class instead.
    """

    # instance attributes holding the frame's own state; a column name must
    # never be allowed to overwrite these via the attribute shortcut
    _RESERVED_ATTRS = ("data", "colNames", "dict", "nCols")

    def __init__(self, dataList, columnNames=None):
        """dataList = list of columns (each a list or NumPy array);
        columnNames = optional list of names, one per column.

        Raises TypeError if dataList is not a list, or if its first element
        is not a list or NumPy array.  An empty list is accepted and gives
        a frame with no columns.
        """
        if not isinstance(dataList, list):
            raise TypeError(error1)
        if dataList and not isinstance(dataList[0], (list, np.ndarray)):
            raise TypeError(error1)
        self.data = dataList
        self.colNames = columnNames
        self.dict = {}
        self.nCols = len(dataList)
        if self.colNames is not None:
            self.SetColumns(columnNames)

    def __getitem__(self, key):
        """Defines behavior of indexing: indexing with strings causes
        internal column-name dictionary to be accessed (returns corresponding
        column of data array); indexing with anything else (e.g., integers or
        slices) is passed on to the data array.
        """
        if isinstance(key, str):
            return self.dict[key]
        return self.data[key]

    def __str__(self):
        outString = str(self.data)
        if self.colNames is not None:
            outString = str(self.colNames) + "\n" + outString
        return outString

    def _AttrNameFor(self, colName):
        """Return the attribute name under which colName may be exposed, or
        None if it should not become an attribute (non-string or blank name,
        or a name which would shadow the frame's own attributes/methods).
        """
        if not isinstance(colName, str):
            return None
        words = colName.split()
        if not words:
            return None
        attrName = words[0]
        if attrName in self._RESERVED_ATTRS or hasattr(type(self), attrName):
            return None
        return attrName

    def _BindName(self, colName, column):
        """Make colName a dictionary key (and, if possible, an attribute)
        referring to column."""
        self.dict[colName] = column
        attrName = self._AttrNameFor(colName)
        if attrName is not None:
            self.__dict__[attrName] = column

    def _BindNames(self, columnNames):
        """Bind columnNames[i] to column i; surplus names (or surplus
        columns) are ignored."""
        for i in range(min(self.nCols, len(columnNames))):
            self._BindName(columnNames[i], self.data[i])

    def SetColumns(self, columnNames):
        """Define the column names (dictionary keys pointing to columns
        within the data frame).

        columnNames should be a list of objects (usually strings).  If it
        has fewer entries than there are columns, the remaining columns are
        left unnamed; extra entries are ignored.  Passing None removes all
        column names.

        If called more than once, erases previous column-name definitions
        (including alternate names and names added via AddColumnName).
        """
        # remove old column-name attributes; _AttrNameFor is the same rule
        # used when they were created, so only column attributes are removed
        for oldName in self.dict:
            oldAttr = self._AttrNameFor(oldName)
            if oldAttr is not None:
                self.__dict__.pop(oldAttr, None)
        self.dict = {}
        if columnNames is not None:
            self._BindNames(columnNames)
        self.colNames = columnNames

    def SetAltColumns(self, columnNames):
        """Define an additional set of column names (dictionary keys pointing
        to columns within the data array) for all columns.

        Does not erase previous column-name definitions, and does not change
        obj.colNames.
        """
        self._BindNames(columnNames)

    def AddColumnName(self, oldName, newName):
        """Add newName as an extra name (dictionary key and attribute) for
        the column currently known as oldName.  obj.colNames is unchanged.

        Raises KeyError if newName is not a string, if newName is already
        one of the primary column names, or if oldName is not a known
        column name.
        """
        if not isinstance(newName, str):
            msg = "New columns names must be strings."
            raise KeyError(msg)
        if self.colNames is not None and newName in self.colNames:
            msg = "%s is already a column name in this ListDataFrame." % newName
            raise KeyError(msg)
        # look in the name dictionary rather than colNames, so that this
        # also works for alternate names and when colNames is None
        if oldName not in self.dict:
            msg = "Column name \"%s\" does not exist." % oldName
            raise KeyError(msg)
        self._BindName(newName, self.dict[oldName])

    def ChangeColumnName(self, oldName, newName):
        """Change the name of one of the columns. Change is propagated into
        the internal attribute dictionary, so obj.newName will return the
        column which obj.oldName formerly returned.

        Because the names are re-defined via SetColumns, any alternate
        column names are erased.  Raises KeyError if oldName is not one of
        the current (primary) column names.
        """
        if self.colNames is None or oldName not in self.colNames:
            msg = "Column name \"%s\" does not exist." % oldName
            raise KeyError(msg)
        # replace (first occurrence of) the name in a copy of the name list
        newList = list(self.colNames)
        newList[newList.index(oldName)] = newName
        # SetColumns removes the attribute for oldName and creates the one
        # for newName; no further clean-up is needed (an extra pop here
        # could delete the attribute just created for newName)
        self.SetColumns(newList)

    def AddNewColumn(self, dataColumn, columnName=None):
        """Adds a new column to the ListDataFrame, along with the column name,
        if supplied. Throws an error (TypeError) if dataColumn is not a list
        or numpy array; also throws an error (TypeError) if the length of
        dataColumn is different from the first existing column.

        The name is only recorded if the frame already has column names
        (obj.colNames is not None).  Existing names, including alternate
        names, are left untouched.
        """
        if not isinstance(dataColumn, (list, np.ndarray)):
            raise TypeError(error1)
        if self.data and len(dataColumn) != len(self.data[0]):
            raise TypeError(error2)
        self.data.append(dataColumn)
        self.nCols += 1
        if columnName is not None and self.colNames is not None:
            # work on a copy so a name list owned by the caller is not changed
            newNames = list(self.colNames)
            newNames.append(columnName)
            self.colNames = newNames
            # bind directly to the new column instead of re-running
            # SetColumns, which would discard alternate names
            self._BindName(columnName, dataColumn)
class ArrayDataFrame(object):
    """A class designed to hold a 2D NumPy floating-point array, and
    optionally a list of column names for the array.  When indexed
    with one of the column names (e.g., obj["radius"]), it acts like a
    dictionary and returns the corresponding column. When indexed with
    an integer or a slice, it acts like the underlying NumPy array.

    If the column names are strings, then they also become attributes
    of the object instance and can be accessed as, e.g., obj.radius -- as
    long as the names are valid Python identifiers (must contain only
    alphanumerics or _ and must start with letter or _).  Only the first
    whitespace-separated word of a name is used for the attribute.  Names
    which would hide the object's own attributes or methods (e.g., "data",
    "dict", "SetColumns") are still usable as dictionary keys, but do not
    become attributes.

    Example: if the first column corresponds to column name "radius", then
    it can be accessed as: obj[:,0], obj["radius"], or obj.radius (and also
    as obj.data[:,0]).  Columns are obtained by slicing (data[:,i]), so for
    an ordinary ndarray they are views onto the underlying array.

    The underlying NumPy array is always accessible as obj.data

    If instantiated without a list of column names, it behaves just
    like an ordinary NumPy array (except for being somewhat slower).
    """

    # instance attributes holding the frame's own state; a column name must
    # never be allowed to overwrite these via the attribute shortcut
    _RESERVED_ATTRS = ("data", "colNames", "dict", "nCols")

    def __init__(self, array, columnNames=None):
        """array = 2D NumPy array; columnNames = optional list of names, one
        per column.  Raises TypeError if array is not a 2D NumPy array.
        """
        if not isinstance(array, np.ndarray) or array.ndim != 2:
            raise TypeError("Input to ArrayDataFrame should be NumPy 2D array")
        self.data = array
        self.colNames = columnNames
        self.dict = {}
        self.nCols = array.shape[1]
        if self.colNames is not None:
            self.SetColumns(columnNames)

    def __getitem__(self, key):
        """Defines behavior of indexing: indexing with strings causes
        internal column-name dictionary to be accessed (returns corresponding
        column of data array); indexing with anything else (e.g., integers or
        slices) is passed on to the data array.
        """
        if isinstance(key, str):
            return self.dict[key]
        return self.data[key]

    def __str__(self):
        outString = str(self.data)
        if self.colNames is not None:
            outString = str(self.colNames) + "\n" + outString
        return outString

    def _AttrNameFor(self, colName):
        """Return the attribute name under which colName may be exposed, or
        None if it should not become an attribute (non-string or blank name,
        or a name which would shadow the frame's own attributes/methods).
        """
        if not isinstance(colName, str):
            return None
        words = colName.split()
        if not words:
            return None
        attrName = words[0]
        if attrName in self._RESERVED_ATTRS or hasattr(type(self), attrName):
            return None
        return attrName

    def _BindNames(self, columnNames):
        """Bind columnNames[i] to column i (data[:,i]) as dictionary key and,
        where possible, attribute; surplus names or columns are ignored."""
        for i in range(min(self.nCols, len(columnNames))):
            colName = columnNames[i]
            column = self.data[:, i]
            self.dict[colName] = column
            attrName = self._AttrNameFor(colName)
            if attrName is not None:
                self.__dict__[attrName] = column

    def SetColumns(self, columnNames):
        """Define the column names (dictionary keys pointing to columns
        within the data array) and record them in obj.colNames.

        If columnNames has fewer entries than there are columns, the
        remaining columns are left unnamed; extra entries are ignored.
        Passing None removes all column names.

        If called more than once, erases previous column-name definitions
        (including alternate names).
        """
        # remove old column-name attributes; _AttrNameFor is the same rule
        # used when they were created, so only column attributes are removed
        for oldName in self.dict:
            oldAttr = self._AttrNameFor(oldName)
            if oldAttr is not None:
                self.__dict__.pop(oldAttr, None)
        self.dict = {}
        if columnNames is not None:
            self._BindNames(columnNames)
        # keep the name list in step with the dictionary, so that __str__
        # and ChangeColumnName see the current names
        self.colNames = columnNames

    def SetAltColumns(self, columnNames):
        """Define an additional set of column names (dictionary keys pointing
        to columns within the data array).

        Does not erase previous column-name definitions, and does not change
        obj.colNames.
        """
        self._BindNames(columnNames)

    def ChangeColumnName(self, oldName, newName):
        """Change the name of one of the columns. Change is propagated into
        the internal attribute dictionary, so obj.newName will return the
        column which obj.oldName formerly returned.

        Because the names are re-defined via SetColumns, any alternate
        column names are erased.  Raises KeyError if oldName is not one of
        the current (primary) column names.
        """
        if self.colNames is None or oldName not in self.colNames:
            msg = "Column name \"%s\" does not exist." % oldName
            raise KeyError(msg)
        # replace (first occurrence of) the name in a copy of the name list
        newList = list(self.colNames)
        newList[newList.index(oldName)] = newName
        # SetColumns removes the attribute for oldName and creates the one
        # for newName; no further clean-up is needed (an extra pop here
        # could delete the attribute just created for newName)
        self.SetColumns(newList)
def ReadTableArray(fileName, skip="#", dataFrame=False, delimiter=None):
    """Read data from fileName, store in a NumPy array.  All values are
    stored as floating-point.  Format is row-major: d[i][j] = d[i,j] =
    row i, column j.  (To access an entire column, use d[:,j].)

    Blank lines are ignored.

    skip = string containing one or more characters; lines beginning with
    any of these characters will be ignored.

    delimiter = column separator (default = None, meaning any whitespace).

    The number of columns is taken from the first data line; extra entries
    on later lines are ignored.  If the file contains no data lines, an
    empty array of shape (0, 0) is returned.

    If dataFrame=True, then the result is an ArrayDataFrame object
    containing the NumPy array.
    """
    # Text mode already converts Mac or DOS/Windows line endings to \n
    # (universal newlines); the old 'rU' mode flag is rejected by newer
    # Python versions.  "with" makes sure the file gets closed.
    with open(fileName, 'r') as inFile:
        dlines = [ line.rstrip() for line in inFile
                   if len(line.strip()) > 0 and line[0] not in skip ]

    nrows = len(dlines)
    if nrows == 0:
        dataArray = np.zeros((0, 0))
    else:
        ncols = len(dlines[0].split(delimiter))
        dataArray = np.zeros((nrows, ncols))
        for i, dline in enumerate(dlines):
            pieces = dline.split(delimiter)
            for j in range(ncols):
                dataArray[i, j] = float(pieces[j])

    if dataFrame:
        return ArrayDataFrame(dataArray)
    else:
        return dataArray
def ExtractSubLists( textList, nSubLists ):
    """Convert a list of "{x1, ..., xn}" strings into per-element columns.

    Each string in textList is stripped of surrounding whitespace and
    braces and split on commas; its first nSubLists entries are converted
    to floating-point.  The return value is a list of nSubLists 1-D NumPy
    arrays, where array k holds the k-th entry of every input string.
    """
    columns = [ [] for _ in range(nSubLists) ]
    for chunk in textList:
        entries = chunk.strip().strip("{").strip("}").split(",")
        for k, column in enumerate(columns):
            column.append(float(entries[k]))
    return [ np.array(column) for column in columns ]
def InsertAndReplace( theList, ii, newItems ):
    """Given a list, replace the entry at index ii with the elements of
    newItems (also a list).  theList is modified in place; nothing is
    returned.

    ii must refer to an existing entry (0 <= ii < len(theList)); otherwise
    an error message is printed and theList is left unchanged.
    """
    # ii == len(theList) has no entry to replace, so it is rejected as well
    if (ii < 0) or (ii >= len(theList)):
        msg = "\ndatautils.InsertAndReplace: *** ERROR: requested insert location"
        msg += " (index = %d) is < 0 or >= length(theList) [%d]" % (ii, len(theList))
        msg += "\n"
        print(msg)
        return
    # slice assignment swaps the single entry for all the new items at once
    theList[ii:ii + 1] = newItems
def AddExtraColumnNames( columnNames, subListColumns, subListLengths, subListSuffixes ):
    """Given a list of column names, process it to replace column names for those
    columns which have sub-lists.  New column names corresponding to each sub-list
    column are inserted in place of the original name, for each such column.

    columnNames = list of names, one per original column (modified in place)
    subListColumns = indices (into the original columnNames) of the columns
        which hold sub-lists
    subListLengths = number of sub-list elements for each entry of subListColumns
    subListSuffixes = list of suffixes for the new names (newName =
        origName + "_" + suffix), or None to use "0", "1", ...; numerical
        suffixes are also used (with a printed warning) for any column whose
        sub-list length differs from len(subListSuffixes).
    """
    # First work out the replacement names, while the indices in
    # subListColumns still refer to the unmodified list.
    replacements = []
    for colIndex, nSubLists in zip(subListColumns, subListLengths):
        baseName = columnNames[colIndex]
        # generate suffixes:
        if (subListSuffixes is not None) and (len(subListSuffixes) != nSubLists):
            msg = "\tdatautils.AddExtraColumnNames: *** WARNING: number of subListSuffixes"
            msg += " elements (%d) != actual number\n\tof sub-lists (%d)" % (len(subListSuffixes), nSubLists)
            msg += " for column with orig. name = %s...\n" % (baseName)
            msg += "\tNumerical suffixes will be used instead for new column names.\n"
            print(msg)
            suffixes = [ str(k) for k in range(nSubLists) ]
        elif subListSuffixes is not None:
            suffixes = subListSuffixes
        else:
            suffixes = [ str(k) for k in range(nSubLists) ]
        newNames = [ "%s_%s" % (baseName, suffix) for suffix in suffixes ]
        replacements.append((colIndex, newNames))

    # Then substitute by position, right-most column first so that earlier
    # indices stay valid.  (Searching for the name instead would pick the
    # wrong column whenever two columns share a name.)
    for colIndex, newNames in sorted(replacements, key=lambda item: item[0], reverse=True):
        columnNames[colIndex:colIndex + 1] = newNames
def ColumnToFloats( inputList, blankValue ):
    """Takes a list of numbers in string format and converts them to floating-point,
    with blank entries being replaced by blankValue (which should be float).

    Returns a 1-D NumPy array.  "Blank" here means any entry which float()
    cannot convert (empty strings, but also other non-numeric text).
    """
    # Fast path: let NumPy convert the whole column at once.  The dtype is
    # given as np.float64 because the old capitalized type name "Float64"
    # is not accepted by current NumPy versions.
    try:
        return np.array(inputList, np.float64)
    except ValueError:
        # looks like column has some blanks in it
        pass

    # Slow path: convert entry by entry, substituting blankValue as needed
    floatList = []
    for entry in inputList:
        try:
            floatList.append(float(entry))
        except ValueError:
            floatList.append(blankValue)
    return np.array(floatList)
def _ParseCompositeLines( textLines, skip, delimiter, noConvert, intCols, blankVal,
                    convertSubLists, expandSubLists, dataFrame, columnRow,
                    subListSuffixes, extraHeaderChars ):
    """Shared implementation of ReadCompositeTable and ReadCompositeTableFromText:
    turns a list of text lines into a list of columns (or a ListDataFrame).

    extraHeaderChars = characters which, in addition to those in skip, are
    stripped from both ends of the column-header line.
    """
    # work on copies, so that lists supplied by the caller are not modified
    noConvert = [] if noConvert is None else list(noConvert)
    intCols = [] if intCols is None else list(intCols)
    if expandSubLists:
        convertSubLists = True

    dlines = [ line.rstrip() for line in textLines
               if len(line.strip()) > 0 and line[0] not in skip ]
    if not dlines:
        raise IndexError("datautils: no data lines found in input table")

    # if requested, extract column names
    colNames = None
    if (columnRow is not None) and (0 <= columnRow < len(textLines)):
        headerChars = extraHeaderChars + "".join(skip)
        colHeaderLine = textLines[columnRow].strip(headerChars)
        colNames = [ p.strip() for p in colHeaderLine.split(delimiter) ]

    # Figure out number of columns, which ones are non-numeric, and which have
    # sub-lists (if we're allowing for the latter); only the first data line
    # is inspected
    firstPieces = dlines[0].split(delimiter)
    nInputCols = len(firstPieces)
    subListLengths = {}    # column index --> number of elements per sub-list
    for i, piece in enumerate(firstPieces):
        if (i in intCols) or (i in noConvert):
            continue
        try:
            float(piece)
        except ValueError:
            if convertSubLists and ("{" in piece):
                # a-ha, this is a column with sublists, so let's convert it
                subListLengths[i] = len(piece.split(","))
            else:
                noConvert.append(i)
    subListCols = sorted(subListLengths)
    subListLengthList = [ subListLengths[i] for i in subListCols ]

    # Create the master list of input columns, and go through the table,
    # assigning entries to individual-column lists
    rawColumns = [ [] for _ in range(nInputCols) ]
    for dline in dlines:
        pieces = dline.split(delimiter)
        for i in range(nInputCols):
            rawColumns[i].append(pieces[i])

    # Now convert columns to NumPy arrays, if possible.  Columns in noConvert
    # are left untouched (as lists of strings).  A sub-list column becomes
    # either a single column holding a list of NumPy arrays, or (if
    # expandSubLists is set) several new columns replacing the original one.
    dataList = []
    for i, rawColumn in enumerate(rawColumns):
        if i in intCols:
            dataList.append(np.array(rawColumn, np.int64))
        elif i in subListLengths:
            subArrays = ExtractSubLists(rawColumn, subListLengths[i])
            if expandSubLists:
                dataList.extend(subArrays)
            else:
                dataList.append(subArrays)
        elif i not in noConvert:
            # this must, by default, be a floating-point column
            dataList.append(ColumnToFloats(rawColumn, blankVal))
        else:
            dataList.append(rawColumn)

    # OK, if there were sublists *and* we generated extra columns, update
    # the colNames list to account for extra columns
    if expandSubLists and subListCols and (colNames is not None):
        AddExtraColumnNames(colNames, subListCols, subListLengthList, subListSuffixes)

    if dataFrame:
        return ListDataFrame(dataList, colNames)
    else:
        return dataList


def ReadCompositeTable( fileName, skip="#", delimiter=None, noConvert=None,
                    intCols=None, blankVal=0, convertSubLists=False, expandSubLists=False,
                    dataFrame=False, columnRow=None, subListSuffixes=None ):
    """Function which reads a text datafile and returns a list of columns.

    Comments and other lines to skip should start with the "skip" character
    (which by default is "#"); column separators are specified with "delimiter"
    (default is whitespace).  Blank lines are ignored.

    By default, all columns are converted to 1-D NumPy arrays, unless
    the data in that column are non-numeric [only the first row of data is
    checked to see which columns might be non-numeric] or the column number
    [0-based: first column = 0, 2nd column = 1, etc.] is in the noConvert list.
    Non-converted columns are returned as lists of strings.  The noConvert
    and intCols lists passed in are not modified.

    Numeric columns with column number in intCols (list) are converted to int64
    arrays; all other numeric columns become float64 arrays.

    blankVal specifies the default number to use for blank values in numerical
    columns.

    convertSubLists specifies whether embedded sublists (e.g., "{x,y,z}") should be
    recognized and processed; if so, each such column becomes a *list of NumPy arrays*;
    if convertSubLists=False, then each such column is a list of strings.

    If expandSubLists=True, then embedded sublists are converted into extra
    columns (this forces convertSubLists to be True).

    If dataFrame=True, then the result is a ListDataFrame object.

    If columnRow = x, then that line [0-based; first line = 0, etc., counting
    *all* lines in the file] is assumed to contain column headers and is
    processed accordingly (only useful if dataFrame = True as well).  Leading
    and trailing "#" and skip characters are removed from that line.

    In addition, if columnRow != None, then subListSuffixes can be used to
    modify the column names for sublists (newNames[i] = origName + "_" + subListSuffixes[i]);
    if subListSuffixes is None [the default], then renamed column names have
    "_0", "_1", etc. as suffixes.

    Raises IndexError if the file has no data lines, or if a data line has
    fewer columns than the first data line.
    """
    # Text mode already converts Mac or DOS line endings to \n (universal
    # newlines); the old 'rU' mode flag is rejected by newer Python versions.
    # The file is read once and closed immediately.
    with open(fileName, 'r') as inFile:
        allLines = inFile.readlines()
    return _ParseCompositeLines(allLines, skip, delimiter, noConvert, intCols,
                    blankVal, convertSubLists, expandSubLists, dataFrame,
                    columnRow, subListSuffixes, "#")


def ReadCompositeTableFromText( textLines, skip="#", delimiter=None, noConvert=None,
                    intCols=None, blankVal=0, convertSubLists=False, expandSubLists=False,
                    dataFrame=False, columnRow=None, subListSuffixes=None ):
    """Identical to ReadCompositeTable, except that it accepts a list of lines
    (each line a string) instead of a file name.

    As with ReadCompositeTable, column headers are only extracted if
    columnRow is given (columnRow = index into textLines of the header line);
    leading and trailing skip characters are removed from that line.
    """
    return _ParseCompositeLines(textLines, skip, delimiter, noConvert, intCols,
                    blankVal, convertSubLists, expandSubLists, dataFrame,
                    columnRow, subListSuffixes, "")