Skip to content

Commit

Permalink
Per #1019, update DataLine and LineDataFile classes to support parsin…
Browse files Browse the repository at this point in the history
…g .csv files. Get rid of the unneeded Offsets vector. Add AllowEmptyColumns option to the DataLine class so that multiple delimiters in a row will be treated as separate columns. Since the default delim is whitespace, it makes sense that you'd want to parse multiple delims in a group. But for .csv files, each comma indicates a new column.
  • Loading branch information
JohnHalleyGotway committed Jan 10, 2025
1 parent a9530a1 commit 64a6f23
Show file tree
Hide file tree
Showing 2 changed files with 76 additions and 24 deletions.
86 changes: 64 additions & 22 deletions src/basic/vx_util/data_line.cc
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
////////////////////////////////////////////////////////////////////////


#include <bits/stdc++.h>
#include <iostream>
#include <fstream>
#include <sys/types.h>
Expand Down Expand Up @@ -115,6 +116,8 @@ Delimiter.assign(dataline_default_delim);

File = (LineDataFile *) nullptr;

AllowEmptyColumns = false;

IsHeader = false;

return;
Expand All @@ -131,7 +134,6 @@ void DataLine::clear()

Line.clear();
Items.clear();
Offset.clear();

LineNumber = 0;

Expand All @@ -157,8 +159,6 @@ clear();
Line = a.Line;
Items = a.Items;

Offset = a.Offset;

N_items = a.N_items;

LineNumber = a.LineNumber;
Expand Down Expand Up @@ -202,20 +202,6 @@ for (j=0; j<N_items; ++j) {

out.flush();

}

out << prefix << "\n";

for (j=0; j<N_items; ++j) {

snprintf(junk, sizeof(junk), "Offset[%2d] = ", j);

out << prefix << junk << Offset[j] << '\n';

if ( (j%5) == 4 ) out << prefix << '\n';

out.flush();

}

//
Expand Down Expand Up @@ -355,14 +341,13 @@ count = 0;
if ( ! read_single_text_line(ldf) ) { clear(); return 0; }

//
// parse the line with strtok
// parse the line
//

size_t len, tpos = std::string::npos;

if (0 == Line.find_first_not_of(Delimiter)) { // no leading delimiter
++count;
Offset.push_back(pos);
Items.push_back(Line.substr(pos, Line.find_first_of(Delimiter, pos) - pos));
}
while ((tpos = Line.substr(pos).find_first_of(Delimiter)) != std::string::npos) {
Expand All @@ -372,7 +357,6 @@ while ((tpos = Line.substr(pos).find_first_of(Delimiter)) != std::string::npos)
pos += tpos + len;

++count;
Offset.push_back(pos);
Items.push_back(Line.substr(pos, Line.find_first_of(Delimiter, pos) - pos));
}

Expand All @@ -389,6 +373,64 @@ return 1;
////////////////////////////////////////////////////////////////////////


//
// Read one text line from ldf and split it into Items, keeping empty columns.
//
// Every single character found in Delimiter ends a column, so consecutive,
// leading and trailing delimiters each produce an empty item.  This is the
// same character-set interpretation of Delimiter that read_line() applies
// through find_first_of().
//
// Returns 0 (with this object cleared) if no line could be read, 1 otherwise.
// On success N_items is the number of columns stored and LineNumber is
// ldf->last_line_number() + 1.
//

int DataLine::read_line_empty_columns(LineDataFile * ldf)

{

clear();

//
// get a line from the file
//

if ( ! read_single_text_line(ldf) ) { clear(); return 0; }

//
// parse the line
//
// Delimiter is deliberately NOT compiled as a regular expression:
// characters such as '|', '.', '(' or '[' are plausible delimiters but are
// regex metacharacters and would either match the wrong text or throw.
//

std::string::size_type start = 0;
std::string::size_type stop  = std::string::npos;

while ( (stop = Line.find_first_of(Delimiter, start)) != std::string::npos ) {

//
// store the (possibly empty) column that ends at this delimiter
//

Items.push_back(Line.substr(start, stop - start));

start = stop + 1;

}

//
// store whatever follows the last delimiter; this is the empty string
// when the line ends with a delimiter or when the line itself is empty,
// so no separate (and unsafe on empty input) Line.back() test is needed
//

Items.push_back(Line.substr(start));

N_items = Items.size();

LineNumber = ldf->last_line_number() + 1;


return 1;

}


////////////////////////////////////////////////////////////////////////


int DataLine::read_fwf_line(LineDataFile * ldf, const int *wdth, int n_wdth)

{
Expand Down Expand Up @@ -439,7 +481,6 @@ for( i=0; i<n_wdth; i++ ) {
// store the offset to this entry
//
start = pos;
Offset.push_back(pos);

//
// store this entry
Expand Down Expand Up @@ -822,7 +863,8 @@ int status;

do {

status = a.read_line(this);
if ( a.allow_empty_columns() ) status = a.read_line_empty_columns(this);
else status = a.read_line(this);

if ( !status ) return 0;

Expand Down
14 changes: 12 additions & 2 deletions src/basic/vx_util/data_line.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,6 @@ class DataLine {

int N_chars;

std::vector<int> Offset;

int N_ints;

int LineNumber;
Expand All @@ -68,6 +66,8 @@ class DataLine {

LineDataFile * File; // not allocated

bool AllowEmptyColumns;

bool IsHeader;


Expand Down Expand Up @@ -130,10 +130,16 @@ class DataLine {

virtual int read_line(LineDataFile *);

virtual int read_line_empty_columns(LineDataFile *);

virtual int read_fwf_line(LineDataFile *, const int *wdth, int n_wdth);

virtual bool is_ok() const;

bool allow_empty_columns() const;

void set_allow_empty_columns(bool = true);

virtual bool is_header() const;

virtual void set_is_header(bool = true);
Expand All @@ -154,6 +160,10 @@ inline const char * DataLine::get_line () const { return Line.c_str(); }

// Returns the current delimiter as a C string owned by the Delimiter member.
inline const char * DataLine::get_delimiter() const
{
return Delimiter.c_str();
}

// Returns the AllowEmptyColumns flag.
inline bool DataLine::allow_empty_columns() const
{
return AllowEmptyColumns;
}

// Sets the AllowEmptyColumns flag to tf.
// (The parameter avoids double underscores, which are reserved identifiers in C++.)
inline void DataLine::set_allow_empty_columns(bool tf) { AllowEmptyColumns = tf; }

// Sets the IsHeader flag to tf.
// (The parameter avoids double underscores, which are reserved identifiers in C++.)
inline void DataLine::set_is_header(bool tf) { IsHeader = tf; }


Expand Down

0 comments on commit 64a6f23

Please sign in to comment.