-
Notifications
You must be signed in to change notification settings - Fork 901
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[REVIEW] Fix N/A detection for empty fields in CSV reader #6922
Changes from all commits
a79f9d5
461a0a3
910f0fd
d48ffe4
9a6b5d1
5e61aed
00fccbc
46db10e
65e156f
81b943d
3a03db6
944384c
e364f1a
d2b93b8
cfa64bb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -224,7 +224,7 @@ __global__ void __launch_bounds__(csvparse_block_dim) | |
long tempPos = pos - 1; | ||
long field_len = pos - start; | ||
|
||
if (field_len <= 0 || serialized_trie_contains(opts.trie_na, raw_csv + start, field_len)) { | ||
if (field_len < 0 || serialized_trie_contains(opts.trie_na, raw_csv + start, field_len)) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I guess There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Might want to add a comment to this so it's not thought to be a mistake. Or use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Took another look and it seems like |
||
atomicAdd(&d_columnData[actual_col].null_count, 1); | ||
} else if (serialized_trie_contains(opts.trie_true, raw_csv + start, field_len) || | ||
serialized_trie_contains(opts.trie_false, raw_csv + start, field_len)) { | ||
|
@@ -273,10 +273,7 @@ __global__ void __launch_bounds__(csvparse_block_dim) | |
--int_req_number_cnt; | ||
} | ||
|
||
if (field_len == 0) { | ||
// Ignoring whitespace and quotes can result in empty fields | ||
atomicAdd(&d_columnData[actual_col].null_count, 1); | ||
} else if (column_flags[col] & column_parse::as_datetime) { | ||
if (column_flags[col] & column_parse::as_datetime) { | ||
// PANDAS uses `object` dtype if the date is unparseable | ||
if (is_datetime(countString, countDecimal, countColon, countDash, countSlash)) { | ||
atomicAdd(&d_columnData[actual_col].datetime_count, 1); | ||
|
@@ -592,16 +589,15 @@ __global__ void __launch_bounds__(csvparse_block_dim) | |
|
||
if (column_flags[col] & column_parse::enabled) { | ||
// check if the entire field is a NaN string - consistent with pandas | ||
const bool is_na = serialized_trie_contains(options.trie_na, raw_csv + start, pos - start); | ||
auto const is_valid = | ||
!serialized_trie_contains(options.trie_na, raw_csv + start, pos - start); | ||
|
||
// Modify start & end to ignore whitespace and quotechars | ||
long tempPos = pos - 1; | ||
if (!is_na && dtypes[actual_col].id() != cudf::type_id::STRING) { | ||
if (is_valid && dtypes[actual_col].id() != cudf::type_id::STRING) { | ||
trim_field_start_end(raw_csv, &start, &tempPos, options.quotechar); | ||
} | ||
|
||
if (!is_na && start <= (tempPos)) { // Empty fields are not legal values | ||
|
||
if (is_valid) { | ||
// Type dispatcher does not handle STRING | ||
if (dtypes[actual_col].id() == cudf::type_id::STRING) { | ||
long end = pos; | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Does this mean all tries match empty input? Difficult to follow the logic how this coordinates with the code below.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not always. Here we create a node that will have
is_end_of_word
= true if the root node of thetree_trie
is also end of a word (empty word, since it's the root). Hence, passing tree_trie.is_end_of_word as the second parameter.I'll expand the comment above to make this clearer. 👍