Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add empty_value parameter to CSV processor #51567

Merged
merged 6 commits into from
Feb 5, 2020
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/reference/ingest/processors/csv.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ Extracts fields from CSV line out of a single text field within a document. Any
| `quote` | no | " | Quote used in CSV, has to be single character string
| `ignore_missing` | no | `true` | If `true` and `field` does not exist, the processor quietly exits without modifying the document
| `trim` | no | `false` | Trim whitespaces in unquoted fields
| `empty_value` | no | - | Value used to fill empty fields; empty fields will be skipped if this is not provided
probakowski marked this conversation as resolved.
Show resolved Hide resolved
include::common-options.asciidoc[]
|======

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ private enum State {
private final char separator;
private final boolean trim;
private final String[] headers;
private final Object emptyValue;
private final IngestDocument ingestDocument;
private final StringBuilder builder = new StringBuilder();
private State state = State.START;
Expand All @@ -45,12 +46,13 @@ private enum State {
private int length;
private int currentIndex;

CsvParser(IngestDocument ingestDocument, char quote, char separator, boolean trim, String[] headers) {
CsvParser(IngestDocument ingestDocument, char quote, char separator, boolean trim, String[] headers, Object emptyValue) {
this.ingestDocument = ingestDocument;
this.quote = quote;
this.separator = separator;
this.trim = trim;
this.headers = headers;
this.emptyValue = emptyValue;
}

void process(String line) {
Expand Down Expand Up @@ -102,7 +104,8 @@ private boolean processStart() {
return false;
} else if (c == separator) {
startIndex++;
if (nextHeader()) {
builder.setLength(0);
if (setField(startIndex)) {
return true;
}
} else if (isWhitespace(c)) {
Expand Down Expand Up @@ -190,16 +193,17 @@ private boolean isWhitespace(char c) {
}

private boolean setField(int endIndex) {
String value;
if (builder.length() == 0) {
ingestDocument.setFieldValue(headers[currentHeader], line.substring(startIndex, endIndex));
value = line.substring(startIndex, endIndex);
} else {
builder.append(line, startIndex, endIndex);
ingestDocument.setFieldValue(headers[currentHeader], builder.toString());
value = builder.append(line, startIndex, endIndex).toString();
}
if (value.length() > 0) {
ingestDocument.setFieldValue(headers[currentHeader], value);
} else if (emptyValue != null) {
ingestDocument.setFieldValue(headers[currentHeader], emptyValue);
}
return nextHeader();
}

/**
 * Moves on to the next header slot.
 *
 * @return {@code true} once the header index has reached the number of configured headers
 */
private boolean nextHeader() {
    currentHeader += 1;
    return headers.length == currentHeader;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,13 +32,14 @@
* A processor that breaks a line from a CSV file into separate fields.
* If more fields are requested than are present in the CSV, the extra fields will not be present in the document after processing.
* In the same way, this processor will skip any field that is empty in the CSV.
*
* <p>
* By default it uses rules according to <a href="https://tools.ietf.org/html/rfc4180">RFC 4180</a> with one exception: whitespaces are
* allowed before or after a quoted field. The processor can be tweaked with the following parameters:
*
* <p>
* quote: set custom quote character (defaults to ")
* separator: set custom separator (defaults to ,)
* trim: trim leading and trailing whitespaces in unquoted fields
* empty_value: sets custom value to use for empty fields (field is skipped if null)
*/
public final class CsvProcessor extends AbstractProcessor {

Expand All @@ -50,15 +51,18 @@ public final class CsvProcessor extends AbstractProcessor {
private final char quote;
private final char separator;
private final boolean ignoreMissing;
private final Object emptyValue;

CsvProcessor(String tag, String field, String[] headers, boolean trim, char separator, char quote, boolean ignoreMissing) {
CsvProcessor(String tag, String field, String[] headers, boolean trim, char separator, char quote, boolean ignoreMissing,
Object emptyValue) {
super(tag);
this.field = field;
this.headers = headers;
this.trim = trim;
this.quote = quote;
this.separator = separator;
this.ignoreMissing = ignoreMissing;
this.emptyValue = emptyValue;
}

@Override
Expand All @@ -73,7 +77,7 @@ public IngestDocument execute(IngestDocument ingestDocument) {
} else if (line == null) {
throw new IllegalArgumentException("field [" + field + "] is null, cannot process it.");
}
new CsvParser(ingestDocument, quote, separator, trim, headers).process(line);
new CsvParser(ingestDocument, quote, separator, trim, headers, emptyValue).process(line);
return ingestDocument;
}

Expand All @@ -96,13 +100,17 @@ public CsvProcessor create(Map<String, org.elasticsearch.ingest.Processor.Factor
throw newConfigurationException(TYPE, processorTag, "separator", "separator has to be single character like , or ;");
}
boolean trim = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "trim", false);
// "empty_value" is optional: when it is absent, emptyValue stays null.
// The presence check must use the same key that is read below; checking "emptyValue"
// meant a configured "empty_value" was never picked up.
Object emptyValue = null;
if (config.containsKey("empty_value")) {
    emptyValue = ConfigurationUtils.readObject(TYPE, processorTag, config, "empty_value");
}
boolean ignoreMissing = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "ignore_missing", false);
List<String> targetFields = ConfigurationUtils.readList(TYPE, processorTag, config, "target_fields");
if (targetFields.isEmpty()) {
throw newConfigurationException(TYPE, processorTag, "target_fields", "target fields list can't be empty");
}
return new CsvProcessor(processorTag, field, targetFields.toArray(String[]::new), trim, separator.charAt(0), quote.charAt(0),
ignoreMissing);
ignoreMissing, emptyValue);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ public void setup() {
separator = randomFrom(SEPARATORS);
}

public void testExactNumberOfFields() throws Exception {
public void testExactNumberOfFields() {
int numItems = randomIntBetween(2, 10);
Map<String, String> items = new LinkedHashMap<>();
for (int i = 0; i < numItems; i++) {
Expand All @@ -67,7 +67,67 @@ public void testExactNumberOfFields() throws Exception {
items.forEach((key, value) -> assertEquals(value, ingestDocument.getFieldValue(key, String.class)));
}

public void testLessFieldsThanHeaders() throws Exception {
/**
 * Builds a CSV line in which the fourth column is empty (each value wrapped in the configured
 * quote string) and checks that, with no replacement value supplied, the empty column is absent
 * from the document while every other column is extracted unchanged.
 */
public void testEmptyValues() {
    final int total = randomIntBetween(5, 10);
    final Map<String, String> expected = new LinkedHashMap<>();
    // three populated columns before the empty one
    for (int filled = 0; filled < 3; filled++) {
        expected.put(randomAlphaOfLengthBetween(5, 10), randomAlphaOfLengthBetween(5, 10));
    }
    final String blankHeader = randomAlphaOfLengthBetween(5, 10);
    expected.put(blankHeader, "");
    // remaining populated columns after the empty one
    for (int filled = 4; filled < total; filled++) {
        expected.put(randomAlphaOfLengthBetween(5, 10), randomAlphaOfLengthBetween(5, 10));
    }
    final String[] headers = expected.keySet().toArray(new String[total]);
    final String csv = expected.values()
        .stream()
        .map(column -> quote + column + quote)
        .collect(Collectors.joining(String.valueOf(separator)));

    final IngestDocument ingestDocument = processDocument(headers, csv);

    for (Map.Entry<String, String> entry : expected.entrySet()) {
        final String header = entry.getKey();
        if (blankHeader.equals(header)) {
            assertFalse(ingestDocument.hasField(header));
        } else {
            assertEquals(entry.getValue(), ingestDocument.getFieldValue(header, String.class));
        }
    }
}

/**
 * Builds a CSV line in which the fourth column is empty and checks that the supplied replacement
 * value is stored for that column: first with an empty string, then with the integer {@code 0}.
 * All other columns must be extracted unchanged in both runs.
 */
public void testEmptyValuesReplace() {
    final int total = randomIntBetween(5, 10);
    final Map<String, String> expected = new LinkedHashMap<>();
    // three populated columns before the empty one
    for (int filled = 0; filled < 3; filled++) {
        expected.put(randomAlphaOfLengthBetween(5, 10), randomAlphaOfLengthBetween(5, 10));
    }
    final String blankHeader = randomAlphaOfLengthBetween(5, 10);
    expected.put(blankHeader, "");
    // remaining populated columns after the empty one
    for (int filled = 4; filled < total; filled++) {
        expected.put(randomAlphaOfLengthBetween(5, 10), randomAlphaOfLengthBetween(5, 10));
    }
    final String[] headers = expected.keySet().toArray(new String[total]);
    final String csv = expected.values()
        .stream()
        .map(column -> quote + column + quote)
        .collect(Collectors.joining(String.valueOf(separator)));

    // replacement value is an empty string
    final IngestDocument withEmptyString = processDocument(headers, csv, true, "");
    for (Map.Entry<String, String> entry : expected.entrySet()) {
        final String header = entry.getKey();
        final String wanted = blankHeader.equals(header) ? "" : entry.getValue();
        assertEquals(wanted, withEmptyString.getFieldValue(header, String.class));
    }

    // replacement value is a non-string object
    final IngestDocument withZero = processDocument(headers, csv, true, 0);
    for (Map.Entry<String, String> entry : expected.entrySet()) {
        final String header = entry.getKey();
        if (blankHeader.equals(header)) {
            assertEquals(0, (int) withZero.getFieldValue(header, Integer.class));
        } else {
            assertEquals(entry.getValue(), withZero.getFieldValue(header, String.class));
        }
    }
}

public void testLessFieldsThanHeaders() {
int numItems = randomIntBetween(4, 10);
Map<String, String> items = new LinkedHashMap<>();
for (int i = 0; i < numItems; i++) {
Expand All @@ -82,7 +142,7 @@ public void testLessFieldsThanHeaders() throws Exception {
items.entrySet().stream().limit(3).forEach(e -> assertEquals(e.getValue(), ingestDocument.getFieldValue(e.getKey(), String.class)));
}

public void testLessHeadersThanFields() throws Exception {
public void testLessHeadersThanFields() {
int numItems = randomIntBetween(5, 10);
Map<String, String> items = new LinkedHashMap<>();
for (int i = 0; i < numItems; i++) {
Expand All @@ -96,7 +156,7 @@ public void testLessHeadersThanFields() throws Exception {
items.entrySet().stream().limit(3).forEach(e -> assertEquals(e.getValue(), ingestDocument.getFieldValue(e.getKey(), String.class)));
}

public void testSingleField() throws Exception {
public void testSingleField() {
String[] headers = new String[]{randomAlphaOfLengthBetween(5, 10)};
String value = randomAlphaOfLengthBetween(5, 10);
String csv = quote + value + quote;
Expand All @@ -106,7 +166,7 @@ public void testSingleField() throws Exception {
assertEquals(value, ingestDocument.getFieldValue(headers[0], String.class));
}

public void testEscapedQuote() throws Exception {
public void testEscapedQuote() {
int numItems = randomIntBetween(2, 10);
Map<String, String> items = new LinkedHashMap<>();
for (int i = 0; i < numItems; i++) {
Expand All @@ -121,7 +181,7 @@ public void testEscapedQuote() throws Exception {
items.forEach((key, value) -> assertEquals(value.replace(quote + quote, quote), ingestDocument.getFieldValue(key, String.class)));
}

public void testQuotedStrings() throws Exception {
public void testQuotedStrings() {
assumeFalse("quote needed", quote.isEmpty());
int numItems = randomIntBetween(2, 10);
Map<String, String> items = new LinkedHashMap<>();
Expand All @@ -138,7 +198,7 @@ public void testQuotedStrings() throws Exception {
String.class)));
}

public void testEmptyFields() throws Exception {
public void testEmptyFields() {
int numItems = randomIntBetween(5, 10);
Map<String, String> items = new LinkedHashMap<>();
for (int i = 0; i < numItems; i++) {
Expand All @@ -158,7 +218,7 @@ public void testEmptyFields() throws Exception {
items.keySet().stream().skip(numItems - 1).forEach(key -> assertFalse(ingestDocument.hasField(key)));
}

public void testWrongStings() throws Exception {
public void testWrongStings() {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

typo: Sting -> String

(But it was there before and you've got a green build so if no other changes are required then no need to trigger a rebuild just for this.)

assumeTrue("single run only", quote.isEmpty());
expectThrows(IllegalArgumentException.class, () -> processDocument(new String[]{"a"}, "abc\"abc"));
expectThrows(IllegalArgumentException.class, () -> processDocument(new String[]{"a"}, "\"abc\"asd"));
Expand All @@ -167,7 +227,7 @@ public void testWrongStings() throws Exception {
expectThrows(IllegalArgumentException.class, () -> processDocument(new String[]{"a"}, "abc\rabc"));
}

public void testQuotedWhitespaces() throws Exception {
public void testQuotedWhitespaces() {
assumeFalse("quote needed", quote.isEmpty());
IngestDocument document = processDocument(new String[]{"a", "b", "c", "d"},
" abc " + separator + " def" + separator + "ghi " + separator + " " + quote + " ooo " + quote);
Expand All @@ -177,7 +237,7 @@ public void testQuotedWhitespaces() throws Exception {
assertEquals(" ooo ", document.getFieldValue("d", String.class));
}

public void testUntrimmed() throws Exception {
public void testUntrimmed() {
assumeFalse("quote needed", quote.isEmpty());
IngestDocument document = processDocument(new String[]{"a", "b", "c", "d", "e", "f"},
" abc " + separator + " def" + separator + "ghi " + separator + " "
Expand All @@ -190,30 +250,35 @@ public void testUntrimmed() throws Exception {
assertFalse(document.hasField("f"));
}

public void testEmptyHeaders() throws Exception {
public void testEmptyHeaders() {
assumeTrue("single run only", quote.isEmpty());
IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random());
String fieldName = RandomDocumentPicks.addRandomField(random(), ingestDocument, "abc,abc");
HashMap<String, Object> metadata = new HashMap<>(ingestDocument.getSourceAndMetadata());

CsvProcessor processor = new CsvProcessor(randomAlphaOfLength(5), fieldName, new String[0], false, ',', '"', false);
CsvProcessor processor = new CsvProcessor(randomAlphaOfLength(5), fieldName, new String[0], false, ',', '"', false, null);

processor.execute(ingestDocument);

assertEquals(metadata, ingestDocument.getSourceAndMetadata());
}

private IngestDocument processDocument(String[] headers, String csv) throws Exception {
/** Processes {@code csv} against {@code headers} with trimming switched on. */
private IngestDocument processDocument(String[] headers, String csv) {
    final boolean trim = true;
    return processDocument(headers, csv, trim);
}

private IngestDocument processDocument(String[] headers, String csv, boolean trim) throws Exception {
/** Processes {@code csv} against {@code headers} without a replacement value for empty fields. */
private IngestDocument processDocument(String[] headers, String csv, boolean trim) {
    final Object emptyValue = null;
    return processDocument(headers, csv, trim, emptyValue);
}

private IngestDocument processDocument(String[] headers, String csv, boolean trim, Object emptyValue) {
IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random());
Arrays.stream(headers).filter(ingestDocument::hasField).forEach(ingestDocument::removeField);

String fieldName = RandomDocumentPicks.addRandomField(random(), ingestDocument, csv);
char quoteChar = quote.isEmpty() ? '"' : quote.charAt(0);
CsvProcessor processor = new CsvProcessor(randomAlphaOfLength(5), fieldName, headers, trim, separator, quoteChar, false);
CsvProcessor processor = new CsvProcessor(randomAlphaOfLength(5), fieldName, headers, trim, separator, quoteChar, false,
emptyValue);

processor.execute(ingestDocument);

Expand Down