From 55efe93b237b2d8fd84e5e1f44c64b43e4e4e577 Mon Sep 17 00:00:00 2001 From: Przemko Robakowski Date: Tue, 28 Jan 2020 21:37:52 +0100 Subject: [PATCH 1/3] Add empty_value parameter to CSV processor This change adds `empty_value` parameter to the CSV processor. This value is used to fill empty fields. Fields will be skipped if this parameter is ommited. This behavior is the same for both quoted and unquoted fields. --- docs/reference/ingest/processors/csv.asciidoc | 1 + .../ingest/common/CsvParser.java | 22 +++-- .../ingest/common/CsvProcessor.java | 18 +++- .../ingest/common/CsvProcessorTests.java | 95 ++++++++++++++++--- 4 files changed, 107 insertions(+), 29 deletions(-) diff --git a/docs/reference/ingest/processors/csv.asciidoc b/docs/reference/ingest/processors/csv.asciidoc index c589c9eb4361c..72bc7648dcc94 100644 --- a/docs/reference/ingest/processors/csv.asciidoc +++ b/docs/reference/ingest/processors/csv.asciidoc @@ -13,6 +13,7 @@ Extracts fields from CSV line out of a single text field within a document. Any | `quote` | no | " | Quote used in CSV, has to be single character string | `ignore_missing` | no | `true` | If `true` and `field` does not exist, the processor quietly exits without modifying the document | `trim` | no | `false` | Trim whitespaces in unquoted fields +| `empty_value` | no | - | Value used to fill empty fields, empty fields will be skipped if this is not provided include::common-options.asciidoc[] |====== diff --git a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/CsvParser.java b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/CsvParser.java index 077d12684e9a1..0997ecbb2a737 100644 --- a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/CsvParser.java +++ b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/CsvParser.java @@ -36,6 +36,7 @@ private enum State { private final char separator; private final boolean trim; private final String[] headers; + private final Object emptyValue; private final IngestDocument ingestDocument; private final StringBuilder builder = new StringBuilder(); private State state = State.START; @@ -45,12 +46,13 @@ private enum State { private int length; private int currentIndex; - CsvParser(IngestDocument ingestDocument, char quote, char separator, boolean trim, String[] headers) { + CsvParser(IngestDocument ingestDocument, char quote, char separator, boolean trim, String[] headers, Object emptyValue) { this.ingestDocument = ingestDocument; this.quote = quote; this.separator = separator; this.trim = trim; this.headers = headers; + this.emptyValue = emptyValue; } void process(String line) { @@ -102,7 +104,8 @@ private boolean processStart() { return false; } else if (c == separator) { startIndex++; - if (nextHeader()) { + builder.setLength(0); + if (setField(startIndex)) { return true; } } else if (isWhitespace(c)) { @@ -190,16 +193,17 @@ private boolean isWhitespace(char c) { } private boolean setField(int endIndex) { + String value; if (builder.length() == 0) { - ingestDocument.setFieldValue(headers[currentHeader], line.substring(startIndex, endIndex)); + value = line.substring(startIndex, endIndex); } else { - builder.append(line, startIndex, endIndex); - ingestDocument.setFieldValue(headers[currentHeader], builder.toString()); + value = builder.append(line, startIndex, endIndex).toString(); + } + if (value.length() > 0) { + ingestDocument.setFieldValue(headers[currentHeader], value); + } else if (emptyValue != null) { + ingestDocument.setFieldValue(headers[currentHeader], emptyValue); } - return nextHeader(); - } - - private boolean nextHeader() { currentHeader++; return currentHeader == headers.length; } diff --git a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/CsvProcessor.java b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/CsvProcessor.java index 66d10cc239e46..9b17b430ac56a 100644 --- a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/CsvProcessor.java +++ b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/CsvProcessor.java @@ -32,13 +32,14 @@ * A processor that breaks line from CSV file into separate fields. * If there's more fields requested than there is in the CSV, extra field will not be present in the document after processing. * In the same way this processor will skip any field that is empty in CSV. - * + *

* By default it uses rules according to RCF 4180 with one exception: whitespaces are * allowed before or after quoted field. Processor can be tweaked with following parameters: - * + *

* quote: set custom quote character (defaults to ") * separator: set custom separator (defaults to ,) * trim: trim leading and trailing whitespaces in unquoted fields + * empty_value: sets custom value to use for empty fields (field is skipped if null) */ public final class CsvProcessor extends AbstractProcessor { @@ -50,8 +51,10 @@ public final class CsvProcessor extends AbstractProcessor { private final char quote; private final char separator; private final boolean ignoreMissing; + private final Object emptyValue; - CsvProcessor(String tag, String field, String[] headers, boolean trim, char separator, char quote, boolean ignoreMissing) { + CsvProcessor(String tag, String field, String[] headers, boolean trim, char separator, char quote, boolean ignoreMissing, + Object emptyValue) { super(tag); this.field = field; this.headers = headers; @@ -59,6 +62,7 @@ public final class CsvProcessor extends AbstractProcessor { this.quote = quote; this.separator = separator; this.ignoreMissing = ignoreMissing; + this.emptyValue = emptyValue; } @Override @@ -73,7 +77,7 @@ public IngestDocument execute(IngestDocument ingestDocument) { } else if (line == null) { throw new IllegalArgumentException("field [" + field + "] is null, cannot process it."); } - new CsvParser(ingestDocument, quote, separator, trim, headers).process(line); + new CsvParser(ingestDocument, quote, separator, trim, headers, emptyValue).process(line); return ingestDocument; } @@ -96,13 +100,17 @@ public CsvProcessor create(Map targetFields = ConfigurationUtils.readList(TYPE, processorTag, config, "target_fields"); if (targetFields.isEmpty()) { throw newConfigurationException(TYPE, processorTag, "target_fields", "target fields list can't be empty"); } return new CsvProcessor(processorTag, field, targetFields.toArray(String[]::new), trim, separator.charAt(0), quote.charAt(0), - ignoreMissing); + ignoreMissing, emptyValue); } } } diff --git a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/CsvProcessorTests.java b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/CsvProcessorTests.java index 45f00a7435033..3328fa37c725c 100644 --- a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/CsvProcessorTests.java +++ b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/CsvProcessorTests.java @@ -53,7 +53,7 @@ public void setup() { separator = randomFrom(SEPARATORS); } - public void testExactNumberOfFields() throws Exception { + public void testExactNumberOfFields() { int numItems = randomIntBetween(2, 10); Map items = new LinkedHashMap<>(); for (int i = 0; i < numItems; i++) { @@ -67,7 +67,67 @@ public void testExactNumberOfFields() throws Exception { items.forEach((key, value) -> assertEquals(value, ingestDocument.getFieldValue(key, String.class))); } - public void testLessFieldsThanHeaders() throws Exception { + public void testEmptyValues() { + int numItems = randomIntBetween(5, 10); + Map items = new LinkedHashMap<>(); + for (int i = 0; i < 3; i++) { + items.put(randomAlphaOfLengthBetween(5, 10), randomAlphaOfLengthBetween(5, 10)); + } + String emptyKey = randomAlphaOfLengthBetween(5, 10); + items.put(emptyKey, ""); + for (int i = 0; i < numItems - 4; i++) { + items.put(randomAlphaOfLengthBetween(5, 10), randomAlphaOfLengthBetween(5, 10)); + } + String[] headers = items.keySet().toArray(new String[numItems]); + String csv = items.values().stream().map(v -> quote + v + quote).collect(Collectors.joining(separator + "")); + + IngestDocument ingestDocument = processDocument(headers, csv); + + items.forEach((key, value) -> { + if (emptyKey.equals(key)) { + assertFalse(ingestDocument.hasField(key)); + } else { + assertEquals(value, ingestDocument.getFieldValue(key, String.class)); + } + }); + } + + public void testEmptyValuesReplace() { + int numItems = randomIntBetween(5, 10); + Map items = new LinkedHashMap<>(); + for (int i = 0; i < 3; i++) { + items.put(randomAlphaOfLengthBetween(5, 10), randomAlphaOfLengthBetween(5, 10)); + } + String emptyKey = randomAlphaOfLengthBetween(5, 10); + items.put(emptyKey, ""); + for (int i = 0; i < numItems - 4; i++) { + items.put(randomAlphaOfLengthBetween(5, 10), randomAlphaOfLengthBetween(5, 10)); + } + String[] headers = items.keySet().toArray(new String[numItems]); + String csv = items.values().stream().map(v -> quote + v + quote).collect(Collectors.joining(separator + "")); + + IngestDocument ingestDocument = processDocument(headers, csv, true, ""); + + items.forEach((key, value) -> { + if (emptyKey.equals(key)) { + assertEquals("", ingestDocument.getFieldValue(key, String.class)); + } else { + assertEquals(value, ingestDocument.getFieldValue(key, String.class)); + } + }); + + IngestDocument ingestDocument2 = processDocument(headers, csv, true, 0); + + items.forEach((key, value) -> { + if (emptyKey.equals(key)) { + assertEquals(0, (int) ingestDocument2.getFieldValue(key, Integer.class)); + } else { + assertEquals(value, ingestDocument2.getFieldValue(key, String.class)); + } + }); + } + + public void testLessFieldsThanHeaders() { int numItems = randomIntBetween(4, 10); Map items = new LinkedHashMap<>(); for (int i = 0; i < numItems; i++) { @@ -82,7 +142,7 @@ public void testLessFieldsThanHeaders() throws Exception { items.entrySet().stream().limit(3).forEach(e -> assertEquals(e.getValue(), ingestDocument.getFieldValue(e.getKey(), String.class))); } - public void testLessHeadersThanFields() throws Exception { + public void testLessHeadersThanFields() { int numItems = randomIntBetween(5, 10); Map items = new LinkedHashMap<>(); for (int i = 0; i < numItems; i++) { @@ -96,7 +156,7 @@ public void testLessHeadersThanFields() throws Exception { items.entrySet().stream().limit(3).forEach(e -> assertEquals(e.getValue(), ingestDocument.getFieldValue(e.getKey(), String.class))); } - public void testSingleField() throws Exception { + public void testSingleField() { String[] headers = new String[]{randomAlphaOfLengthBetween(5, 10)}; String value = randomAlphaOfLengthBetween(5, 10); String csv = quote + value + quote; @@ -106,7 +166,7 @@ public void testSingleField() throws Exception { assertEquals(value, ingestDocument.getFieldValue(headers[0], String.class)); } - public void testEscapedQuote() throws Exception { + public void testEscapedQuote() { int numItems = randomIntBetween(2, 10); Map items = new LinkedHashMap<>(); for (int i = 0; i < numItems; i++) { @@ -121,7 +181,7 @@ public void testEscapedQuote() throws Exception { items.forEach((key, value) -> assertEquals(value.replace(quote + quote, quote), ingestDocument.getFieldValue(key, String.class))); } - public void testQuotedStrings() throws Exception { + public void testQuotedStrings() { assumeFalse("quote needed", quote.isEmpty()); int numItems = randomIntBetween(2, 10); Map items = new LinkedHashMap<>(); @@ -138,7 +198,7 @@ public void testQuotedStrings() throws Exception { String.class))); } - public void testEmptyFields() throws Exception { + public void testEmptyFields() { int numItems = randomIntBetween(5, 10); Map items = new LinkedHashMap<>(); for (int i = 0; i < numItems; i++) { @@ -158,7 +218,7 @@ public void testEmptyFields() throws Exception { items.keySet().stream().skip(numItems - 1).forEach(key -> assertFalse(ingestDocument.hasField(key))); } - public void testWrongStings() throws Exception { + public void testWrongStings() { assumeTrue("single run only", quote.isEmpty()); expectThrows(IllegalArgumentException.class, () -> processDocument(new String[]{"a"}, "abc\"abc")); expectThrows(IllegalArgumentException.class, () -> processDocument(new String[]{"a"}, "\"abc\"asd")); @@ -167,7 +227,7 @@ public void testWrongStings() throws Exception { expectThrows(IllegalArgumentException.class, () -> processDocument(new String[]{"a"}, "abc\rabc")); } - public void testQuotedWhitespaces() throws Exception { + public void testQuotedWhitespaces() { assumeFalse("quote needed", quote.isEmpty()); IngestDocument document = processDocument(new String[]{"a", "b", "c", "d"}, " abc " + separator + " def" + separator + "ghi " + separator + " " + quote + " ooo " + quote); @@ -177,7 +237,7 @@ public void testQuotedWhitespaces() throws Exception { assertEquals(" ooo ", document.getFieldValue("d", String.class)); } - public void testUntrimmed() throws Exception { + public void testUntrimmed() { assumeFalse("quote needed", quote.isEmpty()); IngestDocument document = processDocument(new String[]{"a", "b", "c", "d", "e", "f"}, " abc " + separator + " def" + separator + "ghi " + separator + " " @@ -190,30 +250,35 @@ public void testUntrimmed() throws Exception { assertFalse(document.hasField("f")); } - public void testEmptyHeaders() throws Exception { + public void testEmptyHeaders() { assumeTrue("single run only", quote.isEmpty()); IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random()); String fieldName = RandomDocumentPicks.addRandomField(random(), ingestDocument, "abc,abc"); HashMap metadata = new HashMap<>(ingestDocument.getSourceAndMetadata()); - CsvProcessor processor = new CsvProcessor(randomAlphaOfLength(5), fieldName, new String[0], false, ',', '"', false); + CsvProcessor processor = new CsvProcessor(randomAlphaOfLength(5), fieldName, new String[0], false, ',', '"', false, null); processor.execute(ingestDocument); assertEquals(metadata, ingestDocument.getSourceAndMetadata()); } - private IngestDocument processDocument(String[] headers, String csv) throws Exception { + private IngestDocument processDocument(String[] headers, String csv) { return processDocument(headers, csv, true); } - private IngestDocument processDocument(String[] headers, String csv, boolean trim) throws Exception { + private IngestDocument processDocument(String[] headers, String csv, boolean trim) { + return processDocument(headers, csv, trim, null); + } + + private IngestDocument processDocument(String[] headers, String csv, boolean trim, Object emptyValue) { IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random()); Arrays.stream(headers).filter(ingestDocument::hasField).forEach(ingestDocument::removeField); String fieldName = RandomDocumentPicks.addRandomField(random(), ingestDocument, csv); char quoteChar = quote.isEmpty() ? '"' : quote.charAt(0); - CsvProcessor processor = new CsvProcessor(randomAlphaOfLength(5), fieldName, headers, trim, separator, quoteChar, false); + CsvProcessor processor = new CsvProcessor(randomAlphaOfLength(5), fieldName, headers, trim, separator, quoteChar, false, + emptyValue); processor.execute(ingestDocument); From 7c31654847023e2d9084550e961499226348f167 Mon Sep 17 00:00:00 2001 From: Przemko Robakowski Date: Wed, 5 Feb 2020 00:04:59 +0100 Subject: [PATCH 2/3] docs updated --- docs/reference/ingest/processors/csv.asciidoc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/reference/ingest/processors/csv.asciidoc b/docs/reference/ingest/processors/csv.asciidoc index 72bc7648dcc94..1ac3f4de9304d 100644 --- a/docs/reference/ingest/processors/csv.asciidoc +++ b/docs/reference/ingest/processors/csv.asciidoc @@ -13,7 +13,8 @@ Extracts fields from CSV line out of a single text field within a document. Any | `quote` | no | " | Quote used in CSV, has to be single character string | `ignore_missing` | no | `true` | If `true` and `field` does not exist, the processor quietly exits without modifying the document | `trim` | no | `false` | Trim whitespaces in unquoted fields -| `empty_value` | no | - | Value used to fill empty fields, empty fields will be skipped if this is not provided +| `empty_value` | no | - | Value used to fill empty fields, empty fields will be skipped if this is not provided. + Empty field is one with no value (2 consecutive separators) or empty quotes (`""`) include::common-options.asciidoc[] |====== From aaa5a7a394ebd27baba7db05750bf4a72ee52815 Mon Sep 17 00:00:00 2001 From: Przemko Robakowski Date: Wed, 5 Feb 2020 21:07:28 +0100 Subject: [PATCH 3/3] Fix compilation problem --- .../org/elasticsearch/ingest/common/CsvProcessorTests.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/CsvProcessorTests.java b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/CsvProcessorTests.java index 01d4414ca7bbe..bfde04555e6ab 100644 --- a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/CsvProcessorTests.java +++ b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/CsvProcessorTests.java @@ -257,9 +257,9 @@ public void testIgnoreMissing() { if (ingestDocument.hasField(fieldName)) { ingestDocument.removeField(fieldName); } - CsvProcessor processor = new CsvProcessor(randomAlphaOfLength(5), fieldName, new String[]{"a"}, false, ',', '"', true); + CsvProcessor processor = new CsvProcessor(randomAlphaOfLength(5), fieldName, new String[]{"a"}, false, ',', '"', true, null); processor.execute(ingestDocument); - CsvProcessor processor2 = new CsvProcessor(randomAlphaOfLength(5), fieldName, new String[]{"a"}, false, ',', '"', false); + CsvProcessor processor2 = new CsvProcessor(randomAlphaOfLength(5), fieldName, new String[]{"a"}, false, ',', '"', false, null); expectThrows(IllegalArgumentException.class, () -> processor2.execute(ingestDocument)); }